Skip to main content

tokmd_model/
lib.rs

1//! # tokmd-model
2//!
3//! **Tier 1 (Logic)**
4//!
5//! This crate contains the core business logic for aggregating and transforming code statistics.
6//! It handles the conversion from raw Tokei scan results into `tokmd` receipts.
7//!
8//! ## What belongs here
9//! * Aggregation logic (rolling up stats to modules/languages)
10//! * Deterministic sorting and filtering
11//! * Path normalization rules
12//! * Receipt generation logic
13//!
14//! ## What does NOT belong here
15//! * CLI argument parsing
16//! * Output formatting (printing to stdout/file)
17//! * Tokei interaction (use tokmd-scan)
18
19use std::borrow::Cow;
20use std::collections::{BTreeMap, BTreeSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23
24use tokei::{CodeStats, Config, LanguageType, Languages};
25use tokmd_module_key::module_key_from_normalized;
26use tokmd_types::{
27    ChildIncludeMode, ChildrenMode, ExportData, FileKind, FileRow, LangReport, LangRow,
28    ModuleReport, ModuleRow, Totals,
29};
30
/// Simple heuristic: 1 token ~= 4 chars (bytes).
const CHARS_PER_TOKEN: usize = 4;

/// Running totals accumulated per (path, lang, kind) bucket while folding
/// scan reports into `FileRow`s. All counters are additive.
#[derive(Default, Clone, Copy)]
struct Agg {
    code: usize,     // code lines
    comments: usize, // comment lines
    blanks: usize,   // blank lines
    bytes: usize,    // file size; 0 for child (embedded) rows
    tokens: usize,   // estimated tokens (bytes / CHARS_PER_TOKEN); 0 for child rows
}
42
/// Aggregation key: exactly one output `FileRow` is produced per distinct key.
///
/// NOTE: the derived `Ord` (path, then lang, then kind — declaration order)
/// drives the `BTreeMap` iteration order and therefore the deterministic
/// ordering of the emitted rows. Do not reorder these fields.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct Key {
    path: String, // normalized path (see `normalize_path`)
    lang: String, // language display name
    kind: FileKind,
}
49
/// A logical in-memory file used to synthesize `FileRow`s without the host filesystem.
pub struct InMemoryRowInput<'a> {
    /// Path the file would have on disk; used for language detection and module keys.
    pub logical_path: &'a Path,
    /// Raw file contents.
    pub bytes: &'a [u8],
}

impl<'a> InMemoryRowInput<'a> {
    /// Bundle a logical path with its raw contents.
    #[must_use]
    pub fn new(logical_path: &'a Path, bytes: &'a [u8]) -> Self {
        Self {
            logical_path,
            bytes,
        }
    }
}
65
66fn get_file_metrics(path: &Path) -> (usize, usize) {
67    // Best-effort size calculation.
68    // If the file was deleted or is inaccessible during the scan post-processing,
69    // we return 0 bytes/tokens rather than crashing.
70    let bytes = fs::metadata(path).map(|m| m.len() as usize).unwrap_or(0);
71    metrics_from_byte_len(bytes)
72}
73
74fn metrics_from_bytes(bytes: &[u8]) -> (usize, usize) {
75    metrics_from_byte_len(bytes.len())
76}
77
78fn metrics_from_byte_len(bytes: usize) -> (usize, usize) {
79    let tokens = bytes / CHARS_PER_TOKEN;
80    (bytes, tokens)
81}
82
/// Build a fake path under a reserved directory so extension-based language
/// detection can run on an in-memory file without touching the host FS.
fn synthetic_detection_path(logical_path: &Path) -> PathBuf {
    // Fall back to the whole path when there is no final component.
    let file_name = logical_path
        .file_name()
        .unwrap_or(logical_path.as_os_str());
    Path::new("__tokmd_in_memory_detection__").join(file_name)
}
88
89fn language_from_in_memory_shebang(bytes: &[u8]) -> Option<LanguageType> {
90    const READ_LIMIT: usize = 128;
91
92    let first_line = bytes[..bytes.len().min(READ_LIMIT)]
93        .split(|b| *b == b'\n')
94        .next()?;
95    let first_line = std::str::from_utf8(first_line).ok()?;
96
97    let direct = LanguageType::list()
98        .iter()
99        .map(|(lang, _)| *lang)
100        .find(|lang| lang.shebangs().contains(&first_line));
101    if direct.is_some() {
102        return direct;
103    }
104
105    let mut words = first_line.split_whitespace();
106    if words.next() == Some("#!/usr/bin/env") {
107        let interpreter = env_interpreter_token(words)?;
108        return language_from_env_interpreter(interpreter);
109    }
110
111    None
112}
113
/// Extract the interpreter token from the arguments of `#!/usr/bin/env ...`.
///
/// Skips environment assignments (`FOO=bar`), flags, and the argument each
/// argument-taking `env` flag consumes; returns the first remaining word.
fn env_interpreter_token<'a>(words: impl Iterator<Item = &'a str>) -> Option<&'a str> {
    let mut consume_argument = false;

    for word in words {
        // The previous flag (e.g. `-u VAR`) takes this word as its argument.
        if consume_argument {
            consume_argument = false;
            continue;
        }

        if word.is_empty() || looks_like_env_assignment(word) {
            continue;
        }

        // env flags whose value arrives as the *next* word.
        let takes_argument = matches!(
            word,
            "-u" | "--unset"
                | "-C"
                | "--chdir"
                | "-P"
                | "--default-path"
                | "-a"
                | "--argv0"
                | "--default-signal"
                | "--ignore-signal"
                | "--block-signal"
        );
        if takes_argument {
            consume_argument = true;
            continue;
        }

        // Every other dash-prefixed word is a flag we can skip outright:
        // -S/--split-string, -i/--ignore-environment, and all --flag=value forms.
        if word.starts_with('-') {
            continue;
        }

        return Some(word);
    }

    None
}

/// Heuristic for `NAME=value` environment assignments in an env shebang:
/// NAME must start with a letter or underscore and contain only ASCII
/// alphanumerics and underscores.
fn looks_like_env_assignment(word: &str) -> bool {
    match word.split_once('=') {
        Some((name, _)) if !name.is_empty() => {
            let mut chars = name.chars();
            let head_ok = chars
                .next()
                .map_or(false, |ch| ch == '_' || ch.is_ascii_alphabetic());
            head_ok && chars.all(|ch| ch == '_' || ch.is_ascii_alphanumeric())
        }
        _ => false,
    }
}
176
177fn language_from_env_interpreter(interpreter: &str) -> Option<LanguageType> {
178    let token = interpreter
179        .rsplit('/')
180        .next()
181        .unwrap_or(interpreter)
182        // Some shells and malformed env invocations can surface "-python3"-style
183        // interpreter tokens; strip the leading dash defensively before matching.
184        .trim_start_matches('-');
185
186    if token.starts_with("python") {
187        return LanguageType::from_file_extension("py");
188    }
189
190    match token {
191        "bash" | "sh" | "zsh" | "ksh" | "fish" => LanguageType::from_name("Bash"),
192        "node" | "nodejs" => LanguageType::from_name("JavaScript"),
193        "ruby" => LanguageType::from_name("Ruby"),
194        "perl" | "perl5" => LanguageType::from_name("Perl"),
195        "php" => LanguageType::from_name("PHP"),
196        "pwsh" | "powershell" => LanguageType::from_name("PowerShell"),
197        _ => None,
198    }
199}
200
201fn detect_in_memory_language(
202    logical_path: &Path,
203    bytes: &[u8],
204    config: &Config,
205) -> Option<LanguageType> {
206    let detection_path = synthetic_detection_path(logical_path);
207    LanguageType::from_path(&detection_path, config)
208        .or_else(|| language_from_in_memory_shebang(bytes))
209}
210
211fn insert_row(
212    map: &mut BTreeMap<Key, (String, Agg)>,
213    key: Key,
214    module: String,
215    stats: &CodeStats,
216    bytes: usize,
217    tokens: usize,
218) {
219    let entry = map.entry(key).or_insert_with(|| (module, Agg::default()));
220    entry.1.code += stats.code;
221    entry.1.comments += stats.comments;
222    entry.1.blanks += stats.blanks;
223    entry.1.bytes += bytes;
224    entry.1.tokens += tokens;
225}
226
227fn rows_from_map(map: BTreeMap<Key, (String, Agg)>) -> Vec<FileRow> {
228    map.into_iter()
229        .map(|(key, (module, agg))| {
230            let lines = agg.code + agg.comments + agg.blanks;
231            FileRow {
232                path: key.path,
233                module,
234                lang: key.lang,
235                kind: key.kind,
236                code: agg.code,
237                comments: agg.comments,
238                blanks: agg.blanks,
239                lines,
240                bytes: agg.bytes,
241                tokens: agg.tokens,
242            }
243        })
244        .collect()
245}
246
/// Collect `FileRow`s directly from ordered in-memory inputs.
///
/// This path avoids host filesystem metadata and keeps logical paths intact,
/// which makes it suitable for browser/WASM callers.
///
/// Rows are aggregated per (path, lang, kind). Child (embedded-language) rows
/// are emitted only when `children == ChildIncludeMode::Separate`, and carry
/// zero bytes/tokens so sizes are not double-counted against the parent row.
pub fn collect_in_memory_file_rows(
    inputs: &[InMemoryRowInput<'_>],
    module_roots: &[String],
    module_depth: usize,
    children: ChildIncludeMode,
    config: &Config,
) -> Vec<FileRow> {
    let mut map: BTreeMap<Key, (String, Agg)> = BTreeMap::new();

    for input in inputs {
        // Inputs whose language cannot be detected (extension or shebang) are skipped.
        let Some(lang_type) = detect_in_memory_language(input.logical_path, input.bytes, config)
        else {
            continue;
        };

        let path = normalize_path(input.logical_path, None);
        let module = module_key_from_normalized(&path, module_roots, module_depth);
        let stats = lang_type.parse_from_slice(input.bytes, config);
        let summary = stats.summarise();
        // Size metrics come from the in-memory buffer, not the filesystem.
        let (bytes, tokens) = metrics_from_bytes(input.bytes);

        // Parent row: the file itself, carrying its full size metrics.
        insert_row(
            &mut map,
            Key {
                path: path.clone(),
                lang: lang_type.name().to_string(),
                kind: FileKind::Parent,
            },
            module.clone(),
            &summary,
            bytes,
            tokens,
        );

        if children == ChildIncludeMode::Separate {
            // Child rows: embedded languages reported under the same path.
            for (child_type, child_stats) in &stats.blobs {
                let child_summary = child_stats.summarise();
                insert_row(
                    &mut map,
                    Key {
                        path: path.clone(),
                        lang: child_type.name().to_string(),
                        kind: FileKind::Child,
                    },
                    module.clone(),
                    &child_summary,
                    // Zero size: the parent row already accounts for all bytes/tokens.
                    0,
                    0,
                );
            }
        }
    }

    rows_from_map(map)
}
306
307pub fn create_lang_report(
308    languages: &Languages,
309    top: usize,
310    with_files: bool,
311    children: ChildrenMode,
312) -> LangReport {
313    let rows = collect_file_rows(languages, &[], 1, ChildIncludeMode::Separate, None);
314    create_lang_report_from_rows(&rows, top, with_files, children)
315}
316
/// Aggregate pre-collected file rows into a per-language report.
///
/// `children` controls how embedded-language (child) rows are treated:
/// * `Collapse`: parent rows keep their full totals; a child row contributes
///   only when no parent row exists for its path (so its lines are not lost).
/// * `Separate`: child totals are subtracted from the parent language and
///   reported under synthetic "<lang> (embedded)" rows instead.
///
/// Grand totals are computed before any "Other" folding, so they always
/// reflect every emitted row.
pub fn create_lang_report_from_rows(
    file_rows: &[FileRow],
    top: usize,
    with_files: bool,
    children: ChildrenMode,
) -> LangReport {
    // Per-language running totals.
    #[derive(Default)]
    struct LangAgg {
        code: usize,
        lines: usize,
        bytes: usize,
        tokens: usize,
    }

    // Parent language for each path; used to detect orphan child rows below.
    let parent_lang_by_path: BTreeMap<&str, &str> = file_rows
        .iter()
        .filter(|row| row.kind == FileKind::Parent)
        .map(|row| (row.path.as_str(), row.lang.as_str()))
        .collect();
    // Total child (embedded) code/lines per path, subtracted from parents in Separate mode.
    let mut child_totals_by_path: BTreeMap<&str, (usize, usize)> = BTreeMap::new();
    for row in file_rows.iter().filter(|row| row.kind == FileKind::Child) {
        let entry = child_totals_by_path.entry(row.path.as_str()).or_default();
        entry.0 += row.code;
        entry.1 += row.lines;
    }

    let mut by_lang: BTreeMap<String, (LangAgg, BTreeSet<&str>)> = BTreeMap::new();

    for row in file_rows {
        match (children, row.kind) {
            (ChildrenMode::Collapse, FileKind::Parent) => {
                let entry = by_lang
                    .entry(row.lang.clone())
                    .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                entry.0.code += row.code;
                entry.0.lines += row.lines;
                entry.0.bytes += row.bytes;
                entry.0.tokens += row.tokens;
                entry.1.insert(row.path.as_str());
            }
            (ChildrenMode::Collapse, FileKind::Child) => {
                // Orphan child (no parent row for this path): count it so its
                // lines are not dropped. Children with a parent are covered by
                // the parent's totals (hence the subtraction in Separate mode).
                if !parent_lang_by_path.contains_key(row.path.as_str()) {
                    let entry = by_lang
                        .entry(row.lang.clone())
                        .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                    entry.0.code += row.code;
                    entry.0.lines += row.lines;
                    entry.1.insert(row.path.as_str());
                }
            }
            (ChildrenMode::Separate, FileKind::Parent) => {
                // Subtract embedded totals so parent and "(embedded)" rows
                // don't double-count the same lines. saturating_sub guards
                // against child totals exceeding the parent's.
                let (child_code, child_lines) = child_totals_by_path
                    .get(row.path.as_str())
                    .copied()
                    .unwrap_or((0, 0));

                let entry = by_lang
                    .entry(row.lang.clone())
                    .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                entry.0.code += row.code.saturating_sub(child_code);
                entry.0.lines += row.lines.saturating_sub(child_lines);
                entry.0.bytes += row.bytes;
                entry.0.tokens += row.tokens;
                entry.1.insert(row.path.as_str());
            }
            (ChildrenMode::Separate, FileKind::Child) => {
                // Child rows get their own synthetic "<lang> (embedded)" bucket.
                let entry = by_lang
                    .entry(format!("{} (embedded)", row.lang))
                    .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                entry.0.code += row.code;
                entry.0.lines += row.lines;
                entry.1.insert(row.path.as_str());
            }
        }
    }

    let mut rows: Vec<LangRow> = Vec::with_capacity(by_lang.len());
    for (lang, (agg, files_set)) in by_lang {
        // Languages whose code count ended up zero are omitted from the report.
        if agg.code == 0 {
            continue;
        }
        let files = files_set.len();
        rows.push(LangRow {
            lang: lang.to_string(),
            code: agg.code,
            lines: agg.lines,
            files,
            bytes: agg.bytes,
            tokens: agg.tokens,
            avg_lines: avg(agg.lines, files),
        });
    }

    // Deterministic order: descending code, ties broken by language name.
    rows.sort_by(|a, b| b.code.cmp(&a.code).then_with(|| a.lang.cmp(&b.lang)));

    // Totals are taken over ALL rows, before the "Other" fold below.
    let total_code: usize = rows.iter().map(|r| r.code).sum();
    let total_lines: usize = rows.iter().map(|r| r.lines).sum();
    let total_bytes: usize = rows.iter().map(|r| r.bytes).sum();
    let total_tokens: usize = rows.iter().map(|r| r.tokens).sum();
    let total_files = unique_parent_file_count_from_rows(file_rows);

    let total = Totals {
        code: total_code,
        lines: total_lines,
        files: total_files,
        bytes: total_bytes,
        tokens: total_tokens,
        avg_lines: avg(total_lines, total_files),
    };

    // Keep the top N rows and fold the remainder into "Other" (top == 0 means unlimited).
    if top > 0 && rows.len() > top {
        let other = fold_other_lang(&rows[top..]);
        rows.truncate(top);
        rows.push(other);
    }

    LangReport {
        rows,
        total,
        with_files,
        children,
        top,
    }
}
441
442fn fold_other_lang(rows: &[LangRow]) -> LangRow {
443    let mut code = 0usize;
444    let mut lines = 0usize;
445    let mut files = 0usize;
446    let mut bytes = 0usize;
447    let mut tokens = 0usize;
448
449    for r in rows {
450        code += r.code;
451        lines += r.lines;
452        files += r.files;
453        bytes += r.bytes;
454        tokens += r.tokens;
455    }
456
457    LangRow {
458        lang: "Other".to_string(),
459        code,
460        lines,
461        files,
462        bytes,
463        tokens,
464        avg_lines: avg(lines, files),
465    }
466}
467
468pub fn create_module_report(
469    languages: &Languages,
470    module_roots: &[String],
471    module_depth: usize,
472    children: ChildIncludeMode,
473    top: usize,
474) -> ModuleReport {
475    let file_rows = collect_file_rows(languages, module_roots, module_depth, children, None);
476    create_module_report_from_rows(&file_rows, module_roots, module_depth, children, top)
477}
478
479pub fn create_module_report_from_rows(
480    file_rows: &[FileRow],
481    module_roots: &[String],
482    module_depth: usize,
483    children: ChildIncludeMode,
484    top: usize,
485) -> ModuleReport {
486    #[derive(Default)]
487    struct Agg {
488        code: usize,
489        lines: usize,
490        bytes: usize,
491        tokens: usize,
492    }
493
494    let mut by_module: BTreeMap<&str, (Agg, BTreeSet<&str>)> = BTreeMap::new();
495    let mut total_code = 0;
496    let mut total_lines = 0;
497    let mut total_bytes = 0;
498    let mut total_tokens = 0;
499
500    for r in file_rows {
501        total_code += r.code;
502        total_lines += r.lines;
503        total_bytes += r.bytes;
504        total_tokens += r.tokens;
505
506        let entry = by_module
507            .entry(r.module.as_str())
508            .or_insert_with(|| (Agg::default(), BTreeSet::new()));
509        entry.0.code += r.code;
510        entry.0.lines += r.lines;
511        entry.0.bytes += r.bytes;
512        entry.0.tokens += r.tokens;
513
514        if r.kind == FileKind::Parent {
515            entry.1.insert(r.path.as_str());
516        }
517    }
518
519    let mut rows: Vec<ModuleRow> = Vec::with_capacity(by_module.len());
520    for (module, (agg, files_set)) in by_module {
521        let files = files_set.len();
522        rows.push(ModuleRow {
523            module: module.to_string(),
524            code: agg.code,
525            lines: agg.lines,
526            files,
527            bytes: agg.bytes,
528            tokens: agg.tokens,
529            avg_lines: avg(agg.lines, files),
530        });
531    }
532
533    // Sort descending by code, then by module name for determinism.
534    rows.sort_by(|a, b| b.code.cmp(&a.code).then_with(|| a.module.cmp(&b.module)));
535
536    if top > 0 && rows.len() > top {
537        let other = fold_other_module(&rows[top..]);
538        rows.truncate(top);
539        rows.push(other);
540    }
541
542    let total_files = unique_parent_file_count_from_rows(file_rows);
543
544    let total = Totals {
545        code: total_code,
546        lines: total_lines,
547        files: total_files,
548        bytes: total_bytes,
549        tokens: total_tokens,
550        avg_lines: avg(total_lines, total_files),
551    };
552
553    ModuleReport {
554        rows,
555        total,
556        module_roots: module_roots.to_vec(),
557        module_depth,
558        children,
559        top,
560    }
561}
562
563fn fold_other_module(rows: &[ModuleRow]) -> ModuleRow {
564    let mut code = 0usize;
565    let mut lines = 0usize;
566    let mut files = 0usize;
567    let mut bytes = 0usize;
568    let mut tokens = 0usize;
569
570    for r in rows {
571        code += r.code;
572        lines += r.lines;
573        files += r.files;
574        bytes += r.bytes;
575        tokens += r.tokens;
576    }
577
578    ModuleRow {
579        module: "Other".to_string(),
580        code,
581        lines,
582        files,
583        bytes,
584        tokens,
585        avg_lines: avg(lines, files),
586    }
587}
588
589pub fn create_export_data(
590    languages: &Languages,
591    module_roots: &[String],
592    module_depth: usize,
593    children: ChildIncludeMode,
594    strip_prefix: Option<&Path>,
595    min_code: usize,
596    max_rows: usize,
597) -> ExportData {
598    let rows = collect_file_rows(
599        languages,
600        module_roots,
601        module_depth,
602        children,
603        strip_prefix,
604    );
605    create_export_data_from_rows(
606        rows,
607        module_roots,
608        module_depth,
609        children,
610        min_code,
611        max_rows,
612    )
613}
614
615pub fn create_export_data_from_rows(
616    mut rows: Vec<FileRow>,
617    module_roots: &[String],
618    module_depth: usize,
619    children: ChildIncludeMode,
620    min_code: usize,
621    max_rows: usize,
622) -> ExportData {
623    // Filter and sort for determinism.
624    if min_code > 0 {
625        rows.retain(|r| r.code >= min_code);
626    }
627    rows.sort_by(|a, b| b.code.cmp(&a.code).then_with(|| a.path.cmp(&b.path)));
628
629    if max_rows > 0 && rows.len() > max_rows {
630        rows.truncate(max_rows);
631    }
632
633    ExportData {
634        rows,
635        module_roots: module_roots.to_vec(),
636        module_depth,
637        children,
638    }
639}
640
/// Collect per-file contributions, optionally including embedded language reports.
///
/// This returns one row per (path, lang, kind), aggregated if tokei produced multiple
/// reports for the same tuple.
///
/// Child (embedded) rows are emitted only when `children == ChildIncludeMode::Separate`
/// and carry zero bytes/tokens — the parent row owns the file's size.
pub fn collect_file_rows(
    languages: &Languages,
    module_roots: &[String],
    module_depth: usize,
    children: ChildIncludeMode,
    strip_prefix: Option<&Path>,
) -> Vec<FileRow> {
    let mut map: BTreeMap<Key, (String /*module*/, Agg)> = BTreeMap::new();

    // Parent reports
    for (lang_type, lang) in languages.iter() {
        for report in &lang.reports {
            let path = normalize_path(&report.name, strip_prefix);
            let module = module_key_from_normalized(&path, module_roots, module_depth);
            let st = report.stats.summarise();
            // Best-effort filesystem lookup; missing/unreadable files yield (0, 0).
            let (bytes, tokens) = get_file_metrics(&report.name);
            insert_row(
                &mut map,
                Key {
                    path,
                    lang: lang_type.name().to_string(),
                    kind: FileKind::Parent,
                },
                module,
                &st,
                bytes,
                tokens,
            );
        }
    }

    if children == ChildIncludeMode::Separate {
        // Embedded-language reports, keyed under the same normalized path.
        for (_lang_type, lang) in languages.iter() {
            for (child_type, reports) in &lang.children {
                for report in reports {
                    let path = normalize_path(&report.name, strip_prefix);
                    let module = module_key_from_normalized(&path, module_roots, module_depth);
                    let st = report.stats.summarise();
                    insert_row(
                        &mut map,
                        Key {
                            path,
                            lang: child_type.name().to_string(),
                            kind: FileKind::Child,
                        },
                        module,
                        &st,
                        // Zero size: the parent row already carries the file's bytes/tokens.
                        0,
                        0,
                    );
                }
            }
        }
    }

    rows_from_map(map)
}
702
703pub fn unique_parent_file_count(languages: &Languages) -> usize {
704    let rows = collect_file_rows(languages, &[], 1, ChildIncludeMode::ParentsOnly, None);
705    unique_parent_file_count_from_rows(&rows)
706}
707
708pub fn unique_parent_file_count_from_rows(file_rows: &[FileRow]) -> usize {
709    file_rows
710        .iter()
711        .filter(|row| row.kind == FileKind::Parent)
712        .map(|row| row.path.as_str())
713        .collect::<BTreeSet<_>>()
714        .len()
715}
716
/// Compute the average of `lines` over `files`, rounding to nearest integer.
///
/// Returns 0 if `files` is zero.
///
/// # Examples
///
/// ```
/// use tokmd_model::avg;
///
/// assert_eq!(avg(300, 3), 100);
/// assert_eq!(avg(0, 5), 0);
/// assert_eq!(avg(100, 0), 0);
/// // Rounds to nearest: 7 / 2 = 3.5 → 4
/// assert_eq!(avg(7, 2), 4);
/// ```
pub fn avg(lines: usize, files: usize) -> usize {
    match files {
        // Guard: empty file sets average to zero instead of dividing by zero.
        0 => 0,
        // Adding half the divisor before dividing rounds to the nearest integer.
        n => (lines + n / 2) / n,
    }
}
739
/// Normalize a path for portable output.
///
/// - Uses `/` separators
/// - Strips leading `./`
/// - Optionally strips a user-provided prefix (after normalization)
///
/// # Examples
///
/// ```
/// use std::path::Path;
/// use tokmd_model::normalize_path;
///
/// // Normalizes backslashes to forward slashes
/// let p = Path::new("src\\main.rs");
/// assert_eq!(normalize_path(p, None), "src/main.rs");
///
/// // Strips a prefix
/// let p = Path::new("project/src/lib.rs");
/// let prefix = Path::new("project");
/// assert_eq!(normalize_path(&p, Some(&prefix)), "src/lib.rs");
/// ```
pub fn normalize_path(path: &Path, strip_prefix: Option<&Path>) -> String {
    // Convert separators, allocating only when backslashes are present.
    let lossy = path.to_string_lossy();
    let normalized: Cow<str> = if lossy.contains('\\') {
        Cow::Owned(lossy.replace('\\', "/"))
    } else {
        lossy
    };

    let mut rest: &str = &normalized;

    // Drop a leading "./" before prefix matching, so a prefix of "src/" can
    // match a path recorded as "./src/...".
    if let Some(tail) = rest.strip_prefix("./") {
        rest = tail;
    }

    if let Some(prefix) = strip_prefix {
        let prefix_lossy = prefix.to_string_lossy();
        let prefix_str: &str = &prefix_lossy;
        // Normalize the prefix the same way: no leading "./", forward slashes,
        // and a guaranteed trailing "/" so matching only happens at a
        // path-component boundary.
        let trimmed = prefix_str.strip_prefix("./").unwrap_or(prefix_str);
        let mut needle = trimmed.replace('\\', "/");
        if !needle.ends_with('/') {
            needle.push('/');
        }
        if let Some(tail) = rest.strip_prefix(needle.as_str()) {
            rest = tail;
        }
    }

    // Remove leading slashes, then a residual "./" (e.g. left by "/./"),
    // then any slashes that uncovered.
    rest = rest.trim_start_matches('/');
    if let Some(tail) = rest.strip_prefix("./") {
        rest = tail;
    }
    rest = rest.trim_start_matches('/');

    if rest.len() == normalized.len() {
        // Nothing was stripped; reuse the normalized string wholesale.
        normalized.into_owned()
    } else {
        rest.to_string()
    }
}
823
/// Compute a "module key" from an input path.
///
/// Rules:
/// - Root-level files become "(root)".
/// - If the first directory segment is in `module_roots`, join `module_depth` *directory* segments.
/// - Otherwise, module key is the top-level directory.
///
/// # Examples
///
/// ```
/// use tokmd_model::module_key;
///
/// let roots = vec!["crates".to_string()];
/// assert_eq!(module_key("crates/foo/src/lib.rs", &roots, 2), "crates/foo");
/// assert_eq!(module_key("src/lib.rs", &roots, 2), "src");
/// assert_eq!(module_key("Cargo.toml", &roots, 2), "(root)");
/// ```
pub fn module_key(path: &str, module_roots: &[String], module_depth: usize) -> String {
    // Thin re-export so tokmd-model callers don't need tokmd-module-key directly.
    tokmd_module_key::module_key(path, module_roots, module_depth)
}
844
845#[cfg(test)]
846mod tests {
847    use super::*;
848    use std::path::PathBuf;
849
850    #[test]
851    fn module_key_root_level_file() {
852        assert_eq!(module_key("Cargo.toml", &["crates".into()], 2), "(root)");
853        assert_eq!(module_key("./Cargo.toml", &["crates".into()], 2), "(root)");
854    }
855
856    #[test]
857    fn module_key_crates_depth_2() {
858        let roots = vec!["crates".into(), "packages".into()];
859        assert_eq!(module_key("crates/foo/src/lib.rs", &roots, 2), "crates/foo");
860        assert_eq!(
861            module_key("packages/bar/src/main.rs", &roots, 2),
862            "packages/bar"
863        );
864    }
865
866    #[test]
867    fn module_key_crates_depth_1() {
868        let roots = vec!["crates".into(), "packages".into()];
869        assert_eq!(module_key("crates/foo/src/lib.rs", &roots, 1), "crates");
870    }
871
872    #[test]
873    fn module_key_non_root() {
874        let roots = vec!["crates".into()];
875        assert_eq!(module_key("src/lib.rs", &roots, 2), "src");
876        assert_eq!(module_key("tools/gen.rs", &roots, 2), "tools");
877    }
878
879    #[test]
880    fn module_key_depth_overflow_does_not_include_filename() {
881        let roots = vec!["crates".into()];
882        // File directly under a root: depth=2 should NOT include the filename
883        assert_eq!(module_key("crates/foo.rs", &roots, 2), "crates");
884        // Depth exceeds available directories: should stop at deepest directory
885        assert_eq!(
886            module_key("crates/foo/src/lib.rs", &roots, 10),
887            "crates/foo/src"
888        );
889    }
890
891    #[test]
892    fn normalize_path_strips_prefix() {
893        let p = PathBuf::from("C:/Code/Repo/src/main.rs");
894        let prefix = PathBuf::from("C:/Code/Repo");
895        let got = normalize_path(&p, Some(&prefix));
896        assert_eq!(got, "src/main.rs");
897    }
898
899    #[test]
900    fn normalize_path_normalization_slashes() {
901        let p = PathBuf::from(r"C:\Code\Repo\src\main.rs");
902        let got = normalize_path(&p, None);
903        assert_eq!(got, "C:/Code/Repo/src/main.rs");
904    }
905
906    mod normalize_properties {
907        use super::*;
908        use proptest::prelude::*;
909
910        fn arb_path_component() -> impl Strategy<Value = String> {
911            "[a-zA-Z0-9_.-]+"
912        }
913
914        fn arb_path(max_depth: usize) -> impl Strategy<Value = String> {
915            prop::collection::vec(arb_path_component(), 1..=max_depth)
916                .prop_map(|comps| comps.join("/"))
917        }
918
919        proptest! {
920            #[test]
921            fn normalize_path_is_idempotent(path in arb_path(5)) {
922                let p = PathBuf::from(&path);
923                let norm1 = normalize_path(&p, None);
924                let p2 = PathBuf::from(&norm1);
925                let norm2 = normalize_path(&p2, None);
926                prop_assert_eq!(norm1, norm2);
927            }
928
929            #[test]
930            fn normalize_path_handles_windows_separators(path in arb_path(5)) {
931                let win_path = path.replace('/', "\\");
932                let p_win = PathBuf::from(&win_path);
933                let p_unix = PathBuf::from(&path);
934
935                let norm_win = normalize_path(&p_win, None);
936                let norm_unix = normalize_path(&p_unix, None);
937
938                prop_assert_eq!(norm_win, norm_unix);
939            }
940
941            #[test]
942            fn normalize_path_no_leading_slash(path in arb_path(5)) {
943                let p = PathBuf::from(&path);
944                let norm = normalize_path(&p, None);
945                prop_assert!(!norm.starts_with('/'));
946            }
947
948            #[test]
949            fn normalize_path_no_leading_dot_slash(path in arb_path(5)) {
950                let p = PathBuf::from(&path);
951                let norm = normalize_path(&p, None);
952                prop_assert!(!norm.starts_with("./"));
953            }
954
955            #[test]
956            fn module_key_deterministic(
957                path in arb_path(5),
958                roots in prop::collection::vec(arb_path_component(), 1..3),
959                depth in 1usize..5
960            ) {
961                let k1 = module_key(&path, &roots, depth);
962                let k2 = module_key(&path, &roots, depth);
963                prop_assert_eq!(k1, k2);
964            }
965        }
966    }
967
968    // Property-based tests for fold_other_* functions
969    mod fold_properties {
970        use super::*;
971        use proptest::prelude::*;
972
973        fn arb_lang_row() -> impl Strategy<Value = LangRow> {
974            (
975                "[a-zA-Z]+",
976                0usize..10000,
977                0usize..20000,
978                0usize..1000,
979                0usize..1000000,
980                0usize..100000,
981            )
982                .prop_map(|(lang, code, lines, files, bytes, tokens)| {
983                    let avg_lines = (lines + (files / 2)).checked_div(files).unwrap_or(0);
984                    LangRow {
985                        lang,
986                        code,
987                        lines,
988                        files,
989                        bytes,
990                        tokens,
991                        avg_lines,
992                    }
993                })
994        }
995
996        fn arb_module_row() -> impl Strategy<Value = ModuleRow> {
997            (
998                "[a-zA-Z0-9_/]+",
999                0usize..10000,
1000                0usize..20000,
1001                0usize..1000,
1002                0usize..1000000,
1003                0usize..100000,
1004            )
1005                .prop_map(|(module, code, lines, files, bytes, tokens)| {
1006                    let avg_lines = (lines + (files / 2)).checked_div(files).unwrap_or(0);
1007                    ModuleRow {
1008                        module,
1009                        code,
1010                        lines,
1011                        files,
1012                        bytes,
1013                        tokens,
1014                        avg_lines,
1015                    }
1016                })
1017        }
1018
1019        proptest! {
1020            #[test]
1021            fn fold_lang_preserves_totals(rows in prop::collection::vec(arb_lang_row(), 0..10)) {
1022                let folded = fold_other_lang(&rows);
1023
1024                let total_code: usize = rows.iter().map(|r| r.code).sum();
1025                let total_lines: usize = rows.iter().map(|r| r.lines).sum();
1026                let total_files: usize = rows.iter().map(|r| r.files).sum();
1027                let total_bytes: usize = rows.iter().map(|r| r.bytes).sum();
1028                let total_tokens: usize = rows.iter().map(|r| r.tokens).sum();
1029
1030                prop_assert_eq!(folded.code, total_code, "Code mismatch");
1031                prop_assert_eq!(folded.lines, total_lines, "Lines mismatch");
1032                prop_assert_eq!(folded.files, total_files, "Files mismatch");
1033                prop_assert_eq!(folded.bytes, total_bytes, "Bytes mismatch");
1034                prop_assert_eq!(folded.tokens, total_tokens, "Tokens mismatch");
1035            }
1036
1037            #[test]
1038            fn fold_lang_empty_is_zero(_dummy in 0..1u8) {
1039                let folded = fold_other_lang(&[]);
1040                prop_assert_eq!(folded.code, 0);
1041                prop_assert_eq!(folded.lines, 0);
1042                prop_assert_eq!(folded.files, 0);
1043                prop_assert_eq!(folded.bytes, 0);
1044                prop_assert_eq!(folded.tokens, 0);
1045                prop_assert_eq!(folded.lang, "Other");
1046            }
1047
1048            #[test]
1049            fn fold_module_preserves_totals(rows in prop::collection::vec(arb_module_row(), 0..10)) {
1050                let folded = fold_other_module(&rows);
1051
1052                let total_code: usize = rows.iter().map(|r| r.code).sum();
1053                let total_lines: usize = rows.iter().map(|r| r.lines).sum();
1054                let total_files: usize = rows.iter().map(|r| r.files).sum();
1055                let total_bytes: usize = rows.iter().map(|r| r.bytes).sum();
1056                let total_tokens: usize = rows.iter().map(|r| r.tokens).sum();
1057
1058                prop_assert_eq!(folded.code, total_code, "Code mismatch");
1059                prop_assert_eq!(folded.lines, total_lines, "Lines mismatch");
1060                prop_assert_eq!(folded.files, total_files, "Files mismatch");
1061                prop_assert_eq!(folded.bytes, total_bytes, "Bytes mismatch");
1062                prop_assert_eq!(folded.tokens, total_tokens, "Tokens mismatch");
1063            }
1064
1065            #[test]
1066            fn fold_module_empty_is_zero(_dummy in 0..1u8) {
1067                let folded = fold_other_module(&[]);
1068                prop_assert_eq!(folded.code, 0);
1069                prop_assert_eq!(folded.lines, 0);
1070                prop_assert_eq!(folded.files, 0);
1071                prop_assert_eq!(folded.bytes, 0);
1072                prop_assert_eq!(folded.tokens, 0);
1073                prop_assert_eq!(folded.module, "Other");
1074            }
1075
1076            #[test]
1077            fn fold_associative_lang(
1078                rows1 in prop::collection::vec(arb_lang_row(), 0..5),
1079                rows2 in prop::collection::vec(arb_lang_row(), 0..5)
1080            ) {
1081                // Folding all at once should equal folding parts and combining
1082                let all: Vec<_> = rows1.iter().chain(rows2.iter()).cloned().collect();
1083                let fold_all = fold_other_lang(&all);
1084
1085                let fold1 = fold_other_lang(&rows1);
1086                let fold2 = fold_other_lang(&rows2);
1087                let combined = fold_other_lang(&[fold1, fold2]);
1088
1089                prop_assert_eq!(fold_all.code, combined.code);
1090                prop_assert_eq!(fold_all.lines, combined.lines);
1091                prop_assert_eq!(fold_all.files, combined.files);
1092                prop_assert_eq!(fold_all.bytes, combined.bytes);
1093                prop_assert_eq!(fold_all.tokens, combined.tokens);
1094            }
1095        }
1096    }
1097}