// tokmd_model/lib.rs
1//! # tokmd-model
2//!
3//! **Tier 1 (Logic)**
4//!
5//! This crate contains the core business logic for aggregating and transforming code statistics.
6//! It handles the conversion from raw Tokei scan results into `tokmd` receipts.
7//!
8//! ## What belongs here
9//! * Aggregation logic (rolling up stats to modules/languages)
10//! * Deterministic sorting and filtering
11//! * Path normalization rules
12//! * Receipt generation logic
13//!
14//! ## What does NOT belong here
15//! * CLI argument parsing
16//! * Output formatting (printing to stdout/file)
17//! * Tokei interaction (use tokmd-scan)
18
19use std::borrow::Cow;
20use std::collections::{BTreeMap, BTreeSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23
24pub mod module_key;
25
26use crate::module_key::module_key_from_normalized;
27use tokei::{CodeStats, Config, LanguageType, Languages};
28use tokmd_types::{
29    ChildIncludeMode, ChildrenMode, ExportData, FileKind, FileRow, LangReport, LangRow,
30    ModuleReport, ModuleRow, Totals,
31};
32
/// Simple heuristic: 1 token ~= 4 chars (bytes).
/// Used by `metrics_from_byte_len` to derive the `tokens` column from raw byte counts.
const CHARS_PER_TOKEN: usize = 4;
35
/// Running totals accumulated per `Key` while flattening reports into rows.
#[derive(Default, Clone, Copy)]
struct Agg {
    // Line counts taken from tokei's `CodeStats`.
    code: usize,
    comments: usize,
    blanks: usize,
    // On-disk (or in-memory) size and its token estimate; zero for child rows.
    bytes: usize,
    tokens: usize,
}
44
/// Grouping key for file rows: one output row per (path, language, kind).
///
/// Derives `Ord` so the backing `BTreeMap` iterates deterministically.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct Key<'a> {
    path: String,
    // Language name borrowed from tokei's static language tables.
    lang: &'a str,
    // Distinguishes a parent file row from an embedded-language (child) row.
    kind: FileKind,
}
51
/// A logical in-memory file used to synthesize `FileRow`s without the host filesystem.
pub struct InMemoryRowInput<'a> {
    // Path reported in output rows; it is never read from disk.
    pub logical_path: &'a Path,
    // Raw contents used for language detection and line counting.
    pub bytes: &'a [u8],
}
57
58impl<'a> InMemoryRowInput<'a> {
59    #[must_use]
60    pub fn new(logical_path: &'a Path, bytes: &'a [u8]) -> Self {
61        Self {
62            logical_path,
63            bytes,
64        }
65    }
66}
67
68fn get_file_metrics(path: &Path) -> (usize, usize) {
69    // Best-effort size calculation.
70    // If the file was deleted or is inaccessible during the scan post-processing,
71    // we return 0 bytes/tokens rather than crashing.
72    let bytes = fs::metadata(path).map(|m| m.len() as usize).unwrap_or(0);
73    metrics_from_byte_len(bytes)
74}
75
/// Return `(bytes, estimated tokens)` for an in-memory buffer.
fn metrics_from_bytes(bytes: &[u8]) -> (usize, usize) {
    metrics_from_byte_len(bytes.len())
}
79
80fn metrics_from_byte_len(bytes: usize) -> (usize, usize) {
81    let tokens = bytes / CHARS_PER_TOKEN;
82    (bytes, tokens)
83}
84
/// Build a throwaway path that carries only the input's file name, so tokei's
/// extension-based detection can run without touching the host filesystem.
fn synthetic_detection_path(logical_path: &Path) -> PathBuf {
    // Fall back to the whole path when there is no final component (e.g. "..").
    let name = logical_path
        .file_name()
        .unwrap_or_else(|| logical_path.as_os_str());
    Path::new("__tokmd_in_memory_detection__").join(name)
}
90
91fn language_from_in_memory_shebang(bytes: &[u8]) -> Option<LanguageType> {
92    const READ_LIMIT: usize = 128;
93
94    let first_line = bytes[..bytes.len().min(READ_LIMIT)]
95        .split(|b| *b == b'\n')
96        .next()?;
97    let first_line = std::str::from_utf8(first_line).ok()?;
98
99    let direct = LanguageType::list()
100        .iter()
101        .map(|(lang, _)| *lang)
102        .find(|lang| lang.shebangs().contains(&first_line));
103    if direct.is_some() {
104        return direct;
105    }
106
107    let mut words = first_line.split_whitespace();
108    if words.next() == Some("#!/usr/bin/env") {
109        let interpreter = env_interpreter_token(words)?;
110        return language_from_env_interpreter(interpreter);
111    }
112
113    None
114}
115
116fn env_interpreter_token<'a>(words: impl Iterator<Item = &'a str>) -> Option<&'a str> {
117    let mut skip_next = false;
118
119    for word in words {
120        if skip_next {
121            skip_next = false;
122            continue;
123        }
124
125        if word.is_empty() {
126            continue;
127        }
128
129        if looks_like_env_assignment(word) {
130            continue;
131        }
132
133        match word {
134            "-S" | "--split-string" | "-i" | "--ignore-environment" => continue,
135            "-u" | "--unset" | "-C" | "--chdir" | "-P" | "--default-path" | "-a" | "--argv0"
136            | "--default-signal" | "--ignore-signal" | "--block-signal" => {
137                skip_next = true;
138                continue;
139            }
140            _ if word.starts_with("--unset=")
141                || word.starts_with("--chdir=")
142                || word.starts_with("--default-path=")
143                || word.starts_with("--argv0=")
144                || word.starts_with("--default-signal=")
145                || word.starts_with("--ignore-signal=")
146                || word.starts_with("--block-signal=") =>
147            {
148                continue;
149            }
150            _ if word.starts_with('-') => continue,
151            _ => return Some(word),
152        }
153    }
154
155    None
156}
157
/// Whether `word` looks like a shell environment assignment (`NAME=value`).
///
/// `NAME` must be non-empty, start with `_` or an ASCII letter, and continue
/// with `_` or ASCII alphanumerics — mirroring POSIX variable-name rules.
fn looks_like_env_assignment(word: &str) -> bool {
    match word.split_once('=') {
        Some((name, _)) if !name.is_empty() => {
            let mut chars = name.chars();
            let head_ok = chars
                .next()
                .map_or(false, |c| c == '_' || c.is_ascii_alphabetic());
            head_ok && chars.all(|c| c == '_' || c.is_ascii_alphanumeric())
        }
        // No '=' at all, or an empty name ("=value").
        _ => false,
    }
}
178
179fn language_from_env_interpreter(interpreter: &str) -> Option<LanguageType> {
180    let token = interpreter
181        .rsplit('/')
182        .next()
183        .unwrap_or(interpreter)
184        // Some shells and malformed env invocations can surface "-python3"-style
185        // interpreter tokens; strip the leading dash defensively before matching.
186        .trim_start_matches('-');
187
188    if token.starts_with("python") {
189        return LanguageType::from_file_extension("py");
190    }
191
192    match token {
193        "bash" | "sh" | "zsh" | "ksh" | "fish" => LanguageType::from_name("Bash"),
194        "node" | "nodejs" => LanguageType::from_name("JavaScript"),
195        "ruby" => LanguageType::from_name("Ruby"),
196        "perl" | "perl5" => LanguageType::from_name("Perl"),
197        "php" => LanguageType::from_name("PHP"),
198        "pwsh" | "powershell" => LanguageType::from_name("PowerShell"),
199        _ => None,
200    }
201}
202
203fn detect_in_memory_language(
204    logical_path: &Path,
205    bytes: &[u8],
206    config: &Config,
207) -> Option<LanguageType> {
208    let detection_path = synthetic_detection_path(logical_path);
209    LanguageType::from_path(&detection_path, config)
210        .or_else(|| language_from_in_memory_shebang(bytes))
211}
212
213fn insert_row<'a>(
214    map: &mut BTreeMap<Key<'a>, (String, Agg)>,
215    key: Key<'a>,
216    module: String,
217    stats: &CodeStats,
218    bytes: usize,
219    tokens: usize,
220) {
221    let entry = map.entry(key).or_insert_with(|| (module, Agg::default()));
222    entry.1.code += stats.code;
223    entry.1.comments += stats.comments;
224    entry.1.blanks += stats.blanks;
225    entry.1.bytes += bytes;
226    entry.1.tokens += tokens;
227}
228
229fn rows_from_map<'a>(map: BTreeMap<Key<'a>, (String, Agg)>) -> Vec<FileRow> {
230    map.into_iter()
231        .map(|(key, (module, agg))| {
232            let lines = agg.code + agg.comments + agg.blanks;
233            FileRow {
234                path: key.path,
235                module,
236                lang: key.lang.to_string(),
237                kind: key.kind,
238                code: agg.code,
239                comments: agg.comments,
240                blanks: agg.blanks,
241                lines,
242                bytes: agg.bytes,
243                tokens: agg.tokens,
244            }
245        })
246        .collect()
247}
248
/// Collect `FileRow`s directly from ordered in-memory inputs.
///
/// This path avoids host filesystem metadata and keeps logical paths intact,
/// which makes it suitable for browser/WASM callers.
pub fn collect_in_memory_file_rows(
    inputs: &[InMemoryRowInput<'_>],
    module_roots: &[String],
    module_depth: usize,
    children: ChildIncludeMode,
    config: &Config,
) -> Vec<FileRow> {
    let mut map = BTreeMap::new();

    for input in inputs {
        // Inputs whose language can't be determined (neither by synthetic
        // path nor by shebang) are silently skipped.
        let Some(lang_type) = detect_in_memory_language(input.logical_path, input.bytes, config)
        else {
            continue;
        };

        let path = normalize_path(input.logical_path, None);
        let module = module_key_from_normalized(&path, module_roots, module_depth);
        let stats = lang_type.parse_from_slice(input.bytes, config);
        let summary = stats.summarise();
        // Size metrics come from the buffer itself, not the filesystem.
        let (bytes, tokens) = metrics_from_bytes(input.bytes);

        if children == ChildIncludeMode::Separate {
            // Emit embedded-language contributions as their own Child rows.
            // Children carry no byte/token metrics (passed as 0, 0): the
            // parent row owns the file's size.
            for (child_type, child_stats) in &stats.blobs {
                let child_summary = child_stats.summarise();
                insert_row(
                    &mut map,
                    Key {
                        path: path.clone(),
                        lang: child_type.name(),
                        kind: FileKind::Child,
                    },
                    module.clone(),
                    &child_summary,
                    0,
                    0,
                );
            }
        }

        // The parent row gets the full summary plus the buffer's size metrics.
        insert_row(
            &mut map,
            Key {
                path,
                lang: lang_type.name(),
                kind: FileKind::Parent,
            },
            module,
            &summary,
            bytes,
            tokens,
        );
    }

    rows_from_map(map)
}
308
309pub fn create_lang_report(
310    languages: &Languages,
311    top: usize,
312    with_files: bool,
313    children: ChildrenMode,
314) -> LangReport {
315    let rows = collect_file_rows(languages, &[], 1, ChildIncludeMode::Separate, None);
316    create_lang_report_from_rows(&rows, top, with_files, children)
317}
318
/// Aggregate per-file rows into a per-language report.
///
/// `children` controls how embedded-language (child) rows are presented:
/// * `Collapse` — child stats stay folded into the parent file's language;
///   orphan child rows (no parent row for the same path) are still counted.
/// * `Separate` — child stats become "<Lang> (embedded)" rows, and their
///   code/line counts are subtracted from the parent's language row.
///
/// When `top > 0`, the table is truncated to the top-N languages by code and
/// the remainder is folded into one "Other" row. Totals are computed before
/// folding, so they cover all rows.
pub fn create_lang_report_from_rows(
    file_rows: &[FileRow],
    top: usize,
    with_files: bool,
    children: ChildrenMode,
) -> LangReport {
    // Per-language accumulator (comments/blanks are not broken out here).
    #[derive(Default)]
    struct LangAgg {
        code: usize,
        lines: usize,
        bytes: usize,
        tokens: usize,
    }

    // Path -> parent language, used to spot "orphan" child rows whose parent
    // row is absent from the input.
    let parent_lang_by_path: BTreeMap<&str, &str> = file_rows
        .iter()
        .filter(|row| row.kind == FileKind::Parent)
        .map(|row| (row.path.as_str(), row.lang.as_str()))
        .collect();
    // Path -> (code, lines) contributed by child rows; needed in Separate
    // mode to subtract embedded content from the parent language.
    let mut child_totals_by_path: BTreeMap<&str, (usize, usize)> = BTreeMap::new();
    for row in file_rows.iter().filter(|row| row.kind == FileKind::Child) {
        let entry = child_totals_by_path.entry(row.path.as_str()).or_default();
        entry.0 += row.code;
        entry.1 += row.lines;
    }

    // Keyed by (language name, is_embedded); BTreeMap keeps iteration stable.
    let mut by_lang: BTreeMap<(&str, bool), (LangAgg, BTreeSet<&str>)> = BTreeMap::new();

    for row in file_rows {
        match (children, row.kind) {
            (ChildrenMode::Collapse, FileKind::Parent) => {
                // Parent rows are taken as-is (they subsume embedded stats —
                // Separate mode's subtraction below relies on the same fact).
                let entry = by_lang
                    .entry((row.lang.as_str(), false))
                    .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                entry.0.code += row.code;
                entry.0.lines += row.lines;
                entry.0.bytes += row.bytes;
                entry.0.tokens += row.tokens;
                entry.1.insert(row.path.as_str());
            }
            (ChildrenMode::Collapse, FileKind::Child) => {
                // Only count child rows with no parent row, to avoid double
                // counting. Child rows carry no bytes/tokens (see insert_row
                // call sites), so only code/lines are added.
                if !parent_lang_by_path.contains_key(row.path.as_str()) {
                    let entry = by_lang
                        .entry((row.lang.as_str(), false))
                        .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                    entry.0.code += row.code;
                    entry.0.lines += row.lines;
                    entry.1.insert(row.path.as_str());
                }
            }
            (ChildrenMode::Separate, FileKind::Parent) => {
                // Remove the embedded portion from the parent language;
                // saturating_sub guards against child totals exceeding the
                // parent's own counts.
                let (child_code, child_lines) = child_totals_by_path
                    .get(row.path.as_str())
                    .copied()
                    .unwrap_or((0, 0));

                let entry = by_lang
                    .entry((row.lang.as_str(), false))
                    .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                entry.0.code += row.code.saturating_sub(child_code);
                entry.0.lines += row.lines.saturating_sub(child_lines);
                entry.0.bytes += row.bytes;
                entry.0.tokens += row.tokens;
                entry.1.insert(row.path.as_str());
            }
            (ChildrenMode::Separate, FileKind::Child) => {
                // Embedded stats land in their own "(embedded)" bucket.
                let entry = by_lang
                    .entry((row.lang.as_str(), true))
                    .or_insert_with(|| (LangAgg::default(), BTreeSet::new()));
                entry.0.code += row.code;
                entry.0.lines += row.lines;
                entry.1.insert(row.path.as_str());
            }
        }
    }

    let mut rows: Vec<LangRow> = Vec::with_capacity(by_lang.len());
    for ((lang, is_embedded), (agg, files_set)) in by_lang {
        // Languages with no code (e.g. only blanks/comments) are dropped.
        if agg.code == 0 {
            continue;
        }
        let files = files_set.len();
        rows.push(LangRow {
            lang: if is_embedded {
                format!("{} (embedded)", lang)
            } else {
                lang.to_string()
            },
            code: agg.code,
            lines: agg.lines,
            files,
            bytes: agg.bytes,
            tokens: agg.tokens,
            avg_lines: avg(agg.lines, files),
        });
    }

    // Deterministic order: code descending, then language name ascending.
    rows.sort_by(|a, b| b.code.cmp(&a.code).then_with(|| a.lang.cmp(&b.lang)));

    // Totals over ALL rows, computed before any top-N folding below.
    let total_code: usize = rows.iter().map(|r| r.code).sum();
    let total_lines: usize = rows.iter().map(|r| r.lines).sum();
    let total_bytes: usize = rows.iter().map(|r| r.bytes).sum();
    let total_tokens: usize = rows.iter().map(|r| r.tokens).sum();
    // File total counts unique parent paths, not per-language file counts
    // (a file contributing to several languages is counted once).
    let total_files = unique_parent_file_count_from_rows(file_rows);

    let total = Totals {
        code: total_code,
        lines: total_lines,
        files: total_files,
        bytes: total_bytes,
        tokens: total_tokens,
        avg_lines: avg(total_lines, total_files),
    };

    if top > 0 && rows.len() > top {
        let other = fold_other_lang(&rows[top..]);
        rows.truncate(top);
        rows.push(other);
    }

    LangReport {
        rows,
        total,
        with_files,
        children,
        top,
    }
}
447
448fn fold_other_lang(rows: &[LangRow]) -> LangRow {
449    let mut code = 0usize;
450    let mut lines = 0usize;
451    let mut files = 0usize;
452    let mut bytes = 0usize;
453    let mut tokens = 0usize;
454
455    for r in rows {
456        code += r.code;
457        lines += r.lines;
458        files += r.files;
459        bytes += r.bytes;
460        tokens += r.tokens;
461    }
462
463    LangRow {
464        lang: "Other".to_string(),
465        code,
466        lines,
467        files,
468        bytes,
469        tokens,
470        avg_lines: avg(lines, files),
471    }
472}
473
474pub fn create_module_report(
475    languages: &Languages,
476    module_roots: &[String],
477    module_depth: usize,
478    children: ChildIncludeMode,
479    top: usize,
480) -> ModuleReport {
481    let file_rows = collect_file_rows(languages, module_roots, module_depth, children, None);
482    create_module_report_from_rows(&file_rows, module_roots, module_depth, children, top)
483}
484
/// Aggregate per-file rows into a per-module report.
///
/// Every row (parent and child alike) contributes its numbers to its module,
/// but only parent rows contribute to the per-module file count. When
/// `top > 0` the table is truncated and the tail folded into "Other";
/// totals are accumulated over all rows regardless.
pub fn create_module_report_from_rows(
    file_rows: &[FileRow],
    module_roots: &[String],
    module_depth: usize,
    children: ChildIncludeMode,
    top: usize,
) -> ModuleReport {
    // Per-module accumulator, local to this function.
    #[derive(Default)]
    struct Agg {
        code: usize,
        lines: usize,
        bytes: usize,
        tokens: usize,
    }

    let mut by_module: BTreeMap<&str, (Agg, BTreeSet<&str>)> = BTreeMap::new();
    let mut total_code = 0;
    let mut total_lines = 0;
    let mut total_bytes = 0;
    let mut total_tokens = 0;

    for r in file_rows {
        // Grand totals include child rows too (child bytes/tokens are 0).
        total_code += r.code;
        total_lines += r.lines;
        total_bytes += r.bytes;
        total_tokens += r.tokens;

        let entry = by_module
            .entry(r.module.as_str())
            .or_insert_with(|| (Agg::default(), BTreeSet::new()));
        entry.0.code += r.code;
        entry.0.lines += r.lines;
        entry.0.bytes += r.bytes;
        entry.0.tokens += r.tokens;

        // Only parent rows count as files; children share the parent's path.
        if r.kind == FileKind::Parent {
            entry.1.insert(r.path.as_str());
        }
    }

    let mut rows: Vec<ModuleRow> = Vec::with_capacity(by_module.len());
    for (module, (agg, files_set)) in by_module {
        let files = files_set.len();
        rows.push(ModuleRow {
            module: module.to_string(),
            code: agg.code,
            lines: agg.lines,
            files,
            bytes: agg.bytes,
            tokens: agg.tokens,
            avg_lines: avg(agg.lines, files),
        });
    }

    // Sort descending by code, then by module name for determinism.
    rows.sort_by(|a, b| b.code.cmp(&a.code).then_with(|| a.module.cmp(&b.module)));

    if top > 0 && rows.len() > top {
        let other = fold_other_module(&rows[top..]);
        rows.truncate(top);
        rows.push(other);
    }

    // Unique parent paths across the whole input, not a sum of per-module counts.
    let total_files = unique_parent_file_count_from_rows(file_rows);

    let total = Totals {
        code: total_code,
        lines: total_lines,
        files: total_files,
        bytes: total_bytes,
        tokens: total_tokens,
        avg_lines: avg(total_lines, total_files),
    };

    ModuleReport {
        rows,
        total,
        module_roots: module_roots.to_vec(),
        module_depth,
        children,
        top,
    }
}
568
569fn fold_other_module(rows: &[ModuleRow]) -> ModuleRow {
570    let mut code = 0usize;
571    let mut lines = 0usize;
572    let mut files = 0usize;
573    let mut bytes = 0usize;
574    let mut tokens = 0usize;
575
576    for r in rows {
577        code += r.code;
578        lines += r.lines;
579        files += r.files;
580        bytes += r.bytes;
581        tokens += r.tokens;
582    }
583
584    ModuleRow {
585        module: "Other".to_string(),
586        code,
587        lines,
588        files,
589        bytes,
590        tokens,
591        avg_lines: avg(lines, files),
592    }
593}
594
595pub fn create_export_data(
596    languages: &Languages,
597    module_roots: &[String],
598    module_depth: usize,
599    children: ChildIncludeMode,
600    strip_prefix: Option<&Path>,
601    min_code: usize,
602    max_rows: usize,
603) -> ExportData {
604    let rows = collect_file_rows(
605        languages,
606        module_roots,
607        module_depth,
608        children,
609        strip_prefix,
610    );
611    create_export_data_from_rows(
612        rows,
613        module_roots,
614        module_depth,
615        children,
616        min_code,
617        max_rows,
618    )
619}
620
621pub fn create_export_data_from_rows(
622    mut rows: Vec<FileRow>,
623    module_roots: &[String],
624    module_depth: usize,
625    children: ChildIncludeMode,
626    min_code: usize,
627    max_rows: usize,
628) -> ExportData {
629    // Filter and sort for determinism.
630    if min_code > 0 {
631        rows.retain(|r| r.code >= min_code);
632    }
633    rows.sort_by(|a, b| b.code.cmp(&a.code).then_with(|| a.path.cmp(&b.path)));
634
635    if max_rows > 0 && rows.len() > max_rows {
636        rows.truncate(max_rows);
637    }
638
639    ExportData {
640        rows,
641        module_roots: module_roots.to_vec(),
642        module_depth,
643        children,
644    }
645}
646
/// Collect per-file contributions, optionally including embedded language reports.
///
/// This returns one row per (path, lang, kind), aggregated if tokei produced multiple
/// reports for the same tuple.
pub fn collect_file_rows(
    languages: &Languages,
    module_roots: &[String],
    module_depth: usize,
    children: ChildIncludeMode,
    strip_prefix: Option<&Path>,
) -> Vec<FileRow> {
    let mut map = BTreeMap::new();

    // Parent reports
    for (lang_type, lang) in languages.iter() {
        for report in &lang.reports {
            let path = normalize_path(&report.name, strip_prefix);
            let module = module_key_from_normalized(&path, module_roots, module_depth);
            let st = report.stats.summarise();
            // Size metrics come from the filesystem; (0, 0) if unreadable.
            let (bytes, tokens) = get_file_metrics(&report.name);
            insert_row(
                &mut map,
                Key {
                    path,
                    lang: lang_type.name(),
                    kind: FileKind::Parent,
                },
                module,
                &st,
                bytes,
                tokens,
            );
        }
    }

    // Child (embedded-language) reports, only when the caller wants them as
    // separate rows. They carry no bytes/tokens — the parent row owns those.
    if children == ChildIncludeMode::Separate {
        for (_lang_type, lang) in languages.iter() {
            for (child_type, reports) in &lang.children {
                for report in reports {
                    let path = normalize_path(&report.name, strip_prefix);
                    let module = module_key_from_normalized(&path, module_roots, module_depth);
                    let st = report.stats.summarise();
                    insert_row(
                        &mut map,
                        Key {
                            path,
                            lang: child_type.name(),
                            kind: FileKind::Child,
                        },
                        module,
                        &st,
                        0,
                        0,
                    );
                }
            }
        }
    }

    rows_from_map(map)
}
708
709pub fn unique_parent_file_count(languages: &Languages) -> usize {
710    let rows = collect_file_rows(languages, &[], 1, ChildIncludeMode::ParentsOnly, None);
711    unique_parent_file_count_from_rows(&rows)
712}
713
714pub fn unique_parent_file_count_from_rows(file_rows: &[FileRow]) -> usize {
715    file_rows
716        .iter()
717        .filter(|row| row.kind == FileKind::Parent)
718        .map(|row| row.path.as_str())
719        .collect::<BTreeSet<_>>()
720        .len()
721}
722
/// Compute the average of `lines` over `files`, rounding to nearest integer.
///
/// Returns 0 if `files` is zero.
///
/// # Examples
///
/// ```
/// use tokmd_model::avg;
///
/// assert_eq!(avg(300, 3), 100);
/// assert_eq!(avg(0, 5), 0);
/// assert_eq!(avg(100, 0), 0);
/// // Rounds to nearest: 7 / 2 = 3.5 → 4
/// assert_eq!(avg(7, 2), 4);
/// ```
pub fn avg(lines: usize, files: usize) -> usize {
    match files {
        // Empty groups have no meaningful average.
        0 => 0,
        // Adding half the divisor before dividing rounds to nearest.
        n => (lines + n / 2) / n,
    }
}
745
/// Normalize a path for portable output.
///
/// - Uses `/` separators
/// - Strips leading `./`
/// - Optionally strips a user-provided prefix (after normalization)
///
/// # Examples
///
/// ```
/// use std::path::Path;
/// use tokmd_model::normalize_path;
///
/// // Normalizes backslashes to forward slashes
/// let p = Path::new("src\\main.rs");
/// assert_eq!(normalize_path(p, None), "src/main.rs");
///
/// // Strips a prefix
/// let p = Path::new("project/src/lib.rs");
/// let prefix = Path::new("project");
/// assert_eq!(normalize_path(&p, Some(&prefix)), "src/lib.rs");
/// ```
pub fn normalize_path(path: &Path, strip_prefix: Option<&Path>) -> String {
    let lossy = path.to_string_lossy();
    // Convert Windows separators up front so everything below sees `/` only.
    let full: Cow<str> = if lossy.contains('\\') {
        Cow::Owned(lossy.replace('\\', "/"))
    } else {
        lossy
    };

    // Drop a leading "./" first, so the prefix can match "src/" rather than "./src/".
    let mut rest: &str = full.strip_prefix("./").unwrap_or(&full);

    if let Some(prefix) = strip_prefix {
        let prefix_lossy = prefix.to_string_lossy();
        // Normalize the prefix the same way: no "./", forward slashes,
        // and a trailing '/' so it only matches whole path components.
        let prefix_trimmed = prefix_lossy.strip_prefix("./").unwrap_or(&prefix_lossy);
        let mut prefix_norm = prefix_trimmed.replace('\\', "/");
        if !prefix_norm.ends_with('/') {
            prefix_norm.push('/');
        }
        if let Some(stripped) = rest.strip_prefix(prefix_norm.as_str()) {
            rest = stripped;
        }
    }

    rest = rest.trim_start_matches('/');
    // After trimming slashes, we might be left with a leading "./" (e.g. from "/./").
    rest = rest.strip_prefix("./").unwrap_or(rest);
    rest = rest.trim_start_matches('/');

    rest.to_string()
}
829
/// Compute a "module key" from an input path.
///
/// Rules:
/// - Root-level files become "(root)".
/// - If the first directory segment is in `module_roots`, join `module_depth` *directory* segments.
/// - Otherwise, module key is the top-level directory.
///
/// # Examples
///
/// ```
/// use tokmd_model::module_key;
///
/// let roots = vec!["crates".to_string()];
/// assert_eq!(module_key("crates/foo/src/lib.rs", &roots, 2), "crates/foo");
/// assert_eq!(module_key("src/lib.rs", &roots, 2), "src");
/// assert_eq!(module_key("Cargo.toml", &roots, 2), "(root)");
/// ```
pub fn module_key(path: &str, module_roots: &[String], module_depth: usize) -> String {
    // Thin wrapper delegating to the `module_key` submodule implementation.
    module_key::module_key(path, module_roots, module_depth)
}
850
851#[cfg(test)]
852mod tests {
853    use super::*;
854    use std::path::PathBuf;
855
856    #[test]
857    fn module_key_root_level_file() {
858        assert_eq!(module_key("Cargo.toml", &["crates".into()], 2), "(root)");
859        assert_eq!(module_key("./Cargo.toml", &["crates".into()], 2), "(root)");
860    }
861
862    #[test]
863    fn module_key_crates_depth_2() {
864        let roots = vec!["crates".into(), "packages".into()];
865        assert_eq!(module_key("crates/foo/src/lib.rs", &roots, 2), "crates/foo");
866        assert_eq!(
867            module_key("packages/bar/src/main.rs", &roots, 2),
868            "packages/bar"
869        );
870    }
871
872    #[test]
873    fn module_key_crates_depth_1() {
874        let roots = vec!["crates".into(), "packages".into()];
875        assert_eq!(module_key("crates/foo/src/lib.rs", &roots, 1), "crates");
876    }
877
878    #[test]
879    fn module_key_non_root() {
880        let roots = vec!["crates".into()];
881        assert_eq!(module_key("src/lib.rs", &roots, 2), "src");
882        assert_eq!(module_key("tools/gen.rs", &roots, 2), "tools");
883    }
884
885    #[test]
886    fn module_key_depth_overflow_does_not_include_filename() {
887        let roots = vec!["crates".into()];
888        // File directly under a root: depth=2 should NOT include the filename
889        assert_eq!(module_key("crates/foo.rs", &roots, 2), "crates");
890        // Depth exceeds available directories: should stop at deepest directory
891        assert_eq!(
892            module_key("crates/foo/src/lib.rs", &roots, 10),
893            "crates/foo/src"
894        );
895    }
896
897    #[test]
898    fn normalize_path_strips_prefix() {
899        let p = PathBuf::from("C:/Code/Repo/src/main.rs");
900        let prefix = PathBuf::from("C:/Code/Repo");
901        let got = normalize_path(&p, Some(&prefix));
902        assert_eq!(got, "src/main.rs");
903    }
904
905    #[test]
906    fn normalize_path_normalization_slashes() {
907        let p = PathBuf::from(r"C:\Code\Repo\src\main.rs");
908        let got = normalize_path(&p, None);
909        assert_eq!(got, "C:/Code/Repo/src/main.rs");
910    }
911
912    mod normalize_properties {
913        use super::*;
914        use proptest::prelude::*;
915
916        fn arb_path_component() -> impl Strategy<Value = String> {
917            "[a-zA-Z0-9_.-]+"
918        }
919
920        fn arb_path(max_depth: usize) -> impl Strategy<Value = String> {
921            prop::collection::vec(arb_path_component(), 1..=max_depth)
922                .prop_map(|comps| comps.join("/"))
923        }
924
925        proptest! {
926            #[test]
927            fn normalize_path_is_idempotent(path in arb_path(5)) {
928                let p = PathBuf::from(&path);
929                let norm1 = normalize_path(&p, None);
930                let p2 = PathBuf::from(&norm1);
931                let norm2 = normalize_path(&p2, None);
932                prop_assert_eq!(norm1, norm2);
933            }
934
935            #[test]
936            fn normalize_path_handles_windows_separators(path in arb_path(5)) {
937                let win_path = path.replace('/', "\\");
938                let p_win = PathBuf::from(&win_path);
939                let p_unix = PathBuf::from(&path);
940
941                let norm_win = normalize_path(&p_win, None);
942                let norm_unix = normalize_path(&p_unix, None);
943
944                prop_assert_eq!(norm_win, norm_unix);
945            }
946
947            #[test]
948            fn normalize_path_no_leading_slash(path in arb_path(5)) {
949                let p = PathBuf::from(&path);
950                let norm = normalize_path(&p, None);
951                prop_assert!(!norm.starts_with('/'));
952            }
953
954            #[test]
955            fn normalize_path_no_leading_dot_slash(path in arb_path(5)) {
956                let p = PathBuf::from(&path);
957                let norm = normalize_path(&p, None);
958                prop_assert!(!norm.starts_with("./"));
959            }
960
961            #[test]
962            fn module_key_deterministic(
963                path in arb_path(5),
964                roots in prop::collection::vec(arb_path_component(), 1..3),
965                depth in 1usize..5
966            ) {
967                let k1 = module_key(&path, &roots, depth);
968                let k2 = module_key(&path, &roots, depth);
969                prop_assert_eq!(k1, k2);
970            }
971        }
972    }
973
974    // Property-based tests for fold_other_* functions
975    mod fold_properties {
976        use super::*;
977        use proptest::prelude::*;
978
979        fn arb_lang_row() -> impl Strategy<Value = LangRow> {
980            (
981                "[a-zA-Z]+",
982                0usize..10000,
983                0usize..20000,
984                0usize..1000,
985                0usize..1000000,
986                0usize..100000,
987            )
988                .prop_map(|(lang, code, lines, files, bytes, tokens)| {
989                    let avg_lines = (lines + (files / 2)).checked_div(files).unwrap_or(0);
990                    LangRow {
991                        lang,
992                        code,
993                        lines,
994                        files,
995                        bytes,
996                        tokens,
997                        avg_lines,
998                    }
999                })
1000        }
1001
1002        fn arb_module_row() -> impl Strategy<Value = ModuleRow> {
1003            (
1004                "[a-zA-Z0-9_/]+",
1005                0usize..10000,
1006                0usize..20000,
1007                0usize..1000,
1008                0usize..1000000,
1009                0usize..100000,
1010            )
1011                .prop_map(|(module, code, lines, files, bytes, tokens)| {
1012                    let avg_lines = (lines + (files / 2)).checked_div(files).unwrap_or(0);
1013                    ModuleRow {
1014                        module,
1015                        code,
1016                        lines,
1017                        files,
1018                        bytes,
1019                        tokens,
1020                        avg_lines,
1021                    }
1022                })
1023        }
1024
1025        proptest! {
1026            #[test]
1027            fn fold_lang_preserves_totals(rows in prop::collection::vec(arb_lang_row(), 0..10)) {
1028                let folded = fold_other_lang(&rows);
1029
1030                let total_code: usize = rows.iter().map(|r| r.code).sum();
1031                let total_lines: usize = rows.iter().map(|r| r.lines).sum();
1032                let total_files: usize = rows.iter().map(|r| r.files).sum();
1033                let total_bytes: usize = rows.iter().map(|r| r.bytes).sum();
1034                let total_tokens: usize = rows.iter().map(|r| r.tokens).sum();
1035
1036                prop_assert_eq!(folded.code, total_code, "Code mismatch");
1037                prop_assert_eq!(folded.lines, total_lines, "Lines mismatch");
1038                prop_assert_eq!(folded.files, total_files, "Files mismatch");
1039                prop_assert_eq!(folded.bytes, total_bytes, "Bytes mismatch");
1040                prop_assert_eq!(folded.tokens, total_tokens, "Tokens mismatch");
1041            }
1042
1043            #[test]
1044            fn fold_lang_empty_is_zero(_dummy in 0..1u8) {
1045                let folded = fold_other_lang(&[]);
1046                prop_assert_eq!(folded.code, 0);
1047                prop_assert_eq!(folded.lines, 0);
1048                prop_assert_eq!(folded.files, 0);
1049                prop_assert_eq!(folded.bytes, 0);
1050                prop_assert_eq!(folded.tokens, 0);
1051                prop_assert_eq!(folded.lang, "Other");
1052            }
1053
1054            #[test]
1055            fn fold_module_preserves_totals(rows in prop::collection::vec(arb_module_row(), 0..10)) {
1056                let folded = fold_other_module(&rows);
1057
1058                let total_code: usize = rows.iter().map(|r| r.code).sum();
1059                let total_lines: usize = rows.iter().map(|r| r.lines).sum();
1060                let total_files: usize = rows.iter().map(|r| r.files).sum();
1061                let total_bytes: usize = rows.iter().map(|r| r.bytes).sum();
1062                let total_tokens: usize = rows.iter().map(|r| r.tokens).sum();
1063
1064                prop_assert_eq!(folded.code, total_code, "Code mismatch");
1065                prop_assert_eq!(folded.lines, total_lines, "Lines mismatch");
1066                prop_assert_eq!(folded.files, total_files, "Files mismatch");
1067                prop_assert_eq!(folded.bytes, total_bytes, "Bytes mismatch");
1068                prop_assert_eq!(folded.tokens, total_tokens, "Tokens mismatch");
1069            }
1070
1071            #[test]
1072            fn fold_module_empty_is_zero(_dummy in 0..1u8) {
1073                let folded = fold_other_module(&[]);
1074                prop_assert_eq!(folded.code, 0);
1075                prop_assert_eq!(folded.lines, 0);
1076                prop_assert_eq!(folded.files, 0);
1077                prop_assert_eq!(folded.bytes, 0);
1078                prop_assert_eq!(folded.tokens, 0);
1079                prop_assert_eq!(folded.module, "Other");
1080            }
1081
1082            #[test]
1083            fn fold_associative_lang(
1084                rows1 in prop::collection::vec(arb_lang_row(), 0..5),
1085                rows2 in prop::collection::vec(arb_lang_row(), 0..5)
1086            ) {
1087                // Folding all at once should equal folding parts and combining
1088                let all: Vec<_> = rows1.iter().chain(rows2.iter()).cloned().collect();
1089                let fold_all = fold_other_lang(&all);
1090
1091                let fold1 = fold_other_lang(&rows1);
1092                let fold2 = fold_other_lang(&rows2);
1093                let combined = fold_other_lang(&[fold1, fold2]);
1094
1095                prop_assert_eq!(fold_all.code, combined.code);
1096                prop_assert_eq!(fold_all.lines, combined.lines);
1097                prop_assert_eq!(fold_all.files, combined.files);
1098                prop_assert_eq!(fold_all.bytes, combined.bytes);
1099                prop_assert_eq!(fold_all.tokens, combined.tokens);
1100            }
1101        }
1102    }
1103
1104    #[test]
1105    fn test_looks_like_env_assignment() {
1106        assert!(looks_like_env_assignment("FOO=bar"));
1107        assert!(looks_like_env_assignment("_FOO=bar"));
1108        assert!(looks_like_env_assignment("A_B_C=123"));
1109
1110        assert!(!looks_like_env_assignment("="));
1111        assert!(!looks_like_env_assignment("=bar"));
1112        assert!(!looks_like_env_assignment("1FOO=bar"));
1113        assert!(!looks_like_env_assignment("FOO-BAR=baz"));
1114    }
1115
1116    #[test]
1117    fn test_env_interpreter_token() {
1118        // Simple case
1119        assert_eq!(
1120            env_interpreter_token(vec!["python"].into_iter()),
1121            Some("python")
1122        );
1123
1124        // Skip env assignments
1125        assert_eq!(
1126            env_interpreter_token(vec!["FOO=bar", "python"].into_iter()),
1127            Some("python")
1128        );
1129
1130        // Skip common env flags without args
1131        assert_eq!(
1132            env_interpreter_token(vec!["-S", "-i", "python"].into_iter()),
1133            Some("python")
1134        );
1135
1136        // Skip flags with next argument
1137        assert_eq!(
1138            env_interpreter_token(vec!["-u", "FOO", "-C", "/tmp", "python"].into_iter()),
1139            Some("python")
1140        );
1141        assert_eq!(
1142            env_interpreter_token(vec!["--unset", "FOO", "python"].into_iter()),
1143            Some("python")
1144        );
1145
1146        // Skip long flags with = assignment
1147        assert_eq!(
1148            env_interpreter_token(vec!["--unset=FOO", "python"].into_iter()),
1149            Some("python")
1150        );
1151        assert_eq!(
1152            env_interpreter_token(vec!["--chdir=/tmp", "python"].into_iter()),
1153            Some("python")
1154        );
1155        assert_eq!(
1156            env_interpreter_token(vec!["--default-path=/bin", "python"].into_iter()),
1157            Some("python")
1158        );
1159        assert_eq!(
1160            env_interpreter_token(vec!["--argv0=sh", "python"].into_iter()),
1161            Some("python")
1162        );
1163        assert_eq!(
1164            env_interpreter_token(vec!["--default-signal=SIGINT", "python"].into_iter()),
1165            Some("python")
1166        );
1167        assert_eq!(
1168            env_interpreter_token(vec!["--ignore-signal=SIGINT", "python"].into_iter()),
1169            Some("python")
1170        );
1171        assert_eq!(
1172            env_interpreter_token(vec!["--block-signal=SIGINT", "python"].into_iter()),
1173            Some("python")
1174        );
1175
1176        // Unknown flags starting with - are skipped (mimicking coreutils env behavior)
1177        assert_eq!(
1178            env_interpreter_token(vec!["--unknown-flag", "python"].into_iter()),
1179            Some("python")
1180        );
1181
1182        // Empty words
1183        assert_eq!(
1184            env_interpreter_token(vec!["", "python"].into_iter()),
1185            Some("python")
1186        );
1187
1188        // No interpreter found
1189        assert_eq!(env_interpreter_token(vec!["FOO=bar"].into_iter()), None);
1190    }
1191}