tldr-core 0.4.0

Core analysis engine for TLDR code analysis tool
Documentation
//! Clone detection module -- tree-sitter fragment extraction,
//! two-phase hash detection, and correct line numbers.

// All type definitions and utility functions
mod types;
pub use types::*;

// Submodules
pub mod detect;
pub mod extract;
pub mod filter;
pub mod tokenize;

use std::collections::HashSet;
use std::path::Path;
use std::time::Instant;

// Re-export is_test_file from filter
pub use filter::is_test_file;

use detect::PairKey;
use extract::extract_fragments_from_file;
use filter::{discover_source_files, get_language_from_path};
use tokenize::tokenize_file_v2;

/// schema-cleanup-v2 (P2.BUG-7): pick the dominant language string
/// across the discovered files. Tally the per-extension language label
/// from `get_language_from_path` and return the most-frequent one. Ties
/// resolve by the first language encountered (deterministic ordering on
/// the input slice — `discover_source_files` already returns files in
/// directory-walk order).
///
/// Returns the literal `"auto"` only when no file has a recognised
/// extension (degenerate case — `discover_source_files` should have
/// rejected those via `is_source_file_for_clones`, so this branch is
/// effectively unreachable in production).
fn resolve_dominant_language(files: &[std::path::PathBuf]) -> String {
    use std::collections::HashMap;
    let mut counts: HashMap<&'static str, usize> = HashMap::new();
    let mut order: Vec<&'static str> = Vec::new();
    for f in files {
        if let Some(lang) = get_language_from_path(f) {
            if !counts.contains_key(lang) {
                order.push(lang);
            }
            *counts.entry(lang).or_insert(0) += 1;
        }
    }
    // Pick the language with the highest count; on ties, the first
    // language seen (insertion order) wins.
    let mut best: Option<&'static str> = None;
    let mut best_count: usize = 0;
    for lang in &order {
        let c = counts[lang];
        if c > best_count {
            best_count = c;
            best = Some(lang);
        }
    }
    best.map(|s| s.to_string())
        .unwrap_or_else(|| "auto".to_string())
}

/// Detect code clones in a directory using the v2 pipeline.
///
/// Pipeline:
/// 1. Discover source files
/// 2. Tokenize each file
/// 3. Extract fragments per file (tree-sitter function boundaries)
/// 4. Detect Type-1/Type-2 clones (hash buckets)
/// 5. Detect Type-3 clones (inverted index)
/// 6. Assemble ClonesReport
pub fn detect_clones(path: &Path, options: &ClonesOptions) -> anyhow::Result<ClonesReport> {
    let start = Instant::now();

    // Step 1: Discover files
    let files = discover_source_files(
        path,
        options.language.as_deref(),
        options.max_files,
        options.exclude_generated,
        options.exclude_tests,
    );

    if files.is_empty() {
        return Ok(empty_report(path, options, &start));
    }

    // schema-cleanup-v2 (P2.BUG-7): resolve the report's `language` field
    // to the actual analyzed language string rather than the literal
    // `"auto"` placeholder. Pre-fix the field always echoed `"auto"` (or
    // the user's `--lang` flag verbatim), making it impossible for
    // consumers to programmatically tell what the autodetector actually
    // picked. Now we either honor the explicit `options.language` or
    // derive a single dominant language string from the discovered files
    // via `get_language_from_path` majority count. Falls back to `"auto"`
    // only when no file in `files` has a recognised extension (degenerate
    // case — `discover_source_files` should have rejected those files via
    // `is_source_file_for_clones`).
    let resolved_language = options
        .language
        .clone()
        .unwrap_or_else(|| resolve_dominant_language(&files));

    // Step 2: Tokenize files
    let file_tokens: Vec<tokenize::FileTokens> = files
        .iter()
        .filter_map(|f| tokenize_file_v2(f).ok())
        .collect();

    if file_tokens.is_empty() {
        return Ok(empty_report(path, options, &start));
    }

    // Step 3: Extract fragments
    let mut all_fragments: Vec<extract::FragmentData> = Vec::new();
    for (idx, ft) in file_tokens.iter().enumerate() {
        let frags = extract_fragments_from_file(
            ft,
            idx,
            options.min_tokens,
            options.min_lines,
            options.normalization,
        );
        all_fragments.extend(frags);
    }

    let total_tokens: usize = file_tokens.iter().map(|ft| ft.raw_tokens.len()).sum();

    if all_fragments.is_empty() {
        return Ok(ClonesReport {
            root: path.to_path_buf(),
            language: resolved_language.clone(),
            clone_pairs: vec![],
            clone_classes: vec![],
            stats: CloneStats {
                files_analyzed: file_tokens.len(),
                total_tokens,
                clones_found: 0,
                type1_count: 0,
                type2_count: 0,
                type3_count: 0,
                class_count: None,
                detection_time_ms: start.elapsed().as_millis() as u64,
            },
            config: CloneConfig {
                min_tokens: options.min_tokens,
                min_lines: options.min_lines,
                similarity_threshold: options.threshold,
                normalization: options.normalization,
                type_filter: options.type_filter,
            },
        });
    }

    // Step 4: Type-1 and Type-2 detection
    let mut found_pairs: HashSet<PairKey> = HashSet::new();
    let mut clone_pairs = detect::detect_type1_type2(&all_fragments, options, &mut found_pairs);

    // Step 5: Type-3 detection (if type_filter allows and not at max_clones)
    let should_detect_type3 = clone_pairs.len() < options.max_clones
        && options.type_filter.is_none_or(|t| t == CloneType::Type3);

    if should_detect_type3 {
        let type3_pairs = detect::detect_type3(&all_fragments, options, &mut found_pairs);
        clone_pairs.extend(type3_pairs);
    }

    // determinism-and-stderr-hygiene-v1 (BUG-2): both `detect_type1_type2`
    // and `detect_type3` walk `HashMap<u64, Vec<usize>>` /
    // `HashMap<usize, usize>` buckets via `.values()` / `.iter()`, whose
    // iteration order is non-deterministic across runs (DefaultHasher
    // seeds per-process). The same set of pairs is found, but the order
    // of `clone_pairs[]` shuffles across invocations, so two runs of
    // `tldr clones <repo>` produce byte-different stdout — breaking
    // CI byte-diff gates and any downstream tooling that hashes the
    // report. Sort here, BEFORE id assignment, so IDs are also stable.
    // Tiebreaker chain matches `ClonePair::canonical()` ordering
    // (`fragment1.file → fragment1.start_line → fragment2.file →
    // fragment2.start_line`) plus `clone_type` and `similarity` to
    // produce a total order over the bag.
    clone_pairs.sort_by(|a, b| {
        a.fragment1
            .file
            .cmp(&b.fragment1.file)
            .then_with(|| a.fragment1.start_line.cmp(&b.fragment1.start_line))
            .then_with(|| a.fragment1.end_line.cmp(&b.fragment1.end_line))
            .then_with(|| a.fragment2.file.cmp(&b.fragment2.file))
            .then_with(|| a.fragment2.start_line.cmp(&b.fragment2.start_line))
            .then_with(|| a.fragment2.end_line.cmp(&b.fragment2.end_line))
            .then_with(|| (a.clone_type as u8).cmp(&(b.clone_type as u8)))
            .then_with(|| {
                b.similarity
                    .partial_cmp(&a.similarity)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
    });

    // Assign sequential 1-indexed IDs
    for (i, pair) in clone_pairs.iter_mut().enumerate() {
        pair.id = i + 1;
    }

    // Compute statistics
    let type1_count = clone_pairs
        .iter()
        .filter(|p| p.clone_type == CloneType::Type1)
        .count();
    let type2_count = clone_pairs
        .iter()
        .filter(|p| p.clone_type == CloneType::Type2)
        .count();
    let type3_count = clone_pairs
        .iter()
        .filter(|p| p.clone_type == CloneType::Type3)
        .count();

    let (clone_classes, class_count) = if options.show_classes {
        let classes = compute_clone_classes_v2(&clone_pairs);
        let count = classes.len();
        (classes, Some(count))
    } else {
        (vec![], None)
    };

    Ok(ClonesReport {
        root: path.to_path_buf(),
        language: resolved_language,
        clone_pairs,
        clone_classes,
        stats: CloneStats {
            files_analyzed: file_tokens.len(),
            total_tokens,
            clones_found: type1_count + type2_count + type3_count,
            type1_count,
            type2_count,
            type3_count,
            class_count,
            detection_time_ms: start.elapsed().as_millis() as u64,
        },
        config: CloneConfig {
            min_tokens: options.min_tokens,
            min_lines: options.min_lines,
            similarity_threshold: options.threshold,
            normalization: options.normalization,
            type_filter: options.type_filter,
        },
    })
}

/// Compute clone classes from pairs using Union-Find (for show_classes=true).
fn compute_clone_classes_v2(pairs: &[ClonePair]) -> Vec<CloneClass> {
    use std::collections::HashMap;

    if pairs.is_empty() {
        return vec![];
    }

    // Map fragments to indices
    let mut fragment_map: HashMap<CloneFragment, usize> = HashMap::new();
    let mut fragments: Vec<CloneFragment> = Vec::new();

    for pair in pairs {
        if !fragment_map.contains_key(&pair.fragment1) {
            fragment_map.insert(pair.fragment1.clone(), fragments.len());
            fragments.push(pair.fragment1.clone());
        }
        if !fragment_map.contains_key(&pair.fragment2) {
            fragment_map.insert(pair.fragment2.clone(), fragments.len());
            fragments.push(pair.fragment2.clone());
        }
    }

    // Build Union-Find
    let mut uf = UnionFind::new(fragments.len());
    let mut pair_similarities: HashMap<(usize, usize), (f64, CloneType)> = HashMap::new();

    for pair in pairs {
        let idx1 = fragment_map[&pair.fragment1];
        let idx2 = fragment_map[&pair.fragment2];
        uf.union(idx1, idx2);
        pair_similarities.insert(
            (idx1.min(idx2), idx1.max(idx2)),
            (pair.similarity, pair.clone_type),
        );
    }

    // Extract components
    let components = uf.components();
    let mut classes: Vec<CloneClass> = Vec::new();
    let mut class_id = 1;

    for (_root, member_indices) in components {
        if member_indices.len() < 2 {
            continue;
        }

        let class_fragments: Vec<CloneFragment> = member_indices
            .iter()
            .map(|&i| fragments[i].clone())
            .collect();

        let mut total_sim = 0.0f64;
        let mut count = 0usize;
        let mut type_counts: HashMap<CloneType, usize> = HashMap::new();

        for i in 0..member_indices.len() {
            for j in (i + 1)..member_indices.len() {
                let key = (
                    member_indices[i].min(member_indices[j]),
                    member_indices[i].max(member_indices[j]),
                );
                if let Some(&(sim, ct)) = pair_similarities.get(&key) {
                    total_sim += sim;
                    count += 1;
                    *type_counts.entry(ct).or_insert(0) += 1;
                }
            }
        }

        let avg_similarity = if count > 0 {
            total_sim / count as f64
        } else {
            1.0
        };
        let dominant_type = type_counts
            .into_iter()
            .max_by_key(|&(_, c)| c)
            .map(|(t, _)| t)
            .unwrap_or(CloneType::Type1);

        classes.push(CloneClass {
            id: class_id,
            clone_type: dominant_type,
            avg_similarity,
            size: class_fragments.len(),
            fragments: class_fragments,
        });
        class_id += 1;
    }

    classes
}

/// Create an empty report for edge cases (no files, etc.)
fn empty_report(path: &Path, options: &ClonesOptions, start: &Instant) -> ClonesReport {
    ClonesReport {
        root: path.to_path_buf(),
        language: options
            .language
            .clone()
            .unwrap_or_else(|| "auto".to_string()),
        clone_pairs: vec![],
        clone_classes: vec![],
        stats: CloneStats {
            files_analyzed: 0,
            total_tokens: 0,
            clones_found: 0,
            type1_count: 0,
            type2_count: 0,
            type3_count: 0,
            class_count: None,
            detection_time_ms: start.elapsed().as_millis() as u64,
        },
        config: CloneConfig {
            min_tokens: options.min_tokens,
            min_lines: options.min_lines,
            similarity_threshold: options.threshold,
            normalization: options.normalization,
            type_filter: options.type_filter,
        },
    }
}