dupe-core 0.1.0

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across JavaScript/TypeScript, Python, and Rust codebases using Tree-sitter parsing,
//! Rabin-Karp/MinHash algorithms, and parallel processing.

mod queries;
mod parsing;
mod hashing;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};
pub use hashing::{normalize, Token, RollingHash, compute_rolling_hashes, CloneMatch, detect_duplicates_with_extension};

use anyhow::{Context, Result, anyhow};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use tree_sitter::Language;

/// Represents a detected duplicate code fragment
///
/// One value is produced for every clone found between two functions that
/// live in different files.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    /// Path of the file containing the first function of the pair
    pub file1: String,
    /// Path of the file containing the second function of the pair
    pub file2: String,
    /// Start line of the enclosing function in `file1`
    /// (the function's start line, not the first line of the cloned tokens)
    pub start_line1: usize,
    /// Start line of the enclosing function in `file2`
    /// (the function's start line, not the first line of the cloned tokens)
    pub start_line2: usize,
    /// Length of the clone, measured in normalized tokens
    pub length: usize,
    /// Similarity score; the scanner in this file always reports `1.0` (exact match)
    pub similarity: f64,
    /// Hash of the matched token sequence, taken from the first function's tokens
    pub hash: u64,
}

/// Represents a function with its tokens for duplicate detection
///
/// Built once per extracted function while a file is processed; the location
/// fields are copied from the extracted function node.
#[derive(Debug, Clone)]
struct FunctionHash {
    /// Path of the source file, converted lossily to a `String`
    file_path: String,
    /// Name of the function, when the extractor reported one
    function_name: Option<String>,
    /// Byte offset at which the function starts in the file
    start_byte: usize,
    /// Byte offset at which the function ends in the file
    end_byte: usize,
    /// Line on which the function starts
    start_line: usize,
    /// Line on which the function ends
    end_line: usize,
    /// Normalized token sequence of the function body, kept in full so a
    /// matching window can be extended token by token
    tokens: Vec<Token>,
}

/// Report containing scan results
///
/// Returned by `Scanner::scan` and by the free `find_duplicates*` functions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Number of supported source files that were collected for scanning
    /// (files that later failed to read or parse are still counted)
    pub files_scanned: usize,
    /// Number of functions that were kept for comparison, i.e. those with at
    /// least the configured minimum number of tokens
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    ///
    /// NOTE: line counting is not implemented yet; the scanner currently
    /// always reports `0` here.
    pub total_lines: usize,
    /// Total number of normalized tokens across all analyzed functions
    pub total_tokens: usize,
    /// Number of distinct hash values returned by `compute_rolling_hashes`
    /// over all analyzed functions
    pub unique_hashes: usize,
    /// Wall-clock duration of the scan in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
///
/// Holds the scan configuration only; it carries no per-scan state, so one
/// value can be reused (or cloned) for several scans.
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    ///
    /// Functions with fewer tokens are skipped, and this is also the width of
    /// the token window used to seed a match.
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    ///
    /// NOTE(review): stored as configured; the matching code visible in this
    /// file only reports exact matches and does not read it.
    similarity_threshold: f64,
}

impl Scanner {
    /// Default minimum clone size, in normalized tokens.
    const DEFAULT_MIN_BLOCK_SIZE: usize = 50;
    /// Default similarity threshold.
    const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.85;

    /// Creates a new Scanner with default settings
    /// (50-token minimum block size, 0.85 similarity threshold).
    pub fn new() -> Result<Self> {
        Self::with_config(
            Self::DEFAULT_MIN_BLOCK_SIZE,
            Self::DEFAULT_SIMILARITY_THRESHOLD,
        )
    }

    /// Creates a new Scanner with custom settings
    ///
    /// # Errors
    /// Returns an error when `min_block_size` is zero: a zero-width window is
    /// empty, so it would "match" at every position and produce meaningless
    /// zero-length clones.
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        if min_block_size == 0 {
            return Err(anyhow!("min_block_size must be at least 1 token"));
        }

        Ok(Self {
            min_block_size,
            similarity_threshold,
        })
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
    ///
    /// A file that cannot be read or parsed is skipped with a warning on
    /// stderr; it does not abort the scan.
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::collections::HashSet;
        use std::time::Instant;

        let start_time = Instant::now();

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Process files in parallel using Rayon. Per-file failures are
        // reported instead of being dropped silently.
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| match self.process_file(path) {
                Ok(hashes) => Some(hashes),
                Err(err) => {
                    eprintln!("Warning: Skipping {}: {:#}", path.display(), err);
                    None
                }
            })
            .flatten()
            .collect();

        // Find duplicates by comparing hashes
        let duplicates = self.find_duplicate_hashes(&function_hashes);

        // Calculate statistics
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        // Rolling hashes are recomputed here purely for the statistic.
        let unique_hashes = function_hashes
            .iter()
            .flat_map(|fh| compute_rolling_hashes(&fh.tokens, self.min_block_size))
            .map(|(hash, _)| hash)
            .collect::<HashSet<_>>()
            .len();

        // `as_millis` yields a u128; saturate rather than truncate.
        let duration_ms = u64::try_from(start_time.elapsed().as_millis()).unwrap_or(u64::MAX);

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines: 0, // TODO: Count lines
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Collects all source files from the given paths
    ///
    /// A path that is a file is taken as-is when its extension is supported.
    /// A directory is walked with the `ignore` crate, honouring `.gitignore`,
    /// the global gitignore, `.git/info/exclude` and `.ignore` files. Hidden
    /// entries are not skipped. Entries that cannot be accessed, and input
    /// paths that are neither a file nor a directory, produce a warning on
    /// stderr and are skipped.
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let candidate = entry.path();
                            if candidate.is_file() && self.is_supported_file(candidate) {
                                files.push(candidate.to_path_buf());
                            }
                        }
                        // Log but don't fail on individual entry errors
                        Err(err) => eprintln!("Warning: Failed to access path: {}", err),
                    }
                }
            } else {
                // Most likely a typo or a path that does not exist.
                eprintln!(
                    "Warning: Skipping path that is neither a file nor a directory: {}",
                    path.display()
                );
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file, judged by its extension
    /// only (`rs`, `py`, `js`, `ts`, `jsx`, `tsx`).
    fn is_supported_file(&self, path: &Path) -> bool {
        path.extension()
            .and_then(|ext| ext.to_str())
            .map_or(false, |ext| {
                matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
            })
    }

    /// Processes a single file and returns function hashes
    ///
    /// Reads the file as UTF-8, extracts its functions, normalizes each
    /// function body into tokens, and keeps the functions that have at least
    /// `min_block_size` tokens.
    ///
    /// # Errors
    /// Fails when the file cannot be read as UTF-8 text, when its extension is
    /// not supported, or when function extraction fails.
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        // `with_context` builds the message only on the error path.
        let code = fs::read_to_string(path)
            .with_context(|| format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path = path.to_string_lossy().to_string();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Normalize the function body
            let tokens = normalize(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: file_path.clone(),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    ///
    /// `ts` and `tsx` files are mapped to the JavaScript grammar, like `js`
    /// and `jsx`.
    ///
    /// # Errors
    /// Fails when the path has no UTF-8 extension or the extension is not one
    /// of the supported ones.
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Finds duplicate code using greedy extension algorithm
    ///
    /// Every pair of functions from different files is compared. Each clone
    /// found becomes one `DuplicateMatch` whose line numbers are the start
    /// lines of the two enclosing functions and whose `length` is in tokens.
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        use std::collections::hash_map::DefaultHasher;
        use std::collections::HashSet;
        use std::hash::{Hash, Hasher};

        let mut duplicates = Vec::new();
        let mut seen_pairs = HashSet::new();

        // Compare each pair of functions
        for (i, func1) in function_hashes.iter().enumerate() {
            for func2 in &function_hashes[i + 1..] {
                // Skip if same file
                if func1.file_path == func2.file_path {
                    continue;
                }

                // Use extension-based detection on each function's tokens
                for clone_match in self.find_clones_between_functions(func1, func2) {
                    // Token offsets are relative to their function, so the key
                    // must also identify the function (via its start byte).
                    // Otherwise two different function pairs from the same two
                    // files with equal offsets would collide, and a real
                    // duplicate would be dropped.
                    let side1 = (
                        func1.file_path.as_str(),
                        func1.start_byte,
                        clone_match.source_start,
                    );
                    let side2 = (
                        func2.file_path.as_str(),
                        func2.start_byte,
                        clone_match.target_start,
                    );
                    // Order the two sides by path so the key is symmetric.
                    let pair_key = if side1.0 < side2.0 {
                        (side1, side2, clone_match.length)
                    } else {
                        (side2, side1, clone_match.length)
                    };

                    // `insert` returns false when the key was already present.
                    if !seen_pairs.insert(pair_key) {
                        continue;
                    }

                    // Compute a hash for this match for reporting. It comes
                    // from `DefaultHasher`, whose algorithm is unspecified, so
                    // treat it as an opaque in-process identifier.
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.clone(),
                        file2: func2.file_path.clone(),
                        start_line1: func1.start_line,
                        start_line2: func2.start_line,
                        length: clone_match.length,
                        similarity: 1.0, // Exact match
                        hash: match_hash,
                    });
                }
            }
        }

        duplicates
    }

    /// Finds clone matches between two functions using extension algorithm
    ///
    /// Every `min_block_size`-token window of `func1` is indexed by hash. Each
    /// window of `func2` is then looked up; on a verified hit the match is
    /// extended forward token by token while both sides agree. Offsets in the
    /// returned matches are token indices relative to each function.
    ///
    /// Returns no matches when either function is shorter than the window.
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let window = self.min_block_size;
        let tokens1 = &func1.tokens;
        let tokens2 = &func2.tokens;

        // Without this guard the slicing below would run past the end of a
        // function shorter than the window (and `windows(0)` panics).
        if window == 0 || tokens1.len() < window || tokens2.len() < window {
            return Vec::new();
        }

        // Index all windows in func1
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
        for (pos, candidate) in tokens1.windows(window).enumerate() {
            hash_map
                .entry(self.compute_window_hash(candidate))
                .or_default()
                .push(pos);
        }

        // Search for matches in func2
        let mut matches = Vec::new();
        let last_start = tokens2.len() - window;
        let mut j = 0;
        while j <= last_start {
            let hash = self.compute_window_hash(&tokens2[j..j + window]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify exact match (guards against hash collisions)
                    if !self.verify_window_match(tokens1, tokens2, func1_pos, j, window) {
                        continue;
                    }

                    // Greedy extension: count how many tokens past the window
                    // still agree on both sides.
                    let extension = tokens1[func1_pos + window..]
                        .iter()
                        .zip(&tokens2[j + window..])
                        .take_while(|(left, right)| left == right)
                        .count();

                    matches.push(CloneMatch {
                        source_start: func1_pos,
                        target_start: j,
                        length: window + extension,
                    });

                    // Skip ahead past the extended part; together with the
                    // unconditional step below, `j` moves by
                    // `max(extension, 1) + 1`.
                    j += extension.max(1);
                    break;
                }
            }

            j += 1;
        }

        matches
    }

    /// Computes hash for a token window
    ///
    /// Polynomial hash (base 257, modulus 1_000_000_007) over the per-token
    /// `DefaultHasher` hashes of each token's `as_hash_string()`.
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        const BASE: u64 = 257;
        const MODULUS: u64 = 1_000_000_007;

        window.iter().fold(0u64, |acc, token| {
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            acc.wrapping_mul(BASE).wrapping_add(hasher.finish()) % MODULUS
        })
    }

    /// Verifies that two token windows are exactly identical
    ///
    /// Returns `false` when either window would run past the end of its
    /// token slice.
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        match (
            tokens1.get(idx1..idx1 + len),
            tokens2.get(idx2..idx2 + len),
        ) {
            (Some(left), Some(right)) => left == right,
            _ => false,
        }
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new().expect("Failed to initialize default Scanner")
    }
}

/// Public API: find duplicates under the given paths using the default
/// scanner settings.
///
/// # Arguments
/// * `paths` - Files and/or directories to scan; directories are walked
///   recursively
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
///
/// # Errors
/// Propagates any error from creating the scanner or from running the scan.
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new()?;
    scanner.scan(paths.into_iter().map(PathBuf::from).collect())
}

/// Public API with custom configuration
///
/// # Arguments
/// * `paths` - Files and/or directories to scan
/// * `min_block_size` - Minimum code block size to consider, in tokens
/// * `similarity_threshold` - Similarity threshold handed to the scanner
///
/// # Errors
/// Propagates any error from creating the scanner or from running the scan.
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    Scanner::with_config(min_block_size, similarity_threshold).and_then(|scanner| {
        let targets: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
        scanner.scan(targets)
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default constructor succeeds.
    #[test]
    fn test_scanner_creation() {
        assert!(Scanner::new().is_ok());
    }

    /// Custom settings are stored verbatim.
    #[test]
    fn test_scanner_with_config() {
        let built = Scanner::with_config(30, 0.9);
        assert!(built.is_ok());

        let Scanner {
            min_block_size,
            similarity_threshold,
        } = built.unwrap();
        assert_eq!(min_block_size, 30);
        assert_eq!(similarity_threshold, 0.9);
    }

    /// Scanning no paths succeeds and reports no duplicates.
    #[test]
    fn test_find_duplicates_empty() {
        let outcome = find_duplicates(Vec::new());
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap().duplicates.len(), 0);
    }

    /// Extension filter accepts source files and rejects everything else.
    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new().unwrap();

        for accepted in &["test.rs", "test.py", "test.js", "test.ts"] {
            assert!(scanner.is_supported_file(Path::new(accepted)));
        }
        for rejected in &["test.txt", "test.md"] {
            assert!(!scanner.is_supported_file(Path::new(rejected)));
        }
    }

    /// Language detection succeeds for known extensions and fails otherwise.
    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new().unwrap();

        for known in &["test.rs", "test.py", "test.js"] {
            assert!(scanner.detect_language(Path::new(known)).is_ok());
        }
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}