dupe_core/hashing.rs

1//! Hashing and normalization logic for duplicate code detection
2//!
3//! This module implements:
4//! - Token normalization (Type-2 clone detection)
5//! - Rabin-Karp rolling hash algorithm
6//! - Hash-based code similarity detection
7//! - Edit distance calculation (Type-3 clone detection)
8
9use std::num::Wrapping;
10
11/// Normalized token representation
12///
13/// Identifiers and literals are normalized to allow detection of
14/// structurally similar code (Type-2 clones).
15#[derive(Debug, Clone, PartialEq, Eq, Hash)]
16pub enum Token {
17    /// Keyword (if, for, while, fn, def, etc.)
18    Keyword(String),
19    /// Normalized identifier placeholder
20    Identifier,
21    /// Normalized string literal placeholder
22    StringLiteral,
23    /// Normalized number literal placeholder
24    NumberLiteral,
25    /// Operator (+, -, *, /, etc.)
26    Operator(String),
27    /// Punctuation (parentheses, braces, semicolons)
28    Punctuation(String),
29}
30
31impl Token {
32    /// Returns a string representation for hashing
33    pub fn as_hash_string(&self) -> &str {
34        match self {
35            Token::Keyword(kw) => kw.as_str(),
36            Token::Identifier => "$$ID",
37            Token::StringLiteral => "$$STR",
38            Token::NumberLiteral => "$$NUM",
39            Token::Operator(op) => op.as_str(),
40            Token::Punctuation(p) => p.as_str(),
41        }
42    }
43}
44
45/// Normalizes source code into a token stream for duplicate detection
46///
47/// # Normalization Rules
48/// - Comments are ignored
49/// - Whitespace is ignored
50/// - Identifiers → `$$ID`
51/// - String literals → `$$STR`
52/// - Number literals → `$$NUM`
53/// - Keywords are preserved
54///
55/// # Arguments
56/// * `code` - The source code to normalize
57///
58/// # Returns
59/// * `Vec<Token>` - Normalized token stream
60pub fn normalize(code: &str) -> Vec<Token> {
61    let (tokens, _) = normalize_with_line_numbers(code);
62    tokens
63}
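
// Added illustrative sketch (not part of the original module): a small test showing the
// Type-2 property documented above, namely that two snippets differing only in identifier
// names normalize to identical token streams. The module name and snippets are ours.
#[cfg(test)]
mod normalize_type2_example {
    use super::*;

    #[test]
    fn renamed_identifiers_normalize_identically() {
        let a = "fn add(x: i32, y: i32) -> i32 { x + y }";
        let b = "fn sum(left: i32, right: i32) -> i32 { left + right }";
        // Same structure, different names: the normalized streams are equal.
        assert_eq!(normalize(a), normalize(b));
    }
}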
64
65/// Normalizes source code into tokens while tracking line offsets
66///
67/// Returns both the normalized tokens and the zero-based line offset (relative
68/// to the start of `code`) where each token begins.
69pub fn normalize_with_line_numbers(code: &str) -> (Vec<Token>, Vec<usize>) {
70    let keywords = get_keyword_set();
71    // Pre-allocate capacity based on heuristic: ~1 token per 3 characters
72    let estimated_tokens = code.len() / 3;
73    let mut tokens = Vec::with_capacity(estimated_tokens);
74    let mut line_offsets = Vec::with_capacity(estimated_tokens);
75    let chars: Vec<char> = code.chars().collect();
76    let mut i = 0;
77    let mut line = 0;
78
79    while i < chars.len() {
80        let ch = chars[i];
81
82        // Skip whitespace
83        if ch.is_whitespace() {
84            if ch == '\n' {
85                line += 1;
86            }
87            i += 1;
88            continue;
89        }
90
91        // Skip single-line comments (// and #)
92        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
93            // Skip until end of line
94            while i < chars.len() && chars[i] != '\n' {
95                i += 1;
96            }
97            continue;
98        }
99
100        // Skip multi-line comments (/* */); Python """ docstrings fall through to the string-literal branch below
101        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
102            i += 2;
103            while i + 1 < chars.len() {
104                if chars[i] == '\n' {
105                    line += 1;
106                }
107                if chars[i] == '*' && chars[i + 1] == '/' {
108                    i += 2;
109                    break;
110                }
111                i += 1;
112            }
113            continue;
114        }
115
116        // String literals
117        if ch == '"' || ch == '\'' {
118            let quote = ch;
119            let token_line = line;
120            i += 1;
121            // Skip until closing quote
122            while i < chars.len() {
123                if chars[i] == '\\' {
124                    i += 2; // Skip escaped character
125                    continue;
126                }
127                if chars[i] == '\n' {
128                    line += 1;
129                }
130                if chars[i] == quote {
131                    i += 1;
132                    break;
133                }
134                i += 1;
135            }
136            tokens.push(Token::StringLiteral);
137            line_offsets.push(token_line);
138            continue;
139        }
140
141        // Numbers
142        if ch.is_ascii_digit() {
143            let token_line = line;
144            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
145                i += 1;
146            }
147            tokens.push(Token::NumberLiteral);
148            line_offsets.push(token_line);
149            continue;
150        }
151
152        // Identifiers and keywords
153        if ch.is_alphabetic() || ch == '_' || ch == '$' {
154            let token_line = line;
155            let start = i;
156            while i < chars.len()
157                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
158            {
159                i += 1;
160            }
161            let word: String = chars[start..i].iter().collect();
162
163            if keywords.contains(&word.as_str()) {
164                tokens.push(Token::Keyword(word));
165            } else {
166                tokens.push(Token::Identifier);
167            }
168            line_offsets.push(token_line);
169            continue;
170        }
171
172        // Operators (multi-char)
173        if i + 1 < chars.len() {
174            let two_char: String = chars[i..i + 2].iter().collect();
175            if is_operator(&two_char) {
176                let token_line = line;
177                tokens.push(Token::Operator(two_char));
178                i += 2;
179                line_offsets.push(token_line);
180                continue;
181            }
182        }
183
184        // Single-char operators and punctuation
185        let single = ch.to_string();
186        if is_operator(&single) {
187            let token_line = line;
188            tokens.push(Token::Operator(single));
189            line_offsets.push(token_line);
190        } else if is_punctuation(ch) {
191            let token_line = line;
192            tokens.push(Token::Punctuation(single));
193            line_offsets.push(token_line);
194        }
195
196        i += 1;
197    }
198
199    (tokens, line_offsets)
200}
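
// Added illustrative sketch (ours): line offsets are zero-based and advance at each '\n',
// so tokens on the second input line report offset 1.
#[cfg(test)]
mod line_offset_example {
    use super::*;

    #[test]
    fn offsets_follow_newlines() {
        let code = "let x = 1;\nlet y = 2;";
        let (tokens, offsets) = normalize_with_line_numbers(code);
        assert_eq!(tokens.len(), offsets.len());
        // The first five tokens (`let x = 1 ;`) start on line 0, the remaining five on line 1.
        assert!(offsets[..5].iter().all(|&line| line == 0));
        assert!(offsets[5..].iter().all(|&line| line == 1));
    }
}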
201
202/// Rabin-Karp rolling hash for efficient substring comparison
203///
204/// Uses a rolling window to compute hashes of code blocks.
205/// Allows for efficient duplicate detection in O(n) time.
206#[derive(Debug, Clone)]
207pub struct RollingHash {
208    /// Window size for rolling hash
209    window_size: usize,
210    /// Base for polynomial rolling hash
211    base: Wrapping<u64>,
212    /// Current hash value
213    hash: Wrapping<u64>,
214    /// Power of base for window size (base^window_size)
215    base_power: Wrapping<u64>,
216    /// Current window contents
217    window: Vec<u64>,
218}
219
220impl RollingHash {
221    /// Creates a new RollingHash with the specified window size
222    ///
223    /// # Arguments
224    /// * `window_size` - Number of tokens in the rolling window (default: 50)
225    pub fn new(window_size: usize) -> Self {
226        let base = Wrapping(257u64);
227        let mut base_power = Wrapping(1u64);
228
229        // Calculate base^window_size
230        for _ in 0..window_size {
231            base_power *= base;
232        }
233
234        Self {
235            window_size,
236            base,
237            hash: Wrapping(0),
238            base_power,
239            window: Vec::with_capacity(window_size),
240        }
241    }
242
243    /// Adds a token to the rolling hash window
244    ///
245    /// If the window is full, the oldest token is removed.
246    ///
247    /// # Arguments
248    /// * `token_hash` - Hash value of the token to add
249    ///
250    /// # Returns
251    /// * `Option<u64>` - The current hash if window is full, None otherwise
252    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
253        if self.window.len() < self.window_size {
254            // Window not full yet
255            self.window.push(token_hash);
256            self.hash = self.hash * self.base + Wrapping(token_hash);
257
258            if self.window.len() == self.window_size {
259                Some(self.hash.0)
260            } else {
261                None
262            }
263        } else {
264            // Window is full, remove oldest and add new
265            let old_token = self.window.remove(0);
266            self.window.push(token_hash);
267
268            // Shift the window and add the new token first,
269            self.hash = self.hash * self.base + Wrapping(token_hash);
270            // then remove the expired token's contribution (old_token * base^window_size)
271            self.hash -= Wrapping(old_token) * self.base_power;
272
273            Some(self.hash.0)
274        }
275    }
276
277    /// Resets the rolling hash to initial state
278    pub fn reset(&mut self) {
279        self.hash = Wrapping(0);
280        self.window.clear();
281    }
282
283    /// Returns the current hash value (if window is full)
284    pub fn current_hash(&self) -> Option<u64> {
285        if self.window.len() == self.window_size {
286            Some(self.hash.0)
287        } else {
288            None
289        }
290    }
291
292    /// Returns the current window size
293    pub fn window_size(&self) -> usize {
294        self.window_size
295    }
296}
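
// Added illustrative sketch (ours): with the polynomial update above, identical windows seen
// at different positions must produce identical hashes. Window size 2 and the token hash
// values 7/9 are arbitrary example inputs.
#[cfg(test)]
mod rolling_hash_window_example {
    use super::*;

    #[test]
    fn equal_windows_hash_equally() {
        let mut hasher = RollingHash::new(2);
        assert_eq!(hasher.roll(7), None); // window: [7]
        let first = hasher.roll(9); // window: [7, 9]
        assert!(hasher.roll(7).is_some()); // window: [9, 7]
        let second = hasher.roll(9); // window: [7, 9] again
        assert!(first.is_some());
        assert_eq!(first, second);
    }
}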
297
298/// Computes rolling hashes for a token stream
299///
300/// # Arguments
301/// * `tokens` - Normalized token stream
302/// * `window_size` - Size of the rolling window
303///
304/// # Returns
305/// * `Vec<(u64, usize)>` - List of (hash, start_index) pairs
306pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
307    if tokens.len() < window_size {
308        return Vec::new();
309    }
310
311    let mut hasher = RollingHash::new(window_size);
312    let mut hashes = Vec::new();
313
314    for (idx, token) in tokens.iter().enumerate() {
315        let token_hash = hash_token(token);
316        if let Some(hash) = hasher.roll(token_hash) {
317            // idx is the last token in the window, so the window starts at
318            // idx - window_size + 1 (saturating_sub is purely defensive here)
319            let start_index = idx.saturating_sub(window_size - 1);
320            hashes.push((hash, start_index));
321        }
322    }
323
324    hashes
325}
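
// Added illustrative sketch (ours): `compute_rolling_hashes` yields one (hash, start_index)
// pair per full window, and repeated windows repeat their hash.
#[cfg(test)]
mod rolling_hashes_example {
    use super::*;

    #[test]
    fn repeated_windows_repeat_hashes() {
        // Token pattern ID ( ) ID ( ): the size-3 windows starting at 0 and 3 are identical.
        let tokens = vec![
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Punctuation(")".to_string()),
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Punctuation(")".to_string()),
        ];
        let hashes = compute_rolling_hashes(&tokens, 3);
        assert_eq!(hashes.len(), 4); // 6 tokens, window 3 => 4 windows
        assert_eq!(hashes[0].1, 0);
        assert_eq!(hashes[3].1, 3);
        assert_eq!(hashes[0].0, hashes[3].0); // identical windows, identical hashes
    }
}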
326
327/// Computes a hash value for a single token
328fn hash_token(token: &Token) -> u64 {
329    use std::collections::hash_map::DefaultHasher;
330    use std::hash::{Hash, Hasher};
331
332    let mut hasher = DefaultHasher::new();
333    token.as_hash_string().hash(&mut hasher);
334    hasher.finish()
335}
336
337/// Represents a detected duplicate code block
338#[derive(Debug, Clone)]
339pub struct CloneMatch {
340    pub source_start: usize,
341    pub target_start: usize,
342    /// Length of the matched region in the source sequence
343    pub length: usize,
344    /// Length of the matched region in the target sequence (may differ for Type-3)
345    pub target_length: usize,
346    /// Token-level similarity (0.0-1.0). 1.0 for Type-1/2, calculated for Type-3
347    pub similarity: f64,
348}
349
350/// Detects duplicates using rolling hash with greedy extension
351///
352/// This implements the Hash-and-Extend strategy:
353/// 1. Hash each `window_size`-token window (Rabin-Karp) to find candidate matches
354/// 2. Verify the match to handle hash collisions
355/// 3. Greedily extend the match beyond the initial window
356/// 4. Skip ahead to avoid reporting overlapping duplicates
357///
358/// # Arguments
359/// * `tokens` - The token sequence to analyze
360/// * `window_size` - Size of the rolling window (default: 50)
361///
362/// # Returns
363/// * `Vec<CloneMatch>` - List of detected clones with variable lengths
364pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
365    use std::collections::HashMap;
366
367    if tokens.len() < window_size {
368        return Vec::new();
369    }
370
371    let mut matches = Vec::new();
372    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
373    let mut i = 0;
374
375    // Build rolling hashes and detect matches with extension
376    while i <= tokens.len().saturating_sub(window_size) {
377        // 1. Compute hash for current window
378        let current_hash = compute_window_hash(&tokens[i..i + window_size]);
379
380        // 2. Check if we've seen this hash before
381        if let Some(prev_indices) = hash_map.get(&current_hash) {
382            // Try to match with each previous occurrence
383            for &prev_index in prev_indices.iter() {
384                // Skip if this would be a self-match or overlap
385                if prev_index >= i {
386                    continue;
387                }
388
389                // 3. Verify exact match (handle hash collisions)
390                if verify_window_match(tokens, prev_index, i, window_size) {
391                    // 4. GREEDY EXTENSION: Expand beyond the initial window
392                    let mut extension = 0;
393                    while (i + window_size + extension < tokens.len())
394                        && (prev_index + window_size + extension < i)
395                        && (tokens[prev_index + window_size + extension]
396                            == tokens[i + window_size + extension])
397                    {
398                        extension += 1;
399                    }
400
401                    let total_length = window_size + extension;
402
403                    // Record the full match
404                    matches.push(CloneMatch {
405                        source_start: prev_index,
406                        target_start: i,
407                        length: total_length,
408                        target_length: total_length,
409                        similarity: 1.0, // Exact match
410                    });
411
412                    // 5. Skip ahead to avoid reporting overlapping subsets
413                    i += extension.max(1);
414                    break; // Found a match, move to next position
415                }
416            }
417        }
418
419        // Store this position for future comparisons
420        hash_map.entry(current_hash).or_default().push(i);
421        i += 1;
422    }
423
424    matches
425}
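
// Added illustrative sketch (ours): hash-and-extend on a stream containing the repeated
// statement shape `let <id> = <num>;`. The 4-token window is an arbitrary example size;
// the match is then extended by one token to cover the trailing semicolon.
#[cfg(test)]
mod hash_and_extend_example {
    use super::*;

    #[test]
    fn finds_and_extends_a_repeated_statement() {
        let tokens = normalize("let x = 1;\nlet y = 2;");
        let matches = detect_duplicates_with_extension(&tokens, 4);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].source_start, 0);
        assert_eq!(matches[0].target_start, 5);
        assert_eq!(matches[0].length, 5); // 4-token window extended by 1 token
    }
}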
426
427/// Computes hash for a specific token window
428///
429/// Uses Rabin-Karp polynomial rolling hash with:
430/// - BASE = 257 (prime > 256 to minimize collisions for all byte values)
431/// - MODULUS = 1e9+7 (large prime commonly used in hashing algorithms)
432pub fn compute_window_hash(window: &[Token]) -> u64 {
433    /// Prime base for polynomial rolling hash (chosen to be > 256)
434    const BASE: u64 = 257;
435    /// Large prime modulus to reduce hash collisions (1e9+7)
436    const MODULUS: u64 = 1_000_000_007;
437
438    let mut hash: u64 = 0;
439    for token in window {
440        let token_hash = hash_token(token);
441        // Use u128 to prevent overflow before modulo operation
442        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
443        hash = wide_hash as u64;
444    }
445    hash
446}
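
// Added illustrative sketch (ours): for a two-token window the polynomial above reduces to
// ((h0 % M) * 257 + h1) % M, which we can check directly against `compute_window_hash`.
#[cfg(test)]
mod window_hash_example {
    use super::*;

    #[test]
    fn matches_the_polynomial_for_two_tokens() {
        const M: u128 = 1_000_000_007;
        let window = [Token::Identifier, Token::NumberLiteral];
        let h0 = hash_token(&window[0]) as u128;
        let h1 = hash_token(&window[1]) as u128;
        let expected = (((h0 % M) * 257 + h1) % M) as u64;
        assert_eq!(compute_window_hash(&window), expected);
    }
}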
447
448/// Verifies that two token windows within the same slice are exactly identical
449fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
450    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
451        return false;
452    }
453    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
454}
455
456/// Verifies that two token windows from different slices are exactly identical
457///
458/// Used for cross-file/cross-function duplicate detection.
459pub fn verify_cross_window_match(
460    tokens1: &[Token],
461    tokens2: &[Token],
462    idx1: usize,
463    idx2: usize,
464    len: usize,
465) -> bool {
466    if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
467        return false;
468    }
469    tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
470}
471
472/// Extends a token match greedily beyond the initial window size
473///
474/// Given two token sequences and starting positions with a known matching window,
475/// extends the match as far as possible while tokens continue to match.
476///
477/// # Arguments
478/// * `tokens1` - First token sequence
479/// * `tokens2` - Second token sequence
480/// * `pos1` - Starting position in tokens1
481/// * `pos2` - Starting position in tokens2
482/// * `initial_len` - Length of the initial matching window
483///
484/// # Returns
485/// * Number of additional tokens beyond `initial_len` that also match
486pub fn extend_match(
487    tokens1: &[Token],
488    tokens2: &[Token],
489    pos1: usize,
490    pos2: usize,
491    initial_len: usize,
492) -> usize {
493    let mut extension = 0;
494    while (pos1 + initial_len + extension < tokens1.len())
495        && (pos2 + initial_len + extension < tokens2.len())
496        && (tokens1[pos1 + initial_len + extension] == tokens2[pos2 + initial_len + extension])
497    {
498        extension += 1;
499    }
500    extension
501}
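
// Added illustrative sketch (ours): given a known 2-token seed match, `extend_match`
// reports how many further tokens agree.
#[cfg(test)]
mod extend_match_example {
    use super::*;

    #[test]
    fn extends_past_the_seed_window() {
        let a = normalize("x + y * z");
        let b = normalize("p + q * r");
        // The seed covers `ID +`; the extension covers the remaining `ID * ID`.
        assert_eq!(extend_match(&a, &b, 0, 0, 2), 3);
    }
}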
502
503/// Returns a set of keywords for all supported languages
504fn get_keyword_set() -> &'static [&'static str] {
505    &[
506        // Rust keywords
507        "as",
508        "break",
509        "const",
510        "continue",
511        "crate",
512        "else",
513        "enum",
514        "extern",
515        "false",
516        "fn",
517        "for",
518        "if",
519        "impl",
520        "in",
521        "let",
522        "loop",
523        "match",
524        "mod",
525        "move",
526        "mut",
527        "pub",
528        "ref",
529        "return",
530        "self",
531        "Self",
532        "static",
533        "struct",
534        "super",
535        "trait",
536        "true",
537        "type",
538        "unsafe",
539        "use",
540        "where",
541        "while",
542        "async",
543        "await",
544        "dyn",
545        // Python keywords
546        "and",
547        "assert",
548        "class",
549        "def",
550        "del",
551        "elif",
552        "except",
553        "finally",
554        "from",
555        "global",
556        "import",
557        "is",
558        "lambda",
559        "nonlocal",
560        "not",
561        "or",
562        "pass",
563        "raise",
564        "try",
565        "with",
566        "yield",
567        // JavaScript keywords
568        "await",
569        "case",
570        "catch",
571        "class",
572        "const",
573        "continue",
574        "debugger",
575        "default",
576        "delete",
577        "do",
578        "else",
579        "export",
580        "extends",
581        "finally",
582        "for",
583        "function",
584        "if",
585        "import",
586        "in",
587        "instanceof",
588        "let",
589        "new",
590        "return",
591        "super",
592        "switch",
593        "this",
594        "throw",
595        "try",
596        "typeof",
597        "var",
598        "void",
599        "while",
600        "with",
601        "yield",
602    ]
603}
604
605/// Checks if a string is an operator
606fn is_operator(s: &str) -> bool {
607    matches!(
608        s,
609        "+" | "-"
610            | "*"
611            | "/"
612            | "%"
613            | "="
614            | "=="
615            | "!="
616            | "<"
617            | ">"
618            | "<="
619            | ">="
620            | "&&"
621            | "||"
622            | "!"
623            | "&"
624            | "|"
625            | "^"
626            | "<<"
627            | ">>"
628            | "+="
629            | "-="
630            | "*="
631            | "/="
632            | "=>"
633            | "->"
634            | "::"
635            | "."
636    )
637}
638
639/// Checks if a character is punctuation
640fn is_punctuation(ch: char) -> bool {
641    matches!(
642        ch,
643        '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
644    )
645}
646
647/// Computes token-level similarity between two token sequences using edit distance
648///
649/// Returns a similarity score between 0.0 and 1.0, where:
650/// - 1.0 = identical sequences
651/// - 0.0 = completely different
652///
653/// Uses normalized Levenshtein distance:
654/// similarity = 1 - (edit_distance / max_length)
655///
656/// # Arguments
657/// * `tokens1` - First token sequence
658/// * `tokens2` - Second token sequence
659///
660/// # Returns
661/// * `f64` - Similarity score (0.0-1.0)
662pub fn compute_token_similarity(tokens1: &[Token], tokens2: &[Token]) -> f64 {
663    if tokens1.is_empty() && tokens2.is_empty() {
664        return 1.0;
665    }
666    if tokens1.is_empty() || tokens2.is_empty() {
667        return 0.0;
668    }
669
670    // Compute edit distance using custom token comparison
671    let distance = compute_token_edit_distance(tokens1, tokens2);
672    let max_len = tokens1.len().max(tokens2.len());
673
674    // Normalize to similarity score
675    let similarity = 1.0 - (distance as f64 / max_len as f64);
676    similarity.clamp(0.0, 1.0)
677}
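
// Added illustrative sketch (ours): one differing token out of four gives a similarity of
// 1 - 1/4 = 0.75 under the normalization described above.
#[cfg(test)]
mod token_similarity_example {
    use super::*;

    #[test]
    fn one_substitution_in_four_tokens() {
        let a = normalize("x + y;");
        let b = normalize("x - y;");
        assert!((compute_token_similarity(&a, &b) - 0.75).abs() < 1e-9);
    }
}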
678
679/// Computes Levenshtein edit distance between two token sequences
680///
681/// Uses dynamic programming to calculate the minimum number of
682/// insertions, deletions, or substitutions needed to transform
683/// tokens1 into tokens2.
684///
685/// # Arguments
686/// * `tokens1` - First token sequence
687/// * `tokens2` - Second token sequence
688///
689/// # Returns
690/// * `usize` - Edit distance
691pub fn compute_token_edit_distance(tokens1: &[Token], tokens2: &[Token]) -> usize {
692    let len1 = tokens1.len();
693    let len2 = tokens2.len();
694
695    if len1 == 0 {
696        return len2;
697    }
698    if len2 == 0 {
699        return len1;
700    }
701
702    // Create DP table
703    let mut prev_row: Vec<usize> = (0..=len2).collect();
704    let mut curr_row: Vec<usize> = vec![0; len2 + 1];
705
706    for i in 1..=len1 {
707        curr_row[0] = i;
708
709        for j in 1..=len2 {
710            let cost = if tokens1[i - 1] == tokens2[j - 1] {
711                0
712            } else {
713                1
714            };
715
716            curr_row[j] = (prev_row[j - 1] + cost) // substitution
717                .min(prev_row[j] + 1) // deletion
718                .min(curr_row[j - 1] + 1); // insertion
719        }
720
721        std::mem::swap(&mut prev_row, &mut curr_row);
722    }
723
724    prev_row[len2]
725}
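
// Added illustrative sketch (ours): the distance counts insertions as well as substitutions,
// so appending `+ 1` to an expression costs two edits (two inserted tokens).
#[cfg(test)]
mod edit_distance_example {
    use super::*;

    #[test]
    fn counts_inserted_tokens() {
        let a = normalize("return x;");
        let b = normalize("return x + 1;");
        assert_eq!(compute_token_edit_distance(&a, &b), 2);
    }
}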
726
727/// Explores different window lengths to find the best Type-3 match
728///
729/// Tries extending both windows by small amounts to find the best similarity score.
730/// This accounts for insertions/deletions that make windows different lengths.
731///
732/// # Returns
733/// * `Option<(source_len, target_len, similarity)>` - Best match found, or None
734#[allow(clippy::too_many_arguments)]
735fn explore_length_variants(
736    tokens1: &[Token],
737    tokens2: &[Token],
738    i: usize,
739    j: usize,
740    window_size: usize,
741    base_similarity: f64,
742    length_delta_limit: usize,
743    tolerance: f64,
744) -> Option<(usize, usize, f64)> {
745    let max_extra1 = length_delta_limit.min(tokens1.len().saturating_sub(i + window_size));
746    let max_extra2 = length_delta_limit.min(tokens2.len().saturating_sub(j + window_size));
747
748    let mut best_match: Option<(usize, usize, f64)> = None;
749
750    for extra1 in 0..=max_extra1 {
751        let len1 = window_size + extra1;
752        let window1 = &tokens1[i..i + len1];
753
754        for extra2 in 0..=max_extra2 {
755            let len2 = window_size + extra2;
756
757            // Skip if length difference exceeds limit
758            if len1.abs_diff(len2) > length_delta_limit {
759                continue;
760            }
761
762            // Early rejection: minimum edits needed exceeds tolerance
763            let max_len = len1.max(len2);
764            let min_distance = len1.abs_diff(len2);
765            let max_allowed_distance = ((1.0 - tolerance) * max_len as f64).ceil() as usize;
766
767            if min_distance > max_allowed_distance {
768                continue;
769            }
770
771            // Compute similarity (reuse base if same window)
772            let candidate_similarity = if extra1 == 0 && extra2 == 0 {
773                base_similarity
774            } else {
775                compute_token_similarity(window1, &tokens2[j..j + len2])
776            };
777
778            if candidate_similarity >= tolerance {
779                best_match = match best_match {
780                    None => Some((len1, len2, candidate_similarity)),
781                    Some((best_len1, best_len2, best_sim)) => {
782                        let better_similarity = candidate_similarity > best_sim + f64::EPSILON;
783                        let better_coverage =
784                            !better_similarity && len1.max(len2) > best_len1.max(best_len2);
785
786                        if better_similarity || better_coverage {
787                            Some((len1, len2, candidate_similarity))
788                        } else {
789                            best_match
790                        }
791                    }
792                };
793            }
794        }
795    }
796
797    best_match
798}
799
800/// Detects Type-3 clones (gap-tolerant) between two token sequences
801///
802/// Uses a sliding window approach with edit distance calculation:
803/// 1. Find candidate regions where tokens partially match
804/// 2. Calculate edit distance for each candidate pair
805/// 3. Accept matches above the tolerance threshold
806///
807/// # Arguments
808/// * `tokens1` - First token sequence
809/// * `tokens2` - Second token sequence
810/// * `window_size` - Minimum block size to consider
811/// * `tolerance` - Minimum similarity threshold (0.0-1.0)
812///
813/// # Returns
814/// * `Vec<CloneMatch>` - List of Type-3 clone matches with similarity scores
815pub fn detect_type3_clones(
816    tokens1: &[Token],
817    tokens2: &[Token],
818    window_size: usize,
819    tolerance: f64,
820) -> Vec<CloneMatch> {
821    let mut matches = Vec::new();
822
823    // Performance optimization: Only check sequences within ±20% length difference
824    let len_ratio = tokens1.len() as f64 / tokens2.len().max(1) as f64;
825    if !(0.8..=1.2).contains(&len_ratio) {
826        return matches;
827    }
828    if tokens1.len() < window_size || tokens2.len() < window_size || window_size == 0 {
829        return matches;
830    }
831
832    // Allow limited length drift so edit distance can align insertions/deletions.
833    // Bounded by tolerance (minimum edits needed) and capped to avoid quadratic blowup.
834    let base_delta = ((1.0 - tolerance) * window_size as f64).ceil() as usize;
835    let length_delta_limit = (base_delta + 2).max(1).min(window_size).min(20);
836    // Explore only when the base windows are at least somewhat similar to avoid exhaustive search
837    let similarity_floor = (tolerance - 0.3).max(0.0);
838
839    // Sliding window comparison
840    let mut i = 0;
841    while i + window_size <= tokens1.len() {
842        let mut j = 0;
843        while j + window_size <= tokens2.len() {
844            // Calculate base similarity for the minimum windows
845            let window1 = &tokens1[i..i + window_size];
846            let window2 = &tokens2[j..j + window_size];
847            let similarity = compute_token_similarity(window1, window2);
848
849            if similarity < similarity_floor {
850                j += 1;
851                continue;
852            }
853
854            // Explore length variants to find best match
855            let best_match = explore_length_variants(
856                tokens1,
857                tokens2,
858                i,
859                j,
860                window_size,
861                similarity,
862                length_delta_limit,
863                tolerance,
864            );
865
866            if let Some((source_len, target_len, final_similarity)) = best_match {
867                matches.push(CloneMatch {
868                    source_start: i,
869                    target_start: j,
870                    length: source_len,
871                    target_length: target_len,
872                    similarity: final_similarity,
873                });
874
875                // Skip ahead on the target side proportionally to the match size
876                let skip = target_len.saturating_sub(window_size).max(1);
877                j += skip;
878            } else {
879                j += 1;
880            }
881        }
882        i += 1;
883    }
884
885    matches
886}
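
// Added illustrative sketch (ours): a Type-3 near-miss where one literal was swapped for a
// string. The window size (12) and tolerance (0.85) are arbitrary example values.
#[cfg(test)]
mod type3_clone_example {
    use super::*;

    #[test]
    fn finds_a_near_miss_clone() {
        let a = normalize("let a = 1; let b = 2; let c = 3;");
        let b = normalize("let a = 1; let b = \"two\"; let c = 3;");
        let matches = detect_type3_clones(&a, &b, 12, 0.85);
        assert!(!matches.is_empty());
        // The leading match starts at the beginning of both sequences and is close to,
        // but not exactly, an identical match because of the swapped literal.
        assert_eq!(matches[0].source_start, 0);
        assert_eq!(matches[0].target_start, 0);
        assert!(matches[0].similarity >= 0.85 && matches[0].similarity < 1.0);
    }
}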
887
888#[cfg(test)]
889mod tests {
890    use super::*;
891
892    #[test]
893    fn test_normalize_rust_code() {
894        let code = r#"
895        fn add(x: i32, y: i32) -> i32 {
896            x + y
897        }
898        "#;
899
900        let tokens = normalize(code);
901        assert!(!tokens.is_empty());
902
903        // Check that 'fn' is a keyword
904        assert!(tokens
905            .iter()
906            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));
907
908        // Check that identifiers are normalized
909        assert!(tokens.contains(&Token::Identifier));
910    }
911
912    #[test]
913    fn test_normalize_python_code() {
914        let code = r#"
915        def greet(name):
916            return f"Hello, {name}!"
917        "#;
918
919        let tokens = normalize(code);
920        assert!(!tokens.is_empty());
921
922        // Check that 'def' and 'return' are keywords
923        assert!(tokens
924            .iter()
925            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
926        assert!(tokens
927            .iter()
928            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
929
930        // Check that string is normalized
931        assert!(tokens.contains(&Token::StringLiteral));
932    }
933
934    #[test]
935    fn test_normalize_javascript_code() {
936        let code = r#"
937        function multiply(a, b) {
938            return a * b;
939        }
940        "#;
941
942        let tokens = normalize(code);
943        assert!(!tokens.is_empty());
944
945        // Check that 'function' and 'return' are keywords
946        assert!(tokens
947            .iter()
948            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
949        assert!(tokens
950            .iter()
951            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
952    }
953
954    #[test]
955    fn test_normalize_ignores_comments() {
956        let code = r#"
957        // This is a comment
958        fn test() {
959            /* Multi-line
960               comment */
961            let x = 5; // inline comment
962        }
963        "#;
964
965        let tokens = normalize(code);
966
967        // Comments must not contribute tokens: the stream should equal that of the
968        // same code with all comments removed.
969        let without_comments = r#"
970        fn test() {
971            let x = 5;
972        }
973        "#;
974        assert_eq!(tokens, normalize(without_comments));
975    }
976
977    #[test]
978    fn test_rolling_hash_creation() {
979        let hasher = RollingHash::new(50);
980        assert_eq!(hasher.window_size(), 50);
981        assert_eq!(hasher.current_hash(), None);
982    }
983
984    #[test]
985    fn test_rolling_hash_basic() {
986        let mut hasher = RollingHash::new(3);
987
988        // Add tokens one by one
989        assert_eq!(hasher.roll(1), None); // Window not full
990        assert_eq!(hasher.roll(2), None); // Window not full
991
992        let hash1 = hasher.roll(3); // Window full
993        assert!(hash1.is_some());
994
995        let hash2 = hasher.roll(4); // Window rolls
996        assert!(hash2.is_some());
997
998        // Hashes should be different
999        assert_ne!(hash1, hash2);
1000    }
1001
1002    #[test]
1003    fn test_compute_rolling_hashes() {
1004        let tokens = vec![
1005            Token::Keyword("fn".to_string()),
1006            Token::Identifier,
1007            Token::Punctuation("(".to_string()),
1008            Token::Identifier,
1009            Token::Punctuation(")".to_string()),
1010        ];
1011
1012        let hashes = compute_rolling_hashes(&tokens, 3);
1013        assert_eq!(hashes.len(), 3); // 5 tokens, window size 3 = 3 hashes
1014    }
1015
1016    #[test]
1017    fn test_hash_token_consistency() {
1018        let token1 = Token::Identifier;
1019        let token2 = Token::Identifier;
1020
1021        assert_eq!(hash_token(&token1), hash_token(&token2));
1022    }
1023
1024    #[test]
1025    fn test_token_as_hash_string() {
1026        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
1027        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
1028        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
1029        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
1030    }
1031}