dupe_core/
hashing.rs

1//! Hashing and normalization logic for duplicate code detection
2//!
3//! This module implements:
4//! - Token normalization (Type-2 clone detection)
5//! - Rabin-Karp rolling hash algorithm
6//! - Hash-based code similarity detection
7//! - Edit distance calculation (Type-3 clone detection)
8
9use serde::{Deserialize, Serialize};
10use std::num::Wrapping;
11
12/// Normalized token representation
13///
14/// Identifiers and literals are normalized to allow detection of
15/// structurally similar code (Type-2 clones).
16#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
17pub enum Token {
18    /// Keyword (if, for, while, fn, def, etc.)
19    Keyword(String),
20    /// Normalized identifier placeholder
21    Identifier,
22    /// Normalized string literal placeholder
23    StringLiteral,
24    /// Normalized number literal placeholder
25    NumberLiteral,
26    /// Operator (+, -, *, /, etc.)
27    Operator(String),
28    /// Punctuation (parentheses, braces, semicolons)
29    Punctuation(String),
30}
31
32impl Token {
33    /// Returns a string representation for hashing
34    pub fn as_hash_string(&self) -> &str {
35        match self {
36            Token::Keyword(kw) => kw.as_str(),
37            Token::Identifier => "$$ID",
38            Token::StringLiteral => "$$STR",
39            Token::NumberLiteral => "$$NUM",
40            Token::Operator(op) => op.as_str(),
41            Token::Punctuation(p) => p.as_str(),
42        }
43    }
44}
45
46/// Normalizes source code into a token stream for duplicate detection
47///
48/// # Normalization Rules
49/// - Comments are ignored
50/// - Whitespace is ignored
51/// - Identifiers → `$$ID`
52/// - String literals → `$$STR`
53/// - Number literals → `$$NUM`
54/// - Keywords are preserved
55///
56/// # Arguments
57/// * `code` - The source code to normalize
58///
59/// # Returns
60/// * `Vec<Token>` - Normalized token stream
61pub fn normalize(code: &str) -> Vec<Token> {
62    let (tokens, _) = normalize_with_line_numbers(code);
63    tokens
64}
65
66/// Normalizes source code into tokens while tracking line offsets
67///
68/// Returns both the normalized tokens and the zero-based line offset (relative
69/// to the start of `code`) where each token begins.
70pub fn normalize_with_line_numbers(code: &str) -> (Vec<Token>, Vec<usize>) {
71    let keywords = get_keyword_set();
72    // Pre-allocate capacity based on heuristic: ~1 token per 3 characters
73    let estimated_tokens = code.len() / 3;
74    let mut tokens = Vec::with_capacity(estimated_tokens);
75    let mut line_offsets = Vec::with_capacity(estimated_tokens);
76    let chars: Vec<char> = code.chars().collect();
77    let mut i = 0;
78    let mut line = 0;
79
80    while i < chars.len() {
81        let ch = chars[i];
82
83        // Skip whitespace
84        if ch.is_whitespace() {
85            if ch == '\n' {
86                line += 1;
87            }
88            i += 1;
89            continue;
90        }
91
92        // Skip single-line comments (// and #)
93        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
94            // Skip until end of line
95            while i < chars.len() && chars[i] != '\n' {
96                i += 1;
97            }
98            continue;
99        }
100
101        // Skip multi-line comments (/* */ and """ """)
102        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
103            i += 2;
104            while i + 1 < chars.len() {
105                if chars[i] == '\n' {
106                    line += 1;
107                }
108                if chars[i] == '*' && chars[i + 1] == '/' {
109                    i += 2;
110                    break;
111                }
112                i += 1;
113            }
114            continue;
115        }
116
117        // String literals
118        if ch == '"' || ch == '\'' {
119            let quote = ch;
120            let token_line = line;
121            i += 1;
122            // Skip until closing quote
123            while i < chars.len() {
124                if chars[i] == '\\' {
125                    i += 2; // Skip escaped character
126                    continue;
127                }
128                if chars[i] == '\n' {
129                    line += 1;
130                }
131                if chars[i] == quote {
132                    i += 1;
133                    break;
134                }
135                i += 1;
136            }
137            tokens.push(Token::StringLiteral);
138            line_offsets.push(token_line);
139            continue;
140        }
141
142        // Numbers
143        if ch.is_ascii_digit() {
144            let token_line = line;
145            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
146                i += 1;
147            }
148            tokens.push(Token::NumberLiteral);
149            line_offsets.push(token_line);
150            continue;
151        }
152
153        // Identifiers and keywords
154        if ch.is_alphabetic() || ch == '_' || ch == '$' {
155            let token_line = line;
156            let start = i;
157            while i < chars.len()
158                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
159            {
160                i += 1;
161            }
162            let word: String = chars[start..i].iter().collect();
163
164            if keywords.contains(&word.as_str()) {
165                tokens.push(Token::Keyword(word));
166            } else {
167                tokens.push(Token::Identifier);
168            }
169            line_offsets.push(token_line);
170            continue;
171        }
172
173        // Operators (multi-char)
174        if i + 1 < chars.len() {
175            let two_char: String = chars[i..i + 2].iter().collect();
176            if is_operator(&two_char) {
177                let token_line = line;
178                tokens.push(Token::Operator(two_char));
179                i += 2;
180                line_offsets.push(token_line);
181                continue;
182            }
183        }
184
185        // Single-char operators and punctuation
186        let single = ch.to_string();
187        if is_operator(&single) {
188            let token_line = line;
189            tokens.push(Token::Operator(single));
190            line_offsets.push(token_line);
191        } else if is_punctuation(ch) {
192            let token_line = line;
193            tokens.push(Token::Punctuation(single));
194            line_offsets.push(token_line);
195        }
196
197        i += 1;
198    }
199
200    (tokens, line_offsets)
201}
202
203/// Rabin-Karp rolling hash for efficient substring comparison
204///
205/// Uses a rolling window to compute hashes of code blocks.
206/// Allows for efficient duplicate detection in O(n) time.
207#[derive(Debug, Clone)]
208pub struct RollingHash {
209    /// Window size for rolling hash
210    window_size: usize,
211    /// Base for polynomial rolling hash
212    base: Wrapping<u64>,
213    /// Current hash value
214    hash: Wrapping<u64>,
215    /// Power of base for window size (base^window_size)
216    base_power: Wrapping<u64>,
217    /// Current window contents
218    window: Vec<u64>,
219}
220
221impl RollingHash {
222    /// Creates a new RollingHash with the specified window size
223    ///
224    /// # Arguments
225    /// * `window_size` - Number of tokens in the rolling window (default: 50)
226    pub fn new(window_size: usize) -> Self {
227        let base = Wrapping(257u64);
228        let mut base_power = Wrapping(1u64);
229
230        // Calculate base^window_size
231        for _ in 0..window_size {
232            base_power *= base;
233        }
234
235        Self {
236            window_size,
237            base,
238            hash: Wrapping(0),
239            base_power,
240            window: Vec::with_capacity(window_size),
241        }
242    }
243
244    /// Adds a token to the rolling hash window
245    ///
246    /// If the window is full, the oldest token is removed.
247    ///
248    /// # Arguments
249    /// * `token_hash` - Hash value of the token to add
250    ///
251    /// # Returns
252    /// * `Option<u64>` - The current hash if window is full, None otherwise
253    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
254        if self.window.len() < self.window_size {
255            // Window not full yet
256            self.window.push(token_hash);
257            self.hash = self.hash * self.base + Wrapping(token_hash);
258
259            if self.window.len() == self.window_size {
260                Some(self.hash.0)
261            } else {
262                None
263            }
264        } else {
265            // Window is full, remove oldest and add new
266            let old_token = self.window.remove(0);
267            self.window.push(token_hash);
268
269            // Remove contribution of old token
270            self.hash -= Wrapping(old_token) * self.base_power;
271            // Shift and add new token
272            self.hash = self.hash * self.base + Wrapping(token_hash);
273
274            Some(self.hash.0)
275        }
276    }
277
278    /// Resets the rolling hash to initial state
279    pub fn reset(&mut self) {
280        self.hash = Wrapping(0);
281        self.window.clear();
282    }
283
284    /// Returns the current hash value (if window is full)
285    pub fn current_hash(&self) -> Option<u64> {
286        if self.window.len() == self.window_size {
287            Some(self.hash.0)
288        } else {
289            None
290        }
291    }
292
293    /// Returns the current window size
294    pub fn window_size(&self) -> usize {
295        self.window_size
296    }
297}
298
299/// Computes rolling hashes for a token stream
300///
301/// # Arguments
302/// * `tokens` - Normalized token stream
303/// * `window_size` - Size of the rolling window
304///
305/// # Returns
306/// * `Vec<(u64, usize)>` - List of (hash, start_index) pairs
307pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
308    if tokens.len() < window_size {
309        return Vec::new();
310    }
311
312    let mut hasher = RollingHash::new(window_size);
313    let mut hashes = Vec::new();
314
315    for (idx, token) in tokens.iter().enumerate() {
316        let token_hash = hash_token(token);
317        if let Some(hash) = hasher.roll(token_hash) {
318            // idx is the last token in the window, so start_index is idx - window_size + 1
319            // but we need to ensure it doesn't underflow
320            let start_index = idx.saturating_sub(window_size - 1);
321            hashes.push((hash, start_index));
322        }
323    }
324
325    hashes
326}
327
328/// Computes a hash value for a single token
329fn hash_token(token: &Token) -> u64 {
330    use std::collections::hash_map::DefaultHasher;
331    use std::hash::{Hash, Hasher};
332
333    let mut hasher = DefaultHasher::new();
334    token.as_hash_string().hash(&mut hasher);
335    hasher.finish()
336}
337
338/// Represents a detected duplicate code block
339#[derive(Debug, Clone)]
340pub struct CloneMatch {
341    pub source_start: usize,
342    pub target_start: usize,
343    /// Length of the matched region in the source sequence
344    pub length: usize,
345    /// Length of the matched region in the target sequence (may differ for Type-3)
346    pub target_length: usize,
347    /// Token-level similarity (0.0-1.0). 1.0 for Type-1/2, calculated for Type-3
348    pub similarity: f64,
349}
350
351/// Detects duplicates using rolling hash with greedy extension
352///
353/// This implements the Hash-and-Extend strategy:
354/// 1. Use rolling hash to find candidate matches (50-token windows)
355/// 2. Verify the match to handle hash collisions
356/// 3. Greedily extend the match beyond the initial window
357/// 4. Skip ahead to avoid reporting overlapping duplicates
358///
359/// # Arguments
360/// * `tokens` - The token sequence to analyze
361/// * `window_size` - Size of the rolling window (default: 50)
362///
363/// # Returns
364/// * `Vec<CloneMatch>` - List of detected clones with variable lengths
365pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
366    use std::collections::HashMap;
367
368    if tokens.len() < window_size {
369        return Vec::new();
370    }
371
372    let mut matches = Vec::new();
373    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
374    let mut i = 0;
375
376    // Build rolling hashes and detect matches with extension
377    while i <= tokens.len().saturating_sub(window_size) {
378        // 1. Compute hash for current window
379        let current_hash = compute_window_hash(&tokens[i..i + window_size]);
380
381        // 2. Check if we've seen this hash before
382        if let Some(prev_indices) = hash_map.get(&current_hash) {
383            // Try to match with each previous occurrence
384            for &prev_index in prev_indices.iter() {
385                // Skip if this would be a self-match or overlap
386                if prev_index >= i {
387                    continue;
388                }
389
390                // 3. Verify exact match (handle hash collisions)
391                if verify_window_match(tokens, prev_index, i, window_size) {
392                    // 4. GREEDY EXTENSION: Expand beyond the initial window
393                    let mut extension = 0;
394                    while (i + window_size + extension < tokens.len())
395                        && (prev_index + window_size + extension < i)
396                        && (tokens[prev_index + window_size + extension]
397                            == tokens[i + window_size + extension])
398                    {
399                        extension += 1;
400                    }
401
402                    let total_length = window_size + extension;
403
404                    // Record the full match
405                    matches.push(CloneMatch {
406                        source_start: prev_index,
407                        target_start: i,
408                        length: total_length,
409                        target_length: total_length,
410                        similarity: 1.0, // Exact match
411                    });
412
413                    // 5. Skip ahead to avoid reporting overlapping subsets
414                    i += extension.max(1);
415                    break; // Found a match, move to next position
416                }
417            }
418        }
419
420        // Store this position for future comparisons
421        hash_map.entry(current_hash).or_default().push(i);
422        i += 1;
423    }
424
425    matches
426}
427
428/// Computes hash for a specific token window
429///
430/// Uses Rabin-Karp polynomial rolling hash with:
431/// - BASE = 257 (prime > 256 to minimize collisions for all byte values)
432/// - MODULUS = 1e9+7 (large prime commonly used in hashing algorithms)
433pub fn compute_window_hash(window: &[Token]) -> u64 {
434    /// Prime base for polynomial rolling hash (chosen to be > 256)
435    const BASE: u64 = 257;
436    /// Large prime modulus to reduce hash collisions (1e9+7)
437    const MODULUS: u64 = 1_000_000_007;
438
439    let mut hash: u64 = 0;
440    for token in window {
441        let token_hash = hash_token(token);
442        // Use u128 to prevent overflow before modulo operation
443        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
444        hash = wide_hash as u64;
445    }
446    hash
447}
448
449/// Verifies that two token windows within the same slice are exactly identical
450fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
451    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
452        return false;
453    }
454    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
455}
456
457/// Verifies that two token windows from different slices are exactly identical
458///
459/// Used for cross-file/cross-function duplicate detection.
460pub fn verify_cross_window_match(
461    tokens1: &[Token],
462    tokens2: &[Token],
463    idx1: usize,
464    idx2: usize,
465    len: usize,
466) -> bool {
467    if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
468        return false;
469    }
470    tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
471}
472
473/// Extends a token match greedily beyond the initial window size
474///
475/// Given two token sequences and starting positions with a known matching window,
476/// extends the match as far as possible while tokens continue to match.
477///
478/// # Arguments
479/// * `tokens1` - First token sequence
480/// * `tokens2` - Second token sequence
481/// * `pos1` - Starting position in tokens1
482/// * `pos2` - Starting position in tokens2
483/// * `initial_len` - Length of the initial matching window
484///
485/// # Returns
486/// * Number of additional tokens beyond `initial_len` that also match
487pub fn extend_match(
488    tokens1: &[Token],
489    tokens2: &[Token],
490    pos1: usize,
491    pos2: usize,
492    initial_len: usize,
493) -> usize {
494    let mut extension = 0;
495    while (pos1 + initial_len + extension < tokens1.len())
496        && (pos2 + initial_len + extension < tokens2.len())
497        && (tokens1[pos1 + initial_len + extension] == tokens2[pos2 + initial_len + extension])
498    {
499        extension += 1;
500    }
501    extension
502}
503
504/// Returns a set of keywords for all supported languages
505fn get_keyword_set() -> &'static [&'static str] {
506    &[
507        // Rust keywords
508        "as",
509        "break",
510        "const",
511        "continue",
512        "crate",
513        "else",
514        "enum",
515        "extern",
516        "false",
517        "fn",
518        "for",
519        "if",
520        "impl",
521        "in",
522        "let",
523        "loop",
524        "match",
525        "mod",
526        "move",
527        "mut",
528        "pub",
529        "ref",
530        "return",
531        "self",
532        "Self",
533        "static",
534        "struct",
535        "super",
536        "trait",
537        "true",
538        "type",
539        "unsafe",
540        "use",
541        "where",
542        "while",
543        "async",
544        "await",
545        "dyn",
546        // Python keywords
547        "and",
548        "assert",
549        "class",
550        "def",
551        "del",
552        "elif",
553        "except",
554        "finally",
555        "from",
556        "global",
557        "import",
558        "is",
559        "lambda",
560        "nonlocal",
561        "not",
562        "or",
563        "pass",
564        "raise",
565        "try",
566        "with",
567        "yield",
568        // JavaScript keywords
569        "await",
570        "case",
571        "catch",
572        "class",
573        "const",
574        "continue",
575        "debugger",
576        "default",
577        "delete",
578        "do",
579        "else",
580        "export",
581        "extends",
582        "finally",
583        "for",
584        "function",
585        "if",
586        "import",
587        "in",
588        "instanceof",
589        "let",
590        "new",
591        "return",
592        "super",
593        "switch",
594        "this",
595        "throw",
596        "try",
597        "typeof",
598        "var",
599        "void",
600        "while",
601        "with",
602        "yield",
603    ]
604}
605
606/// Checks if a string is an operator
607fn is_operator(s: &str) -> bool {
608    matches!(
609        s,
610        "+" | "-"
611            | "*"
612            | "/"
613            | "%"
614            | "="
615            | "=="
616            | "!="
617            | "<"
618            | ">"
619            | "<="
620            | ">="
621            | "&&"
622            | "||"
623            | "!"
624            | "&"
625            | "|"
626            | "^"
627            | "<<"
628            | ">>"
629            | "+="
630            | "-="
631            | "*="
632            | "/="
633            | "=>"
634            | "->"
635            | "::"
636            | "."
637    )
638}
639
640/// Checks if a character is punctuation
641fn is_punctuation(ch: char) -> bool {
642    matches!(
643        ch,
644        '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
645    )
646}
647
648/// Computes token-level similarity between two token sequences using edit distance
649///
650/// Returns a similarity score between 0.0 and 1.0, where:
651/// - 1.0 = identical sequences
652/// - 0.0 = completely different
653///
654/// Uses normalized Levenshtein distance:
655/// similarity = 1 - (edit_distance / max_length)
656///
657/// # Arguments
658/// * `tokens1` - First token sequence
659/// * `tokens2` - Second token sequence
660///
661/// # Returns
662/// * `f64` - Similarity score (0.0-1.0)
663pub fn compute_token_similarity(tokens1: &[Token], tokens2: &[Token]) -> f64 {
664    if tokens1.is_empty() && tokens2.is_empty() {
665        return 1.0;
666    }
667    if tokens1.is_empty() || tokens2.is_empty() {
668        return 0.0;
669    }
670
671    // Compute edit distance using custom token comparison
672    let distance = compute_token_edit_distance(tokens1, tokens2);
673    let max_len = tokens1.len().max(tokens2.len());
674
675    // Normalize to similarity score
676    let similarity = 1.0 - (distance as f64 / max_len as f64);
677    similarity.clamp(0.0, 1.0)
678}
679
680/// Computes Levenshtein edit distance between two token sequences
681///
682/// Uses dynamic programming to calculate the minimum number of
683/// insertions, deletions, or substitutions needed to transform
684/// tokens1 into tokens2.
685///
686/// # Arguments
687/// * `tokens1` - First token sequence
688/// * `tokens2` - Second token sequence
689///
690/// # Returns
691/// * `usize` - Edit distance
692pub fn compute_token_edit_distance(tokens1: &[Token], tokens2: &[Token]) -> usize {
693    let len1 = tokens1.len();
694    let len2 = tokens2.len();
695
696    if len1 == 0 {
697        return len2;
698    }
699    if len2 == 0 {
700        return len1;
701    }
702
703    // Create DP table
704    let mut prev_row: Vec<usize> = (0..=len2).collect();
705    let mut curr_row: Vec<usize> = vec![0; len2 + 1];
706
707    for i in 1..=len1 {
708        curr_row[0] = i;
709
710        for j in 1..=len2 {
711            let cost = if tokens1[i - 1] == tokens2[j - 1] {
712                0
713            } else {
714                1
715            };
716
717            curr_row[j] = (prev_row[j - 1] + cost) // substitution
718                .min(prev_row[j] + 1) // deletion
719                .min(curr_row[j - 1] + 1); // insertion
720        }
721
722        std::mem::swap(&mut prev_row, &mut curr_row);
723    }
724
725    prev_row[len2]
726}
727
728/// Explores different window lengths to find the best Type-3 match
729///
730/// Tries extending both windows by small amounts to find the best similarity score.
731/// This accounts for insertions/deletions that make windows different lengths.
732///
733/// # Returns
734/// * `Option<(source_len, target_len, similarity)>` - Best match found, or None
735#[allow(clippy::too_many_arguments)]
736fn explore_length_variants(
737    tokens1: &[Token],
738    tokens2: &[Token],
739    i: usize,
740    j: usize,
741    window_size: usize,
742    base_similarity: f64,
743    length_delta_limit: usize,
744    tolerance: f64,
745) -> Option<(usize, usize, f64)> {
746    let max_extra1 = length_delta_limit.min(tokens1.len().saturating_sub(i + window_size));
747    let max_extra2 = length_delta_limit.min(tokens2.len().saturating_sub(j + window_size));
748
749    let mut best_match: Option<(usize, usize, f64)> = None;
750
751    for extra1 in 0..=max_extra1 {
752        let len1 = window_size + extra1;
753        let window1 = &tokens1[i..i + len1];
754
755        for extra2 in 0..=max_extra2 {
756            let len2 = window_size + extra2;
757
758            // Skip if length difference exceeds limit
759            if len1.abs_diff(len2) > length_delta_limit {
760                continue;
761            }
762
763            // Early rejection: minimum edits needed exceeds tolerance
764            let max_len = len1.max(len2);
765            let min_distance = len1.abs_diff(len2);
766            let max_allowed_distance = ((1.0 - tolerance) * max_len as f64).ceil() as usize;
767
768            if min_distance > max_allowed_distance {
769                continue;
770            }
771
772            // Compute similarity (reuse base if same window)
773            let candidate_similarity = if extra1 == 0 && extra2 == 0 {
774                base_similarity
775            } else {
776                compute_token_similarity(window1, &tokens2[j..j + len2])
777            };
778
779            if candidate_similarity >= tolerance {
780                best_match = match best_match {
781                    None => Some((len1, len2, candidate_similarity)),
782                    Some((best_len1, best_len2, best_sim)) => {
783                        let better_similarity = candidate_similarity > best_sim + f64::EPSILON;
784                        let better_coverage =
785                            !better_similarity && len1.max(len2) > best_len1.max(best_len2);
786
787                        if better_similarity || better_coverage {
788                            Some((len1, len2, candidate_similarity))
789                        } else {
790                            best_match
791                        }
792                    }
793                };
794            }
795        }
796    }
797
798    best_match
799}
800
801/// Detects Type-3 clones (gap-tolerant) between two token sequences
802///
803/// Uses a sliding window approach with edit distance calculation:
804/// 1. Find candidate regions where tokens partially match
805/// 2. Calculate edit distance for each candidate pair
806/// 3. Accept matches above the tolerance threshold
807///
808/// # Arguments
809/// * `tokens1` - First token sequence
810/// * `tokens2` - Second token sequence
811/// * `window_size` - Minimum block size to consider
812/// * `tolerance` - Minimum similarity threshold (0.0-1.0)
813///
814/// # Returns
815/// * `Vec<CloneMatch>` - List of Type-3 clone matches with similarity scores
816pub fn detect_type3_clones(
817    tokens1: &[Token],
818    tokens2: &[Token],
819    window_size: usize,
820    tolerance: f64,
821) -> Vec<CloneMatch> {
822    let mut matches = Vec::new();
823
824    // Performance optimization: Only check sequences within ±20% length difference
825    let len_ratio = tokens1.len() as f64 / tokens2.len().max(1) as f64;
826    if !(0.8..=1.2).contains(&len_ratio) {
827        return matches;
828    }
829    if tokens1.len() < window_size || tokens2.len() < window_size || window_size == 0 {
830        return matches;
831    }
832
833    // Allow limited length drift so edit distance can align insertions/deletions.
834    // Bounded by tolerance (minimum edits needed) and capped to avoid quadratic blowup.
835    let base_delta = ((1.0 - tolerance) * window_size as f64).ceil() as usize;
836    let length_delta_limit = (base_delta + 2).max(1).min(window_size).min(20);
837    // Explore only when the base windows are at least somewhat similar to avoid exhaustive search
838    let similarity_floor = (tolerance - 0.3).max(0.0);
839
840    // Sliding window comparison
841    let mut i = 0;
842    while i + window_size <= tokens1.len() {
843        let mut j = 0;
844        while j + window_size <= tokens2.len() {
845            // Calculate base similarity for the minimum windows
846            let window1 = &tokens1[i..i + window_size];
847            let window2 = &tokens2[j..j + window_size];
848            let similarity = compute_token_similarity(window1, window2);
849
850            if similarity < similarity_floor {
851                j += 1;
852                continue;
853            }
854
855            // Explore length variants to find best match
856            let best_match = explore_length_variants(
857                tokens1,
858                tokens2,
859                i,
860                j,
861                window_size,
862                similarity,
863                length_delta_limit,
864                tolerance,
865            );
866
867            if let Some((source_len, target_len, final_similarity)) = best_match {
868                matches.push(CloneMatch {
869                    source_start: i,
870                    target_start: j,
871                    length: source_len,
872                    target_length: target_len,
873                    similarity: final_similarity,
874                });
875
876                // Skip ahead on the target side proportionally to the match size
877                let skip = target_len.saturating_sub(window_size).max(1);
878                j += skip;
879            } else {
880                j += 1;
881            }
882        }
883        i += 1;
884    }
885
886    matches
887}
888
889#[cfg(test)]
890mod tests {
891    use super::*;
892
893    #[test]
894    fn test_normalize_rust_code() {
895        let code = r#"
896        fn add(x: i32, y: i32) -> i32 {
897            x + y
898        }
899        "#;
900
901        let tokens = normalize(code);
902        assert!(!tokens.is_empty());
903
904        // Check that 'fn' is a keyword
905        assert!(tokens
906            .iter()
907            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));
908
909        // Check that identifiers are normalized
910        assert!(tokens.contains(&Token::Identifier));
911    }
912
913    #[test]
914    fn test_normalize_python_code() {
915        let code = r#"
916        def greet(name):
917            return f"Hello, {name}!"
918        "#;
919
920        let tokens = normalize(code);
921        assert!(!tokens.is_empty());
922
923        // Check that 'def' and 'return' are keywords
924        assert!(tokens
925            .iter()
926            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
927        assert!(tokens
928            .iter()
929            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
930
931        // Check that string is normalized
932        assert!(tokens.contains(&Token::StringLiteral));
933    }
934
935    #[test]
936    fn test_normalize_javascript_code() {
937        let code = r#"
938        function multiply(a, b) {
939            return a * b;
940        }
941        "#;
942
943        let tokens = normalize(code);
944        assert!(!tokens.is_empty());
945
946        // Check that 'function' and 'return' are keywords
947        assert!(tokens
948            .iter()
949            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
950        assert!(tokens
951            .iter()
952            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
953    }
954
955    #[test]
956    fn test_normalize_ignores_comments() {
957        let code = r#"
958        // This is a comment
959        fn test() {
960            /* Multi-line
961               comment */
962            let x = 5; // inline comment
963        }
964        "#;
965
966        let tokens = normalize(code);
967
968        // Should not contain comment text
969        for token in &tokens {
970            if let Token::Identifier = token {
971                // OK, identifier
972            } else if let Token::Keyword(_) = token {
973                // OK, keyword
974            }
975        }
976    }
977
978    #[test]
979    fn test_rolling_hash_creation() {
980        let hasher = RollingHash::new(50);
981        assert_eq!(hasher.window_size(), 50);
982        assert_eq!(hasher.current_hash(), None);
983    }
984
985    #[test]
986    fn test_rolling_hash_basic() {
987        let mut hasher = RollingHash::new(3);
988
989        // Add tokens one by one
990        assert_eq!(hasher.roll(1), None); // Window not full
991        assert_eq!(hasher.roll(2), None); // Window not full
992
993        let hash1 = hasher.roll(3); // Window full
994        assert!(hash1.is_some());
995
996        let hash2 = hasher.roll(4); // Window rolls
997        assert!(hash2.is_some());
998
999        // Hashes should be different
1000        assert_ne!(hash1, hash2);
1001    }
1002
1003    #[test]
1004    fn test_compute_rolling_hashes() {
1005        let tokens = vec![
1006            Token::Keyword("fn".to_string()),
1007            Token::Identifier,
1008            Token::Punctuation("(".to_string()),
1009            Token::Identifier,
1010            Token::Punctuation(")".to_string()),
1011        ];
1012
1013        let hashes = compute_rolling_hashes(&tokens, 3);
1014        assert_eq!(hashes.len(), 3); // 5 tokens, window size 3 = 3 hashes
1015    }
1016
1017    #[test]
1018    fn test_hash_token_consistency() {
1019        let token1 = Token::Identifier;
1020        let token2 = Token::Identifier;
1021
1022        assert_eq!(hash_token(&token1), hash_token(&token2));
1023    }
1024
1025    #[test]
1026    fn test_token_as_hash_string() {
1027        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
1028        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
1029        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
1030        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
1031    }
1032}
dupe_core/hashing.rs

dupe_core/
hashing.rs