dupe_core/
hashing.rs

1//! Hashing and normalization logic for duplicate code detection
2//!
3//! This module implements:
4//! - Token normalization (Type-2 clone detection)
5//! - Rabin-Karp rolling hash algorithm
6//! - Hash-based code similarity detection
7
8use std::num::Wrapping;
9
/// Normalized token representation
///
/// Identifiers and literals are normalized to placeholders so that two
/// snippets differing only in names or literal values produce identical
/// token streams (Type-2 clone detection).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// Keyword (if, for, while, fn, def, etc.) — kept verbatim
    Keyword(String),
    /// Normalized identifier placeholder (hashes as `$$ID`)
    Identifier,
    /// Normalized string literal placeholder (hashes as `$$STR`)
    StringLiteral,
    /// Normalized number literal placeholder (hashes as `$$NUM`)
    NumberLiteral,
    /// Operator (+, -, *, /, etc.) — kept verbatim
    Operator(String),
    /// Punctuation (parentheses, braces, semicolons) — kept verbatim
    Punctuation(String),
}
29
30impl Token {
31    /// Returns a string representation for hashing
32    pub fn as_hash_string(&self) -> &str {
33        match self {
34            Token::Keyword(kw) => kw.as_str(),
35            Token::Identifier => "$$ID",
36            Token::StringLiteral => "$$STR",
37            Token::NumberLiteral => "$$NUM",
38            Token::Operator(op) => op.as_str(),
39            Token::Punctuation(p) => p.as_str(),
40        }
41    }
42}
43
44/// Normalizes source code into a token stream for duplicate detection
45///
46/// # Normalization Rules
47/// - Comments are ignored
48/// - Whitespace is ignored
49/// - Identifiers → `$$ID`
50/// - String literals → `$$STR`
51/// - Number literals → `$$NUM`
52/// - Keywords are preserved
53///
54/// # Arguments
55/// * `code` - The source code to normalize
56///
57/// # Returns
58/// * `Vec<Token>` - Normalized token stream
59pub fn normalize(code: &str) -> Vec<Token> {
60    let keywords = get_keyword_set();
61    // Pre-allocate capacity based on heuristic: ~1 token per 3 characters
62    let estimated_tokens = code.len() / 3;
63    let mut tokens = Vec::with_capacity(estimated_tokens);
64    let chars: Vec<char> = code.chars().collect();
65    let mut i = 0;
66
67    while i < chars.len() {
68        let ch = chars[i];
69
70        // Skip whitespace
71        if ch.is_whitespace() {
72            i += 1;
73            continue;
74        }
75
76        // Skip single-line comments (// and #)
77        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
78            // Skip until end of line
79            while i < chars.len() && chars[i] != '\n' {
80                i += 1;
81            }
82            continue;
83        }
84
85        // Skip multi-line comments (/* */ and """ """)
86        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
87            i += 2;
88            while i + 1 < chars.len() {
89                if chars[i] == '*' && chars[i + 1] == '/' {
90                    i += 2;
91                    break;
92                }
93                i += 1;
94            }
95            continue;
96        }
97
98        // String literals
99        if ch == '"' || ch == '\'' {
100            let quote = ch;
101            i += 1;
102            // Skip until closing quote
103            while i < chars.len() {
104                if chars[i] == '\\' {
105                    i += 2; // Skip escaped character
106                    continue;
107                }
108                if chars[i] == quote {
109                    i += 1;
110                    break;
111                }
112                i += 1;
113            }
114            tokens.push(Token::StringLiteral);
115            continue;
116        }
117
118        // Numbers
119        if ch.is_ascii_digit() {
120            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
121                i += 1;
122            }
123            tokens.push(Token::NumberLiteral);
124            continue;
125        }
126
127        // Identifiers and keywords
128        if ch.is_alphabetic() || ch == '_' || ch == '$' {
129            let start = i;
130            while i < chars.len()
131                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
132            {
133                i += 1;
134            }
135            let word: String = chars[start..i].iter().collect();
136
137            if keywords.contains(&word.as_str()) {
138                tokens.push(Token::Keyword(word));
139            } else {
140                tokens.push(Token::Identifier);
141            }
142            continue;
143        }
144
145        // Operators (multi-char)
146        if i + 1 < chars.len() {
147            let two_char: String = chars[i..i + 2].iter().collect();
148            if is_operator(&two_char) {
149                tokens.push(Token::Operator(two_char));
150                i += 2;
151                continue;
152            }
153        }
154
155        // Single-char operators and punctuation
156        let single = ch.to_string();
157        if is_operator(&single) {
158            tokens.push(Token::Operator(single));
159        } else if is_punctuation(ch) {
160            tokens.push(Token::Punctuation(single));
161        }
162
163        i += 1;
164    }
165
166    tokens
167}
168
/// Rabin-Karp rolling hash for efficient substring comparison
///
/// Maintains a fixed-size window of token hashes and updates the
/// polynomial hash in O(1) per token, allowing duplicate detection
/// over a whole token stream in O(n) time.
#[derive(Debug, Clone)]
pub struct RollingHash {
    /// Number of tokens covered by one hash value
    window_size: usize,
    /// Base for the polynomial rolling hash
    base: Wrapping<u64>,
    /// Current hash value of the window contents
    hash: Wrapping<u64>,
    /// base^(window_size - 1): positional weight of the *oldest* token,
    /// used to remove its contribution when the window slides
    base_power: Wrapping<u64>,
    /// Ring buffer holding the token hashes currently in the window
    window: Vec<u64>,
    /// Index of the oldest entry in `window` once the buffer is full
    head: usize,
}

impl RollingHash {
    /// Creates a new RollingHash with the specified window size
    ///
    /// # Arguments
    /// * `window_size` - Number of tokens in the rolling window (default: 50)
    pub fn new(window_size: usize) -> Self {
        let base = Wrapping(257u64);

        // base^(window_size - 1). BUGFIX: the original computed
        // base^window_size, which made the eviction step subtract
        // old * base^(window_size + 1) after the shift, leaving residue from
        // evicted tokens — rolled hashes never matched fresh hashes of the
        // same window, causing false negatives.
        let mut base_power = Wrapping(1u64);
        for _ in 1..window_size {
            base_power *= base;
        }

        Self {
            window_size,
            base,
            hash: Wrapping(0),
            base_power,
            window: Vec::with_capacity(window_size),
            head: 0,
        }
    }

    /// Adds a token to the rolling hash window
    ///
    /// If the window is full, the oldest token is evicted in O(1) via the
    /// ring buffer (the original used `Vec::remove(0)`, O(n) per roll).
    ///
    /// # Arguments
    /// * `token_hash` - Hash value of the token to add
    ///
    /// # Returns
    /// * `Option<u64>` - The current hash if the window is full, None otherwise
    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
        // A zero-size window can never produce a hash (the original panicked
        // on `remove(0)` here); bail out gracefully instead.
        if self.window_size == 0 {
            return None;
        }

        if self.window.len() < self.window_size {
            // Still filling the initial window.
            self.window.push(token_hash);
            self.hash = self.hash * self.base + Wrapping(token_hash);

            if self.window.len() == self.window_size {
                Some(self.hash.0)
            } else {
                None
            }
        } else {
            // Window full: overwrite the oldest slot and advance the head.
            let oldest = self.window[self.head];
            self.window[self.head] = token_hash;
            self.head = (self.head + 1) % self.window_size;

            // Remove the oldest token's contribution, then shift and append:
            // hash' = (hash - oldest * base^(w-1)) * base + new
            self.hash = (self.hash - Wrapping(oldest) * self.base_power) * self.base
                + Wrapping(token_hash);

            Some(self.hash.0)
        }
    }

    /// Resets the rolling hash to its initial state
    pub fn reset(&mut self) {
        self.hash = Wrapping(0);
        self.window.clear();
        self.head = 0;
    }

    /// Returns the current hash value (if the window is full)
    pub fn current_hash(&self) -> Option<u64> {
        if self.window.len() == self.window_size {
            Some(self.hash.0)
        } else {
            None
        }
    }

    /// Returns the configured window size
    pub fn window_size(&self) -> usize {
        self.window_size
    }
}
264
265/// Computes rolling hashes for a token stream
266///
267/// # Arguments
268/// * `tokens` - Normalized token stream
269/// * `window_size` - Size of the rolling window
270///
271/// # Returns
272/// * `Vec<(u64, usize)>` - List of (hash, start_index) pairs
273pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
274    if tokens.len() < window_size {
275        return Vec::new();
276    }
277
278    let mut hasher = RollingHash::new(window_size);
279    let mut hashes = Vec::new();
280
281    for (idx, token) in tokens.iter().enumerate() {
282        let token_hash = hash_token(token);
283        if let Some(hash) = hasher.roll(token_hash) {
284            // idx is the last token in the window, so start_index is idx - window_size + 1
285            // but we need to ensure it doesn't underflow
286            let start_index = idx.saturating_sub(window_size - 1);
287            hashes.push((hash, start_index));
288        }
289    }
290
291    hashes
292}
293
294/// Computes a hash value for a single token
295fn hash_token(token: &Token) -> u64 {
296    use std::collections::hash_map::DefaultHasher;
297    use std::hash::{Hash, Hasher};
298
299    let mut hasher = DefaultHasher::new();
300    token.as_hash_string().hash(&mut hasher);
301    hasher.finish()
302}
303
/// Represents a detected duplicate code block
///
/// Indices refer to positions in the normalized token stream (not source
/// lines). `length` may exceed the detection window size after greedy
/// extension.
#[derive(Debug, Clone)]
pub struct CloneMatch {
    // Token index where the earlier (source) copy starts
    pub source_start: usize,
    // Token index where the later (target) copy starts
    pub target_start: usize,
    // Number of consecutive matching tokens
    pub length: usize,
}
311
312/// Detects duplicates using rolling hash with greedy extension
313///
314/// This implements the Hash-and-Extend strategy:
315/// 1. Use rolling hash to find candidate matches (50-token windows)
316/// 2. Verify the match to handle hash collisions
317/// 3. Greedily extend the match beyond the initial window
318/// 4. Skip ahead to avoid reporting overlapping duplicates
319///
320/// # Arguments
321/// * `tokens` - The token sequence to analyze
322/// * `window_size` - Size of the rolling window (default: 50)
323///
324/// # Returns
325/// * `Vec<CloneMatch>` - List of detected clones with variable lengths
326pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
327    use std::collections::HashMap;
328
329    if tokens.len() < window_size {
330        return Vec::new();
331    }
332
333    let mut matches = Vec::new();
334    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
335    let mut i = 0;
336
337    // Build rolling hashes and detect matches with extension
338    while i <= tokens.len().saturating_sub(window_size) {
339        // 1. Compute hash for current window
340        let current_hash = compute_window_hash(&tokens[i..i + window_size]);
341
342        // 2. Check if we've seen this hash before
343        if let Some(prev_indices) = hash_map.get(&current_hash) {
344            // Try to match with each previous occurrence
345            for &prev_index in prev_indices.iter() {
346                // Skip if this would be a self-match or overlap
347                if prev_index >= i {
348                    continue;
349                }
350
351                // 3. Verify exact match (handle hash collisions)
352                if verify_window_match(tokens, prev_index, i, window_size) {
353                    // 4. GREEDY EXTENSION: Expand beyond the initial window
354                    let mut extension = 0;
355                    while (i + window_size + extension < tokens.len())
356                        && (prev_index + window_size + extension < i)
357                        && (tokens[prev_index + window_size + extension]
358                            == tokens[i + window_size + extension])
359                    {
360                        extension += 1;
361                    }
362
363                    let total_length = window_size + extension;
364
365                    // Record the full match
366                    matches.push(CloneMatch {
367                        source_start: prev_index,
368                        target_start: i,
369                        length: total_length,
370                    });
371
372                    // 5. Skip ahead to avoid reporting overlapping subsets
373                    i += extension.max(1);
374                    break; // Found a match, move to next position
375                }
376            }
377        }
378
379        // Store this position for future comparisons
380        hash_map.entry(current_hash).or_default().push(i);
381        i += 1;
382    }
383
384    matches
385}
386
387/// Computes hash for a specific token window
388///
389/// Uses Rabin-Karp polynomial rolling hash with:
390/// - BASE = 257 (prime > 256 to minimize collisions for all byte values)
391/// - MODULUS = 1e9+7 (large prime commonly used in hashing algorithms)
392fn compute_window_hash(window: &[Token]) -> u64 {
393    /// Prime base for polynomial rolling hash (chosen to be > 256)
394    const BASE: u64 = 257;
395    /// Large prime modulus to reduce hash collisions (1e9+7)
396    const MODULUS: u64 = 1_000_000_007;
397
398    let mut hash: u64 = 0;
399    for token in window {
400        let token_hash = hash_token(token);
401        // Use u128 to prevent overflow before modulo operation
402        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
403        hash = wide_hash as u64;
404    }
405    hash
406}
407
408/// Verifies that two token windows are exactly identical
409fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
410    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
411        return false;
412    }
413    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
414}
415
/// Returns the set of keywords recognized across all supported languages
///
/// The list is the deduplicated union of Rust, Python, and JavaScript
/// keywords, sorted alphabetically. (The original repeated shared keywords
/// such as `if`, `for`, `return`, and `yield` once per language, inflating
/// every linear `contains` scan performed by `normalize`.)
fn get_keyword_set() -> &'static [&'static str] {
    &[
        "and", "as", "assert", "async", "await", "break", "case", "catch",
        "class", "const", "continue", "crate", "debugger", "def", "default",
        "del", "delete", "do", "dyn", "elif", "else", "enum", "except",
        "export", "extends", "extern", "false", "finally", "fn", "for",
        "from", "function", "global", "if", "impl", "import", "in",
        "instanceof", "is", "lambda", "let", "loop", "match", "mod", "move",
        "mut", "new", "nonlocal", "not", "or", "pass", "pub", "raise", "ref",
        "return", "self", "Self", "static", "struct", "super", "switch",
        "this", "throw", "trait", "true", "try", "type", "typeof", "unsafe",
        "use", "var", "void", "where", "while", "with", "yield",
    ]
}
517
/// Checks if a string is a recognized operator (one or two characters)
fn is_operator(s: &str) -> bool {
    // Table of every operator the tokenizer emits; two-character forms are
    // tried by the caller before single-character ones.
    const OPERATORS: &[&str] = &[
        "+", "-", "*", "/", "%", "=", "==", "!=", "<", ">", "<=", ">=",
        "&&", "||", "!", "&", "|", "^", "<<", ">>", "+=", "-=", "*=", "/=",
        "=>", "->", "::", ".",
    ];
    OPERATORS.contains(&s)
}
551
/// Checks if a character is punctuation (delimiters and separators)
fn is_punctuation(ch: char) -> bool {
    "(){}[];:,?".contains(ch)
}
559
#[cfg(test)]
mod tests {
    //! Unit tests covering tokenization (`normalize`), the rolling hash,
    //! and per-token hashing.

    use super::*;

    /// Rust source yields keyword tokens plus normalized identifiers.
    #[test]
    fn test_normalize_rust_code() {
        let code = r#"
        fn add(x: i32, y: i32) -> i32 {
            x + y
        }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        // Check that 'fn' is a keyword
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));

        // Check that identifiers are normalized
        assert!(tokens.contains(&Token::Identifier));
    }

    /// Python source yields keyword tokens and a normalized string literal.
    #[test]
    fn test_normalize_python_code() {
        let code = r#"
        def greet(name):
            return f"Hello, {name}!"
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        // Check that 'def' and 'return' are keywords
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));

        // Check that string is normalized
        assert!(tokens.contains(&Token::StringLiteral));
    }

    /// JavaScript source yields its language-specific keywords.
    #[test]
    fn test_normalize_javascript_code() {
        let code = r#"
        function multiply(a, b) {
            return a * b;
        }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        // Check that 'function' and 'return' are keywords
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
    }

    /// Comment-laden input should normalize without panicking.
    #[test]
    fn test_normalize_ignores_comments() {
        let code = r#"
        // This is a comment
        fn test() {
            /* Multi-line
               comment */
            let x = 5; // inline comment
        }
        "#;

        let tokens = normalize(code);

        // NOTE(review): this loop asserts nothing — it only documents the
        // intended token kinds. Consider asserting that words appearing
        // only inside comments are absent from `tokens`.
        for token in &tokens {
            if let Token::Identifier = token {
                // OK, identifier
            } else if let Token::Keyword(_) = token {
                // OK, keyword
            }
        }
    }

    /// A fresh hasher exposes its window size and no hash yet.
    #[test]
    fn test_rolling_hash_creation() {
        let hasher = RollingHash::new(50);
        assert_eq!(hasher.window_size(), 50);
        assert_eq!(hasher.current_hash(), None);
    }

    /// `roll` yields None until the window fills, then distinct hashes.
    #[test]
    fn test_rolling_hash_basic() {
        let mut hasher = RollingHash::new(3);

        // Add tokens one by one
        assert_eq!(hasher.roll(1), None); // Window not full
        assert_eq!(hasher.roll(2), None); // Window not full

        let hash1 = hasher.roll(3); // Window full
        assert!(hash1.is_some());

        let hash2 = hasher.roll(4); // Window rolls
        assert!(hash2.is_some());

        // Hashes should be different
        assert_ne!(hash1, hash2);
    }

    /// n tokens with window w produce n - w + 1 (hash, start) pairs.
    #[test]
    fn test_compute_rolling_hashes() {
        let tokens = vec![
            Token::Keyword("fn".to_string()),
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Identifier,
            Token::Punctuation(")".to_string()),
        ];

        let hashes = compute_rolling_hashes(&tokens, 3);
        assert_eq!(hashes.len(), 3); // 5 tokens, window size 3 = 3 hashes
    }

    /// Equal tokens must hash equally (determinism of hash_token).
    #[test]
    fn test_hash_token_consistency() {
        let token1 = Token::Identifier;
        let token2 = Token::Identifier;

        assert_eq!(hash_token(&token1), hash_token(&token2));
    }

    /// Placeholder tokens map to their fixed hash strings; keywords to text.
    #[test]
    fn test_token_as_hash_string() {
        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
    }
}