dupe_core/
hashing.rs

//! Hashing and normalization logic for duplicate code detection
//!
//! This module implements:
//! - Token normalization (Type-2 clone detection)
//! - Rabin-Karp rolling hash algorithm
//! - Hash-based code similarity detection
//! - Edit distance calculation (Type-3 clone detection)
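//!
//! A minimal end-to-end sketch (illustrative only; it assumes this module is
//! exposed as `dupe_core::hashing`, which this file does not confirm):
//!
//! ```
//! use dupe_core::hashing::{detect_duplicates_with_extension, normalize};
//!
//! let code = "
//!     fn first(a, b) { let t = a + b; t * 2 }
//!     fn second(x, y) { let s = x + y; s * 2 }
//! ";
//! // Type-2 normalization collapses identifiers, so both bodies tokenize identically.
//! let tokens = normalize(code);
//! let clones = detect_duplicates_with_extension(&tokens, 8);
//! assert!(!clones.is_empty());
//! assert!((clones[0].similarity - 1.0).abs() < f64::EPSILON);
//! ```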

use std::num::Wrapping;

/// Normalized token representation
///
/// Identifiers and literals are normalized to allow detection of
/// structurally similar code (Type-2 clones).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// Keyword (if, for, while, fn, def, etc.)
    Keyword(String),
    /// Normalized identifier placeholder
    Identifier,
    /// Normalized string literal placeholder
    StringLiteral,
    /// Normalized number literal placeholder
    NumberLiteral,
    /// Operator (+, -, *, /, etc.)
    Operator(String),
    /// Punctuation (parentheses, braces, semicolons)
    Punctuation(String),
}

impl Token {
    /// Returns a string representation for hashing
    pub fn as_hash_string(&self) -> &str {
        match self {
            Token::Keyword(kw) => kw.as_str(),
            Token::Identifier => "$$ID",
            Token::StringLiteral => "$$STR",
            Token::NumberLiteral => "$$NUM",
            Token::Operator(op) => op.as_str(),
            Token::Punctuation(p) => p.as_str(),
        }
    }
}

/// Normalizes source code into a token stream for duplicate detection
///
/// # Normalization Rules
/// - Comments are ignored
/// - Whitespace is ignored
/// - Identifiers → `$$ID`
/// - String literals → `$$STR`
/// - Number literals → `$$NUM`
/// - Keywords are preserved
///
/// # Arguments
/// * `code` - The source code to normalize
///
/// # Returns
/// * `Vec<Token>` - Normalized token stream
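///
/// # Examples
///
/// An illustrative sketch (it assumes this module is exposed as
/// `dupe_core::hashing`, which this file does not confirm):
///
/// ```
/// use dupe_core::hashing::{normalize, Token};
///
/// let tokens = normalize(r#"let name = "Ada"; // bind a string"#);
/// // The identifier and the literal become placeholders; the comment is dropped.
/// assert_eq!(
///     tokens,
///     vec![
///         Token::Keyword("let".to_string()),
///         Token::Identifier,
///         Token::Operator("=".to_string()),
///         Token::StringLiteral,
///         Token::Punctuation(";".to_string()),
///     ]
/// );
/// ```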
pub fn normalize(code: &str) -> Vec<Token> {
    let keywords = get_keyword_set();
    // Pre-allocate capacity based on heuristic: ~1 token per 3 characters
    let estimated_tokens = code.len() / 3;
    let mut tokens = Vec::with_capacity(estimated_tokens);
    let chars: Vec<char> = code.chars().collect();
    let mut i = 0;

    while i < chars.len() {
        let ch = chars[i];

        // Skip whitespace
        if ch.is_whitespace() {
            i += 1;
            continue;
        }

        // Skip single-line comments (// and #)
        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
            // Skip until end of line
            while i < chars.len() && chars[i] != '\n' {
                i += 1;
            }
            continue;
        }

        // Skip block comments (/* ... */)
        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
            i += 2;
            while i + 1 < chars.len() {
                if chars[i] == '*' && chars[i + 1] == '/' {
                    i += 2;
                    break;
                }
                i += 1;
            }
            continue;
        }

        // String literals
        if ch == '"' || ch == '\'' {
            let quote = ch;
            i += 1;
            // Skip until closing quote
            while i < chars.len() {
                if chars[i] == '\\' {
                    i += 2; // Skip escaped character
                    continue;
                }
                if chars[i] == quote {
                    i += 1;
                    break;
                }
                i += 1;
            }
            tokens.push(Token::StringLiteral);
            continue;
        }

        // Numbers
        if ch.is_ascii_digit() {
            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
                i += 1;
            }
            tokens.push(Token::NumberLiteral);
            continue;
        }

        // Identifiers and keywords
        if ch.is_alphabetic() || ch == '_' || ch == '$' {
            let start = i;
            while i < chars.len()
                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
            {
                i += 1;
            }
            let word: String = chars[start..i].iter().collect();

            if keywords.contains(&word.as_str()) {
                tokens.push(Token::Keyword(word));
            } else {
                tokens.push(Token::Identifier);
            }
            continue;
        }

        // Operators (multi-char)
        if i + 1 < chars.len() {
            let two_char: String = chars[i..i + 2].iter().collect();
            if is_operator(&two_char) {
                tokens.push(Token::Operator(two_char));
                i += 2;
                continue;
            }
        }

        // Single-char operators and punctuation
        let single = ch.to_string();
        if is_operator(&single) {
            tokens.push(Token::Operator(single));
        } else if is_punctuation(ch) {
            tokens.push(Token::Punctuation(single));
        }

        i += 1;
    }

    tokens
}

/// Rabin-Karp rolling hash for efficient substring comparison
///
/// Uses a rolling window to compute hashes of code blocks.
/// Allows for efficient duplicate detection in O(n) time.
#[derive(Debug, Clone)]
pub struct RollingHash {
    /// Window size for rolling hash
    window_size: usize,
    /// Base for polynomial rolling hash
    base: Wrapping<u64>,
    /// Current hash value
    hash: Wrapping<u64>,
    /// Power of base for window size (base^window_size)
    base_power: Wrapping<u64>,
    /// Current window contents
    window: Vec<u64>,
}

impl RollingHash {
    /// Creates a new RollingHash with the specified window size
    ///
    /// # Arguments
    /// * `window_size` - Number of tokens in the rolling window (default: 50)
    pub fn new(window_size: usize) -> Self {
        let base = Wrapping(257u64);
        let mut base_power = Wrapping(1u64);

        // Calculate base^window_size
        for _ in 0..window_size {
            base_power *= base;
        }

        Self {
            window_size,
            base,
            hash: Wrapping(0),
            base_power,
            window: Vec::with_capacity(window_size),
        }
    }

    /// Adds a token to the rolling hash window
    ///
    /// If the window is full, the oldest token is removed.
    ///
    /// # Arguments
    /// * `token_hash` - Hash value of the token to add
    ///
    /// # Returns
    /// * `Option<u64>` - The current hash if window is full, None otherwise
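    ///
    /// # Examples
    ///
    /// An illustrative sketch (it assumes this module is exposed as
    /// `dupe_core::hashing`, which this file does not confirm):
    ///
    /// ```
    /// use dupe_core::hashing::RollingHash;
    ///
    /// let mut hasher = RollingHash::new(3);
    /// assert_eq!(hasher.roll(1), None); // window not yet full
    /// assert_eq!(hasher.roll(2), None);
    /// assert!(hasher.roll(3).is_some()); // first full window
    /// assert!(hasher.roll(4).is_some()); // window rolls forward
    /// ```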
    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
        if self.window.len() < self.window_size {
            // Window not full yet
            self.window.push(token_hash);
            self.hash = self.hash * self.base + Wrapping(token_hash);

            if self.window.len() == self.window_size {
                Some(self.hash.0)
            } else {
                None
            }
        } else {
            // Window is full, remove oldest and add new
            let old_token = self.window.remove(0);
            self.window.push(token_hash);

            // Shift the window left and add the new token
            self.hash = self.hash * self.base + Wrapping(token_hash);
            // Then remove the old token's contribution (old_token * base^window_size),
            // so the result equals a freshly computed hash of the new window
            self.hash -= Wrapping(old_token) * self.base_power;

            Some(self.hash.0)
        }
    }

    /// Resets the rolling hash to initial state
    pub fn reset(&mut self) {
        self.hash = Wrapping(0);
        self.window.clear();
    }

    /// Returns the current hash value (if window is full)
    pub fn current_hash(&self) -> Option<u64> {
        if self.window.len() == self.window_size {
            Some(self.hash.0)
        } else {
            None
        }
    }

    /// Returns the current window size
    pub fn window_size(&self) -> usize {
        self.window_size
    }
}

/// Computes rolling hashes for a token stream
///
/// # Arguments
/// * `tokens` - Normalized token stream
/// * `window_size` - Size of the rolling window
///
/// # Returns
/// * `Vec<(u64, usize)>` - List of (hash, start_index) pairs
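///
/// # Examples
///
/// An illustrative sketch (it assumes this module is exposed as
/// `dupe_core::hashing`, which this file does not confirm):
///
/// ```
/// use dupe_core::hashing::{compute_rolling_hashes, Token};
///
/// let tokens = vec![
///     Token::Keyword("fn".to_string()),
///     Token::Identifier,
///     Token::Punctuation("(".to_string()),
///     Token::Identifier,
///     Token::Punctuation(")".to_string()),
/// ];
/// // Five tokens with a window of three yield windows starting at 0, 1 and 2.
/// let hashes = compute_rolling_hashes(&tokens, 3);
/// assert_eq!(hashes.len(), 3);
/// assert_eq!(hashes[0].1, 0);
/// assert_eq!(hashes[2].1, 2);
/// ```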
pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
    if tokens.len() < window_size {
        return Vec::new();
    }

    let mut hasher = RollingHash::new(window_size);
    let mut hashes = Vec::new();

    for (idx, token) in tokens.iter().enumerate() {
        let token_hash = hash_token(token);
        if let Some(hash) = hasher.roll(token_hash) {
            // idx is the last token in the window, so start_index is idx - window_size + 1
            // but we need to ensure it doesn't underflow
            let start_index = idx.saturating_sub(window_size - 1);
            hashes.push((hash, start_index));
        }
    }

    hashes
}

/// Computes a hash value for a single token
fn hash_token(token: &Token) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    token.as_hash_string().hash(&mut hasher);
    hasher.finish()
}

/// Represents a detected duplicate code block
#[derive(Debug, Clone)]
pub struct CloneMatch {
    pub source_start: usize,
    pub target_start: usize,
    /// Length of the matched region in the source sequence
    pub length: usize,
    /// Length of the matched region in the target sequence (may differ for Type-3)
    pub target_length: usize,
    /// Token-level similarity (0.0-1.0). 1.0 for Type-1/2, calculated for Type-3
    pub similarity: f64,
}

/// Detects duplicates using rolling hash with greedy extension
///
/// This implements the Hash-and-Extend strategy:
/// 1. Hash each `window_size`-token window (50 tokens by default) to find candidate matches
/// 2. Verify the match to handle hash collisions
/// 3. Greedily extend the match beyond the initial window
/// 4. Skip ahead to avoid reporting overlapping duplicates
///
/// # Arguments
/// * `tokens` - The token sequence to analyze
/// * `window_size` - Size of the rolling window (default: 50)
///
/// # Returns
/// * `Vec<CloneMatch>` - List of detected clones with variable lengths
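///
/// # Examples
///
/// An illustrative sketch (it assumes this module is exposed as
/// `dupe_core::hashing`, which this file does not confirm):
///
/// ```
/// use dupe_core::hashing::{detect_duplicates_with_extension, Token};
///
/// let pattern = vec![
///     Token::Keyword("if".to_string()),
///     Token::Identifier,
///     Token::Operator("==".to_string()),
///     Token::NumberLiteral,
///     Token::Punctuation("{".to_string()),
/// ];
/// // Repeat the five-token pattern at positions 0 and 6, separated by one token.
/// let mut tokens = pattern.clone();
/// tokens.push(Token::Punctuation("}".to_string()));
/// tokens.extend(pattern);
///
/// let clones = detect_duplicates_with_extension(&tokens, 5);
/// assert_eq!(clones.len(), 1);
/// assert_eq!((clones[0].source_start, clones[0].target_start), (0, 6));
/// assert_eq!(clones[0].length, 5);
/// ```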
pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
    use std::collections::HashMap;

    if tokens.len() < window_size {
        return Vec::new();
    }

    let mut matches = Vec::new();
    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
    let mut i = 0;

    // Build rolling hashes and detect matches with extension
    while i <= tokens.len().saturating_sub(window_size) {
        // 1. Compute hash for current window
        let current_hash = compute_window_hash(&tokens[i..i + window_size]);

        // Record this window under its true start index before matching, so the
        // stored position is not skewed when `i` is advanced after a match below.
        hash_map.entry(current_hash).or_default().push(i);

        // 2. Check if we've seen this hash before
        if let Some(prev_indices) = hash_map.get(&current_hash) {
            // Try to match with each previous occurrence
            for &prev_index in prev_indices.iter() {
                // Skip if this would be a self-match or overlap
                if prev_index >= i {
                    continue;
                }

                // 3. Verify exact match (handle hash collisions)
                if verify_window_match(tokens, prev_index, i, window_size) {
                    // 4. GREEDY EXTENSION: Expand beyond the initial window
                    let mut extension = 0;
                    while (i + window_size + extension < tokens.len())
                        && (prev_index + window_size + extension < i)
                        && (tokens[prev_index + window_size + extension]
                            == tokens[i + window_size + extension])
                    {
                        extension += 1;
                    }

                    let total_length = window_size + extension;

                    // Record the full match
                    matches.push(CloneMatch {
                        source_start: prev_index,
                        target_start: i,
                        length: total_length,
                        target_length: total_length,
                        similarity: 1.0, // Exact match
                    });

                    // 5. Skip ahead to avoid reporting overlapping subsets
                    i += extension.max(1);
                    break; // Found a match, move to next position
                }
            }
        }

        i += 1;
    }

    matches
}

/// Computes hash for a specific token window
///
/// Uses Rabin-Karp polynomial rolling hash with:
/// - BASE = 257 (prime > 256 to minimize collisions for all byte values)
/// - MODULUS = 1e9+7 (large prime commonly used in hashing algorithms)
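///
/// The hash is folded left to right over the window:
/// `hash = (hash * BASE + hash_token(t)) % MODULUS` for each token `t`.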
fn compute_window_hash(window: &[Token]) -> u64 {
    /// Prime base for polynomial rolling hash (chosen to be > 256)
    const BASE: u64 = 257;
    /// Large prime modulus to reduce hash collisions (1e9+7)
    const MODULUS: u64 = 1_000_000_007;

    let mut hash: u64 = 0;
    for token in window {
        let token_hash = hash_token(token);
        // Use u128 to prevent overflow before modulo operation
        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
        hash = wide_hash as u64;
    }
    hash
}

/// Verifies that two token windows are exactly identical
fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
        return false;
    }
    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
}

/// Returns a set of keywords for all supported languages
fn get_keyword_set() -> &'static [&'static str] {
    &[
        // Rust keywords
        "as",
        "break",
        "const",
        "continue",
        "crate",
        "else",
        "enum",
        "extern",
        "false",
        "fn",
        "for",
        "if",
        "impl",
        "in",
        "let",
        "loop",
        "match",
        "mod",
        "move",
        "mut",
        "pub",
        "ref",
        "return",
        "self",
        "Self",
        "static",
        "struct",
        "super",
        "trait",
        "true",
        "type",
        "unsafe",
        "use",
        "where",
        "while",
        "async",
        "await",
        "dyn",
        // Python keywords
        "and",
        "assert",
        "class",
        "def",
        "del",
        "elif",
        "except",
        "finally",
        "from",
        "global",
        "import",
        "is",
        "lambda",
        "nonlocal",
        "not",
        "or",
        "pass",
        "raise",
        "try",
        "with",
        "yield",
        // JavaScript keywords
        "await",
        "case",
        "catch",
        "class",
        "const",
        "continue",
        "debugger",
        "default",
        "delete",
        "do",
        "else",
        "export",
        "extends",
        "finally",
        "for",
        "function",
        "if",
        "import",
        "in",
        "instanceof",
        "let",
        "new",
        "return",
        "super",
        "switch",
        "this",
        "throw",
        "try",
        "typeof",
        "var",
        "void",
        "while",
        "with",
        "yield",
    ]
}

/// Checks if a string is an operator
fn is_operator(s: &str) -> bool {
    matches!(
        s,
        "+" | "-"
            | "*"
            | "/"
            | "%"
            | "="
            | "=="
            | "!="
            | "<"
            | ">"
            | "<="
            | ">="
            | "&&"
            | "||"
            | "!"
            | "&"
            | "|"
            | "^"
            | "<<"
            | ">>"
            | "+="
            | "-="
            | "*="
            | "/="
            | "=>"
            | "->"
            | "::"
            | "."
    )
}

/// Checks if a character is punctuation
fn is_punctuation(ch: char) -> bool {
    matches!(
        ch,
        '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
    )
}

/// Computes token-level similarity between two token sequences using edit distance
///
/// Returns a similarity score between 0.0 and 1.0, where:
/// - 1.0 = identical sequences
/// - 0.0 = completely different
///
/// Uses normalized Levenshtein distance:
/// similarity = 1 - (edit_distance / max_length)
///
/// # Arguments
/// * `tokens1` - First token sequence
/// * `tokens2` - Second token sequence
///
/// # Returns
/// * `f64` - Similarity score (0.0-1.0)
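///
/// # Examples
///
/// An illustrative sketch (it assumes this module is exposed as
/// `dupe_core::hashing`, which this file does not confirm):
///
/// ```
/// use dupe_core::hashing::{compute_token_similarity, Token};
///
/// let a = [Token::Identifier, Token::Operator("+".to_string()), Token::NumberLiteral];
/// let b = [Token::Identifier, Token::Operator("-".to_string()), Token::NumberLiteral];
/// // One substitution out of three tokens: 1 - 1/3 ≈ 0.667
/// let sim = compute_token_similarity(&a, &b);
/// assert!((sim - 2.0 / 3.0).abs() < 1e-9);
/// ```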
pub fn compute_token_similarity(tokens1: &[Token], tokens2: &[Token]) -> f64 {
    if tokens1.is_empty() && tokens2.is_empty() {
        return 1.0;
    }
    if tokens1.is_empty() || tokens2.is_empty() {
        return 0.0;
    }

    // Compute edit distance using custom token comparison
    let distance = compute_token_edit_distance(tokens1, tokens2);
    let max_len = tokens1.len().max(tokens2.len());

    // Normalize to similarity score
    let similarity = 1.0 - (distance as f64 / max_len as f64);
    similarity.clamp(0.0, 1.0)
}

/// Computes Levenshtein edit distance between two token sequences
///
/// Uses dynamic programming to calculate the minimum number of
/// insertions, deletions, or substitutions needed to transform
/// tokens1 into tokens2.
///
/// # Arguments
/// * `tokens1` - First token sequence
/// * `tokens2` - Second token sequence
///
/// # Returns
/// * `usize` - Edit distance
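///
/// # Examples
///
/// An illustrative sketch (it assumes this module is exposed as
/// `dupe_core::hashing`, which this file does not confirm):
///
/// ```
/// use dupe_core::hashing::{compute_token_edit_distance, Token};
///
/// let a = [Token::Keyword("if".to_string()), Token::Identifier, Token::NumberLiteral];
/// let b = [Token::Keyword("if".to_string()), Token::NumberLiteral];
/// // One deletion turns `a` into `b`.
/// assert_eq!(compute_token_edit_distance(&a, &b), 1);
/// ```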
pub fn compute_token_edit_distance(tokens1: &[Token], tokens2: &[Token]) -> usize {
    let len1 = tokens1.len();
    let len2 = tokens2.len();

    if len1 == 0 {
        return len2;
    }
    if len2 == 0 {
        return len1;
    }

    // Create DP table
    let mut prev_row: Vec<usize> = (0..=len2).collect();
    let mut curr_row: Vec<usize> = vec![0; len2 + 1];

    for i in 1..=len1 {
        curr_row[0] = i;

        for j in 1..=len2 {
            let cost = if tokens1[i - 1] == tokens2[j - 1] {
                0
            } else {
                1
            };

            curr_row[j] = (prev_row[j - 1] + cost) // substitution
                .min(prev_row[j] + 1) // deletion
                .min(curr_row[j - 1] + 1); // insertion
        }

        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    prev_row[len2]
}

/// Detects Type-3 clones (gap-tolerant) between two token sequences
///
/// Uses a sliding window approach with edit distance calculation:
/// 1. Find candidate regions where tokens partially match
/// 2. Calculate edit distance for each candidate pair
/// 3. Accept matches above the tolerance threshold
///
/// # Arguments
/// * `tokens1` - First token sequence
/// * `tokens2` - Second token sequence
/// * `window_size` - Minimum block size to consider
/// * `tolerance` - Minimum similarity threshold (0.0-1.0)
///
/// # Returns
/// * `Vec<CloneMatch>` - List of Type-3 clone matches with similarity scores
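///
/// # Examples
///
/// An illustrative sketch (it assumes this module is exposed as
/// `dupe_core::hashing`, which this file does not confirm):
///
/// ```
/// use dupe_core::hashing::{detect_type3_clones, normalize};
///
/// // The two snippets normalize to ten tokens each and differ only in the operator.
/// let tokens1 = normalize("if a == 1 { b = 2; }");
/// let tokens2 = normalize("if x != 9 { y = 7; }");
///
/// let matches = detect_type3_clones(&tokens1, &tokens2, 10, 0.8);
/// assert_eq!(matches.len(), 1);
/// // One token of ten differs, so the similarity is 0.9.
/// assert!((matches[0].similarity - 0.9).abs() < 1e-9);
/// ```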
pub fn detect_type3_clones(
    tokens1: &[Token],
    tokens2: &[Token],
    window_size: usize,
    tolerance: f64,
) -> Vec<CloneMatch> {
    let mut matches = Vec::new();

    // Performance optimization: Only check sequences within ±20% length difference
    let len_ratio = tokens1.len() as f64 / tokens2.len().max(1) as f64;
    if !(0.8..=1.2).contains(&len_ratio) {
        return matches;
    }
    if tokens1.len() < window_size || tokens2.len() < window_size || window_size == 0 {
        return matches;
    }

    // Allow limited length drift so edit distance can align insertions/deletions.
    // Bounded by tolerance (minimum edits needed) and capped to avoid quadratic blowup.
    let base_delta = ((1.0 - tolerance) * window_size as f64).ceil() as usize;
    let length_delta_limit = (base_delta + 2).max(1).min(window_size).min(20);
    // Explore only when the base windows are at least somewhat similar to avoid exhaustive search
    let similarity_floor = (tolerance - 0.3).max(0.0);

    // Sliding window comparison
    let mut i = 0;
    while i + window_size <= tokens1.len() {
        let mut j = 0;
        while j + window_size <= tokens2.len() {
            // Extract windows
            let window1 = &tokens1[i..i + window_size];
            let window2 = &tokens2[j..j + window_size];

            // Calculate similarity
            let similarity = compute_token_similarity(window1, window2);

            if similarity < similarity_floor {
                j += 1;
            } else {
                let max_extra1 =
                    length_delta_limit.min(tokens1.len().saturating_sub(i + window_size));
                let max_extra2 =
                    length_delta_limit.min(tokens2.len().saturating_sub(j + window_size));

                let mut best_match: Option<(usize, usize, f64)> = None;

                for extra1 in 0..=max_extra1 {
                    let len1 = window_size + extra1;
                    let window1 = &tokens1[i..i + len1];

                    for extra2 in 0..=max_extra2 {
                        let len2 = window_size + extra2;

                        if len1.abs_diff(len2) > length_delta_limit {
                            continue;
                        }

                        let max_len = len1.max(len2);
                        let min_distance = len1.abs_diff(len2); // At least this many edits required
                        let max_allowed_distance =
                            ((1.0 - tolerance) * max_len as f64).ceil() as usize;

                        if min_distance > max_allowed_distance {
                            continue;
                        }

                        let candidate_similarity = if extra1 == 0 && extra2 == 0 {
                            similarity
                        } else {
                            compute_token_similarity(window1, &tokens2[j..j + len2])
                        };

                        if candidate_similarity >= tolerance {
                            match best_match {
                                None => best_match = Some((len1, len2, candidate_similarity)),
                                Some((best_len1, best_len2, best_sim)) => {
                                    let better_similarity =
                                        candidate_similarity > best_sim + f64::EPSILON;
                                    let better_coverage = !better_similarity
                                        && len1.max(len2) > best_len1.max(best_len2);

                                    if better_similarity || better_coverage {
                                        best_match = Some((len1, len2, candidate_similarity));
                                    }
                                }
                            }
                        }
                    }
                }

                if let Some((source_len, target_len, final_similarity)) = best_match {
                    matches.push(CloneMatch {
                        source_start: i,
                        target_start: j,
                        length: source_len,
                        target_length: target_len,
                        similarity: final_similarity,
                    });

                    // Skip ahead on the target side proportionally to the match size to reduce overlap
                    let skip = target_len.saturating_sub(window_size).max(1);
                    j += skip;
                } else {
                    j += 1;
                }
            }
        }
        i += 1;
    }

    matches
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_rust_code() {
        let code = r#"
        fn add(x: i32, y: i32) -> i32 {
            x + y
        }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        // Check that 'fn' is a keyword
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));

        // Check that identifiers are normalized
        assert!(tokens.contains(&Token::Identifier));
    }

    #[test]
    fn test_normalize_python_code() {
        let code = r#"
        def greet(name):
            return f"Hello, {name}!"
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        // Check that 'def' and 'return' are keywords
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));

        // Check that string is normalized
        assert!(tokens.contains(&Token::StringLiteral));
    }

    #[test]
    fn test_normalize_javascript_code() {
        let code = r#"
        function multiply(a, b) {
            return a * b;
        }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        // Check that 'function' and 'return' are keywords
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
    }

    #[test]
    fn test_normalize_ignores_comments() {
        let with_comments = r#"
        // This is a comment
        fn test() {
            /* Multi-line
               comment */
            let x = 5; // inline comment
        }
        "#;
        let without_comments = r#"
        fn test() {
            let x = 5;
        }
        "#;

        // Comment-only differences must not change the normalized token stream
        assert_eq!(normalize(with_comments), normalize(without_comments));
    }

    #[test]
    fn test_rolling_hash_creation() {
        let hasher = RollingHash::new(50);
        assert_eq!(hasher.window_size(), 50);
        assert_eq!(hasher.current_hash(), None);
    }

    #[test]
    fn test_rolling_hash_basic() {
        let mut hasher = RollingHash::new(3);

        // Add tokens one by one
        assert_eq!(hasher.roll(1), None); // Window not full
        assert_eq!(hasher.roll(2), None); // Window not full

        let hash1 = hasher.roll(3); // Window full
        assert!(hash1.is_some());

        let hash2 = hasher.roll(4); // Window rolls
        assert!(hash2.is_some());

        // Hashes should be different
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_rolling_hashes() {
        let tokens = vec![
            Token::Keyword("fn".to_string()),
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Identifier,
            Token::Punctuation(")".to_string()),
        ];

        let hashes = compute_rolling_hashes(&tokens, 3);
        assert_eq!(hashes.len(), 3); // 5 tokens, window size 3 = 3 hashes
    }

    #[test]
    fn test_hash_token_consistency() {
        let token1 = Token::Identifier;
        let token2 = Token::Identifier;

        assert_eq!(hash_token(&token1), hash_token(&token2));
    }

    #[test]
    fn test_token_as_hash_string() {
        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
    }
}