Skip to main content

sem_core/utils/
hash.rs

1use std::hash::Hasher;
2use tree_sitter::Node;
3use xxhash_rust::xxh3::Xxh3;
4
5pub fn content_hash(content: &str) -> String {
6    format!("{:016x}", xxhash_rust::xxh3::xxh3_64(content.as_bytes()))
7}
8
9pub fn short_hash(content: &str, length: usize) -> String {
10    let hash = content_hash(content);
11    hash[..length.min(hash.len())].to_string()
12}
13
14/// Compute a structural hash from a tree-sitter AST node.
15/// Strips comments and normalizes whitespace so formatting-only changes
16/// produce the same hash. Uses streaming xxHash64 to avoid intermediate
17/// string allocations.
18pub fn structural_hash(node: Node, source: &[u8]) -> String {
19    let mut hasher = Xxh3::new();
20    hash_structural_tokens(node, source, &mut hasher);
21    format!("{:016x}", hasher.finish())
22}
23
24/// Compute a structural hash that excludes tokens within a given byte range.
25/// Used to strip the entity name from the hash so that renames of otherwise
26/// identical entities produce the same hash, enabling Phase 2 rename detection.
27pub fn structural_hash_excluding_range(
28    node: Node,
29    source: &[u8],
30    exclude_start: usize,
31    exclude_end: usize,
32) -> String {
33    let mut hasher = Xxh3::new();
34    hash_structural_tokens_excluding(node, source, &mut hasher, exclude_start, exclude_end);
35    format!("{:016x}", hasher.finish())
36}
37
38/// Recursively hash tokens from the AST, skipping comments.
39/// Hashes both node types (structure) and leaf text (content) so that
40/// structurally different ASTs with identical leaf tokens produce different hashes.
41/// Zero allocations: hashes directly from source byte slices.
42fn hash_structural_tokens(node: Node, source: &[u8], hasher: &mut Xxh3) {
43    let kind = node.kind();
44
45    if is_comment_node(kind) {
46        return;
47    }
48
49    if node.child_count() == 0 {
50        // Leaf node: hash its text directly from the source buffer
51        let start = node.start_byte();
52        let end = node.end_byte();
53        if start < end && end <= source.len() {
54            let bytes = &source[start..end];
55            // Trim whitespace manually to avoid allocation
56            let trimmed = trim_bytes(bytes);
57            if !trimmed.is_empty() {
58                hasher.write(trimmed);
59                hasher.write(b" ");
60            }
61        }
62    } else {
63        // Hash the node type to capture structure, not just leaf content.
64        // e.g. `x = foo(bar)` vs `foo(bar) = x` have same leaves but different structure.
65        hasher.write(kind.as_bytes());
66        hasher.write(b":");
67        let mut cursor = node.walk();
68        for child in node.children(&mut cursor) {
69            hash_structural_tokens(child, source, hasher);
70        }
71    }
72}
73
74/// Like `hash_structural_tokens` but skips any leaf node whose byte range
75/// overlaps the excluded range (the entity name).
76fn hash_structural_tokens_excluding(
77    node: Node,
78    source: &[u8],
79    hasher: &mut Xxh3,
80    exclude_start: usize,
81    exclude_end: usize,
82) {
83    let kind = node.kind();
84
85    if is_comment_node(kind) {
86        return;
87    }
88
89    if node.child_count() == 0 {
90        let start = node.start_byte();
91        let end = node.end_byte();
92        // Skip leaf nodes that overlap the excluded range
93        if start < exclude_end && end > exclude_start {
94            return;
95        }
96        if start < end && end <= source.len() {
97            let bytes = &source[start..end];
98            let trimmed = trim_bytes(bytes);
99            if !trimmed.is_empty() {
100                hasher.write(trimmed);
101                hasher.write(b" ");
102            }
103        }
104    } else {
105        hasher.write(kind.as_bytes());
106        hasher.write(b":");
107        let mut cursor = node.walk();
108        for child in node.children(&mut cursor) {
109            hash_structural_tokens_excluding(child, source, hasher, exclude_start, exclude_end);
110        }
111    }
112}
113
114/// Trim leading/trailing ASCII whitespace from a byte slice without allocating.
115#[inline]
116fn trim_bytes(bytes: &[u8]) -> &[u8] {
117    let start = bytes.iter().position(|b| !b.is_ascii_whitespace()).unwrap_or(bytes.len());
118    let end = bytes.iter().rposition(|b| !b.is_ascii_whitespace()).map_or(start, |p| p + 1);
119    &bytes[start..end]
120}
121
122fn is_comment_node(kind: &str) -> bool {
123    matches!(
124        kind,
125        "comment" | "line_comment" | "block_comment" | "doc_comment" | "tag_comment"
126    )
127}
128
129#[cfg(test)]
130mod tests {
131    use super::*;
132
133    #[test]
134    fn test_content_hash_deterministic() {
135        let h1 = content_hash("hello world");
136        let h2 = content_hash("hello world");
137        assert_eq!(h1, h2);
138    }
139
140    #[test]
141    fn test_content_hash_hex_format() {
142        let h = content_hash("test");
143        assert_eq!(h.len(), 16); // xxHash64 = 8 bytes = 16 hex chars
144        assert!(h.chars().all(|c| c.is_ascii_hexdigit()));
145    }
146
147    #[test]
148    fn test_short_hash() {
149        let h = short_hash("test", 8);
150        assert_eq!(h.len(), 8);
151    }
152}