Skip to main content

sem_core/utils/
hash.rs

1use std::hash::Hasher;
2use tree_sitter::Node;
3use xxhash_rust::xxh3::Xxh3;
4
5pub fn content_hash(content: &str) -> String {
6    format!("{:016x}", xxhash_rust::xxh3::xxh3_64(content.as_bytes()))
7}
8
9pub fn short_hash(content: &str, length: usize) -> String {
10    let hash = content_hash(content);
11    hash[..length.min(hash.len())].to_string()
12}
13
14/// Compute a structural hash from a tree-sitter AST node.
15/// Strips comments and normalizes whitespace so formatting-only changes
16/// produce the same hash. Uses streaming xxHash64 to avoid intermediate
17/// string allocations.
18pub fn structural_hash(node: Node, source: &[u8]) -> String {
19    let mut hasher = Xxh3::new();
20    hash_structural_tokens(node, source, &mut hasher);
21    format!("{:016x}", hasher.finish())
22}
23
24/// Compute a structural hash that excludes tokens within a given byte range.
25/// Used to strip the entity name from the hash so that renames of otherwise
26/// identical entities produce the same hash, enabling Phase 2 rename detection.
27pub fn structural_hash_excluding_range(
28    node: Node,
29    source: &[u8],
30    exclude_start: usize,
31    exclude_end: usize,
32) -> String {
33    let mut hasher = Xxh3::new();
34    hash_structural_tokens_excluding(node, source, &mut hasher, exclude_start, exclude_end);
35    format!("{:016x}", hasher.finish())
36}
37
38/// Iteratively hash tokens from the AST, skipping comments.
39/// Hashes both node types (structure) and leaf text (content) so that
40/// structurally different ASTs with identical leaf tokens produce different hashes.
41/// Zero allocations: hashes directly from source byte slices.
42fn hash_structural_tokens(root: Node, source: &[u8], hasher: &mut Xxh3) {
43    let mut worklist = vec![root];
44    while let Some(node) = worklist.pop() {
45        let kind = node.kind();
46
47        if is_comment_node(kind) {
48            continue;
49        }
50
51        if node.child_count() == 0 {
52            // Leaf node: hash its text directly from the source buffer
53            let start = node.start_byte();
54            let end = node.end_byte();
55            if start < end && end <= source.len() {
56                let bytes = &source[start..end];
57                // Trim whitespace manually to avoid allocation
58                let trimmed = trim_bytes(bytes);
59                if !trimmed.is_empty() {
60                    hasher.write(trimmed);
61                    hasher.write(b" ");
62                }
63            }
64        } else {
65            // Hash the node type to capture structure, not just leaf content.
66            // e.g. `x = foo(bar)` vs `foo(bar) = x` have same leaves but different structure.
67            hasher.write(kind.as_bytes());
68            hasher.write(b":");
69            let mut cursor = node.walk();
70            let children: Vec<_> = node.children(&mut cursor).collect();
71            for child in children.into_iter().rev() {
72                worklist.push(child);
73            }
74        }
75    }
76}
77
78/// Like `hash_structural_tokens` but skips any leaf node whose byte range
79/// overlaps the excluded range (the entity name).
80fn hash_structural_tokens_excluding(
81    root: Node,
82    source: &[u8],
83    hasher: &mut Xxh3,
84    exclude_start: usize,
85    exclude_end: usize,
86) {
87    let mut worklist = vec![root];
88    while let Some(node) = worklist.pop() {
89        let kind = node.kind();
90
91        if is_comment_node(kind) {
92            continue;
93        }
94
95        if node.child_count() == 0 {
96            let start = node.start_byte();
97            let end = node.end_byte();
98            // Skip leaf nodes that overlap the excluded range
99            if start < exclude_end && end > exclude_start {
100                continue;
101            }
102            if start < end && end <= source.len() {
103                let bytes = &source[start..end];
104                let trimmed = trim_bytes(bytes);
105                if !trimmed.is_empty() {
106                    hasher.write(trimmed);
107                    hasher.write(b" ");
108                }
109            }
110        } else {
111            hasher.write(kind.as_bytes());
112            hasher.write(b":");
113            let mut cursor = node.walk();
114            let children: Vec<_> = node.children(&mut cursor).collect();
115            for child in children.into_iter().rev() {
116                worklist.push(child);
117            }
118        }
119    }
120}
121
122/// Trim leading/trailing ASCII whitespace from a byte slice without allocating.
123#[inline]
124fn trim_bytes(bytes: &[u8]) -> &[u8] {
125    let start = bytes.iter().position(|b| !b.is_ascii_whitespace()).unwrap_or(bytes.len());
126    let end = bytes.iter().rposition(|b| !b.is_ascii_whitespace()).map_or(start, |p| p + 1);
127    &bytes[start..end]
128}
129
130fn is_comment_node(kind: &str) -> bool {
131    matches!(
132        kind,
133        "comment" | "line_comment" | "block_comment" | "doc_comment" | "tag_comment"
134    )
135}
136
137#[cfg(test)]
138mod tests {
139    use super::*;
140
141    #[test]
142    fn test_content_hash_deterministic() {
143        let h1 = content_hash("hello world");
144        let h2 = content_hash("hello world");
145        assert_eq!(h1, h2);
146    }
147
148    #[test]
149    fn test_content_hash_hex_format() {
150        let h = content_hash("test");
151        assert_eq!(h.len(), 16); // xxHash64 = 8 bytes = 16 hex chars
152        assert!(h.chars().all(|c| c.is_ascii_hexdigit()));
153    }
154
155    #[test]
156    fn test_short_hash() {
157        let h = short_hash("test", 8);
158        assert_eq!(h.len(), 8);
159    }
160}