Skip to main content

sem_core/utils/
hash.rs

1use std::hash::Hasher;
2use tree_sitter::Node;
3use xxhash_rust::xxh3::Xxh3;
4
5pub fn content_hash(content: &str) -> String {
6    content_hash_bytes(content.as_bytes())
7}
8
9pub fn content_hash_bytes(content: &[u8]) -> String {
10    format!("{:016x}", xxhash_rust::xxh3::xxh3_64(content))
11}
12
13pub fn short_hash(content: &str, length: usize) -> String {
14    let hash = content_hash(content);
15    hash[..length.min(hash.len())].to_string()
16}
17
18/// Compute a structural hash from a tree-sitter AST node.
19/// Strips comments and normalizes whitespace so formatting-only changes
20/// produce the same hash. Uses streaming xxHash64 to avoid intermediate
21/// string allocations.
22pub fn structural_hash(node: Node, source: &[u8]) -> String {
23    let mut hasher = Xxh3::new();
24    hash_structural_tokens(node, source, &mut hasher);
25    format!("{:016x}", hasher.finish())
26}
27
28/// Compute a structural hash that excludes tokens within a given byte range.
29/// Used to strip the entity name from the hash so that renames of otherwise
30/// identical entities produce the same hash, enabling Phase 2 rename detection.
31pub fn structural_hash_excluding_range(
32    node: Node,
33    source: &[u8],
34    exclude_start: usize,
35    exclude_end: usize,
36) -> String {
37    let mut hasher = Xxh3::new();
38    hash_structural_tokens_excluding(node, source, &mut hasher, exclude_start, exclude_end);
39    format!("{:016x}", hasher.finish())
40}
41
42/// Iteratively hash tokens from the AST, skipping comments.
43/// Hashes both node types (structure) and leaf text (content) so that
44/// structurally different ASTs with identical leaf tokens produce different hashes.
45/// Zero allocations: hashes directly from source byte slices.
46fn hash_structural_tokens(root: Node, source: &[u8], hasher: &mut Xxh3) {
47    let mut worklist = vec![root];
48    while let Some(node) = worklist.pop() {
49        let kind = node.kind();
50
51        if is_comment_node(kind) {
52            continue;
53        }
54
55        if node.child_count() == 0 {
56            // Leaf node: hash its text directly from the source buffer
57            let start = node.start_byte();
58            let end = node.end_byte();
59            if start < end && end <= source.len() {
60                let bytes = &source[start..end];
61                // Trim whitespace manually to avoid allocation
62                let trimmed = trim_bytes(bytes);
63                if !trimmed.is_empty() {
64                    hasher.write(trimmed);
65                    hasher.write(b" ");
66                }
67            }
68        } else {
69            // Hash the node type to capture structure, not just leaf content.
70            // e.g. `x = foo(bar)` vs `foo(bar) = x` have same leaves but different structure.
71            hasher.write(kind.as_bytes());
72            hasher.write(b":");
73            let mut cursor = node.walk();
74            let children: Vec<_> = node.children(&mut cursor).collect();
75            for child in children.into_iter().rev() {
76                worklist.push(child);
77            }
78        }
79    }
80}
81
82/// Like `hash_structural_tokens` but skips any leaf node whose byte range
83/// overlaps the excluded range (the entity name).
84fn hash_structural_tokens_excluding(
85    root: Node,
86    source: &[u8],
87    hasher: &mut Xxh3,
88    exclude_start: usize,
89    exclude_end: usize,
90) {
91    let mut worklist = vec![root];
92    while let Some(node) = worklist.pop() {
93        let kind = node.kind();
94
95        if is_comment_node(kind) {
96            continue;
97        }
98
99        if node.child_count() == 0 {
100            let start = node.start_byte();
101            let end = node.end_byte();
102            // Skip leaf nodes that overlap the excluded range
103            if start < exclude_end && end > exclude_start {
104                continue;
105            }
106            if start < end && end <= source.len() {
107                let bytes = &source[start..end];
108                let trimmed = trim_bytes(bytes);
109                if !trimmed.is_empty() {
110                    hasher.write(trimmed);
111                    hasher.write(b" ");
112                }
113            }
114        } else {
115            hasher.write(kind.as_bytes());
116            hasher.write(b":");
117            let mut cursor = node.walk();
118            let children: Vec<_> = node.children(&mut cursor).collect();
119            for child in children.into_iter().rev() {
120                worklist.push(child);
121            }
122        }
123    }
124}
125
126/// Trim leading/trailing ASCII whitespace from a byte slice without allocating.
127#[inline]
128fn trim_bytes(bytes: &[u8]) -> &[u8] {
129    let start = bytes.iter().position(|b| !b.is_ascii_whitespace()).unwrap_or(bytes.len());
130    let end = bytes.iter().rposition(|b| !b.is_ascii_whitespace()).map_or(start, |p| p + 1);
131    &bytes[start..end]
132}
133
134fn is_comment_node(kind: &str) -> bool {
135    matches!(
136        kind,
137        "comment" | "line_comment" | "block_comment" | "doc_comment" | "tag_comment"
138    )
139}
140
141#[cfg(test)]
142mod tests {
143    use super::*;
144
145    #[test]
146    fn test_content_hash_deterministic() {
147        let h1 = content_hash("hello world");
148        let h2 = content_hash("hello world");
149        assert_eq!(h1, h2);
150    }
151
152    #[test]
153    fn test_content_hash_hex_format() {
154        let h = content_hash("test");
155        assert_eq!(h.len(), 16); // xxHash64 = 8 bytes = 16 hex chars
156        assert!(h.chars().all(|c| c.is_ascii_hexdigit()));
157    }
158
159    #[test]
160    fn test_content_hash_bytes_matches_string_hash() {
161        assert_eq!(content_hash_bytes(b"test"), content_hash("test"));
162    }
163
164    #[test]
165    fn test_short_hash() {
166        let h = short_hash("test", 8);
167        assert_eq!(h.len(), 8);
168    }
169}