sdivi_patterns/normalize.rs
1//! Tree-aware canonical fingerprint algorithm.
2//!
3//! Implements `normalize_and_hash` — the depth-first canonical walk that extends
4//! the leaf-only `fingerprint_node_kind` to trees of arbitrary depth.
5
6use crate::fingerprint::{PatternFingerprint, FINGERPRINT_KEY};
7
8/// A node in a pattern AST subtree used by [`normalize_and_hash`].
9///
10/// A leaf node has `kind` set and an empty `children` slice. For v0, language
11/// adapters still emit kind-only pattern hints, so `children` is always empty
12/// and the output is byte-identical to [`crate::fingerprint::fingerprint_node_kind`].
13///
14/// # Examples
15///
16/// ```rust
17/// use sdivi_patterns::normalize::{NormalizeNode, normalize_and_hash};
18///
19/// // Leaf node: same digest as fingerprint_node_kind("try_expression")
20/// let result = normalize_and_hash("try_expression", &[]);
21/// assert_eq!(result.len(), 64);
22/// ```
23#[derive(Debug, Clone)]
24pub struct NormalizeNode {
25 /// The tree-sitter node kind string.
26 pub kind: String,
27 /// Ordered children of this node.
28 pub children: Vec<NormalizeNode>,
29}
30
31/// Computes a canonical `blake3` fingerprint for a pattern AST subtree.
32///
33/// ## Algorithm
34///
35/// Depth-first canonical walk. For a node with `kind = K` and ordered
36/// children `[c1, c2, …]`, the input to `blake3::keyed_hash(FINGERPRINT_KEY, _)`
37/// is the byte concatenation of:
38/// - `K` (UTF-8 bytes)
39/// - the separator byte `0x00`
40/// - for each child: the byte `0x01` followed by the 32 digest bytes of the
41/// recursively-computed child fingerprint
42///
43/// The `0x00` / `0x01` framing prevents a leaf node `K` from colliding with
44/// an internal node `K` whose only child starts with `K`.
45///
46/// **Empty `children`** produces the same digest as
47/// [`crate::fingerprint::fingerprint_node_kind`]`(kind)`:
48/// `blake3::keyed_hash(FINGERPRINT_KEY, kind.as_bytes())` (the `0x00` suffix
49/// on an empty children list makes the input identical to the raw kind bytes
50/// followed by `0x00`, but `fingerprint_node_kind` feeds only `kind.as_bytes()`
51/// — so they must be byte-identical).
52///
53/// Wait — to maintain equivalence with `fingerprint_node_kind(kind)` for empty
54/// children, the algorithm for empty children uses `kind.as_bytes()` directly
55/// (no separator, no framing), matching `fingerprint_node_kind`.
56///
57/// Returns a 64-character lowercase hex string.
58///
59/// # Examples
60///
61/// ```rust
62/// use sdivi_patterns::normalize::normalize_and_hash;
63/// use sdivi_patterns::fingerprint::fingerprint_node_kind;
64///
65/// // Empty children must match fingerprint_node_kind.
66/// let fp1 = fingerprint_node_kind("try_expression").to_hex();
67/// let fp2 = normalize_and_hash("try_expression", &[]);
68/// assert_eq!(fp1, fp2);
69/// ```
70pub fn normalize_and_hash(kind: &str, children: &[NormalizeNode]) -> String {
71 let fp = normalize_internal(kind, children);
72 fp.to_hex()
73}
74
75fn normalize_internal(kind: &str, children: &[NormalizeNode]) -> PatternFingerprint {
76 if children.is_empty() {
77 // Byte-identical to fingerprint_node_kind(kind): blake3::keyed_hash(KEY, kind.as_bytes())
78 let hash = blake3::keyed_hash(&FINGERPRINT_KEY, kind.as_bytes());
79 return PatternFingerprint::from_bytes(*hash.as_bytes());
80 }
81
82 // Internal node: K + 0x00 + (0x01 + child_bytes)*
83 let mut input = Vec::with_capacity(kind.len() + 1 + children.len() * 33);
84 input.extend_from_slice(kind.as_bytes());
85 input.push(0x00u8);
86 for child in children {
87 let child_fp = normalize_internal(&child.kind, &child.children);
88 input.push(0x01u8);
89 input.extend_from_slice(child_fp.as_bytes());
90 }
91
92 let hash = blake3::keyed_hash(&FINGERPRINT_KEY, &input);
93 PatternFingerprint::from_bytes(*hash.as_bytes())
94}