Skip to main content

sdivi_patterns/
normalize.rs

1//! Tree-aware canonical fingerprint algorithm.
2//!
3//! Implements `normalize_and_hash` — the depth-first canonical walk that extends
4//! the leaf-only `fingerprint_node_kind` to trees of arbitrary depth.
5
6use crate::fingerprint::{PatternFingerprint, FINGERPRINT_KEY};
7
8/// A node in a pattern AST subtree used by [`normalize_and_hash`].
9///
10/// A leaf node has `kind` set and an empty `children` slice.  For v0, language
11/// adapters still emit kind-only pattern hints, so `children` is always empty
12/// and the output is byte-identical to [`crate::fingerprint::fingerprint_node_kind`].
13///
14/// # Examples
15///
16/// ```rust
17/// use sdivi_patterns::normalize::{NormalizeNode, normalize_and_hash};
18///
19/// // Leaf node: same digest as fingerprint_node_kind("try_expression")
20/// let result = normalize_and_hash("try_expression", &[]);
21/// assert_eq!(result.len(), 64);
22/// ```
23#[derive(Debug, Clone)]
24pub struct NormalizeNode {
25    /// The tree-sitter node kind string.
26    pub kind: String,
27    /// Ordered children of this node.
28    pub children: Vec<NormalizeNode>,
29}
30
31/// Computes a canonical `blake3` fingerprint for a pattern AST subtree.
32///
33/// ## Algorithm
34///
35/// Depth-first canonical walk.  For a node with `kind = K` and ordered
36/// children `[c1, c2, …]`, the input to `blake3::keyed_hash(FINGERPRINT_KEY, _)`
37/// is the byte concatenation of:
38/// - `K` (UTF-8 bytes)
39/// - the separator byte `0x00`
40/// - for each child: the byte `0x01` followed by the 32 digest bytes of the
41///   recursively-computed child fingerprint
42///
43/// The `0x00` / `0x01` framing prevents a leaf node `K` from colliding with
44/// an internal node `K` whose only child starts with `K`.
45///
46/// **Empty `children`** produces the same digest as
47/// [`crate::fingerprint::fingerprint_node_kind`]`(kind)`:
48/// `blake3::keyed_hash(FINGERPRINT_KEY, kind.as_bytes())` (the `0x00` suffix
49/// on an empty children list makes the input identical to the raw kind bytes
50/// followed by `0x00`, but `fingerprint_node_kind` feeds only `kind.as_bytes()`
51/// — so they must be byte-identical).
52///
53/// Wait — to maintain equivalence with `fingerprint_node_kind(kind)` for empty
54/// children, the algorithm for empty children uses `kind.as_bytes()` directly
55/// (no separator, no framing), matching `fingerprint_node_kind`.
56///
57/// Returns a 64-character lowercase hex string.
58///
59/// # Examples
60///
61/// ```rust
62/// use sdivi_patterns::normalize::normalize_and_hash;
63/// use sdivi_patterns::fingerprint::fingerprint_node_kind;
64///
65/// // Empty children must match fingerprint_node_kind.
66/// let fp1 = fingerprint_node_kind("try_expression").to_hex();
67/// let fp2 = normalize_and_hash("try_expression", &[]);
68/// assert_eq!(fp1, fp2);
69/// ```
70pub fn normalize_and_hash(kind: &str, children: &[NormalizeNode]) -> String {
71    let fp = normalize_internal(kind, children);
72    fp.to_hex()
73}
74
75fn normalize_internal(kind: &str, children: &[NormalizeNode]) -> PatternFingerprint {
76    if children.is_empty() {
77        // Byte-identical to fingerprint_node_kind(kind): blake3::keyed_hash(KEY, kind.as_bytes())
78        let hash = blake3::keyed_hash(&FINGERPRINT_KEY, kind.as_bytes());
79        return PatternFingerprint::from_bytes(*hash.as_bytes());
80    }
81
82    // Internal node: K + 0x00 + (0x01 + child_bytes)*
83    let mut input = Vec::with_capacity(kind.len() + 1 + children.len() * 33);
84    input.extend_from_slice(kind.as_bytes());
85    input.push(0x00u8);
86    for child in children {
87        let child_fp = normalize_internal(&child.kind, &child.children);
88        input.push(0x01u8);
89        input.extend_from_slice(child_fp.as_bytes());
90    }
91
92    let hash = blake3::keyed_hash(&FINGERPRINT_KEY, &input);
93    PatternFingerprint::from_bytes(*hash.as_bytes())
94}