use std::collections::HashSet;
use std::fmt::Write as _;
use sha2::{Digest, Sha256};
use tree_sitter::{Node, Parser, Tree};
const SHINGLE_N: usize = 5;
const W_AST: f64 = 0.40;
const W_CFG: f64 = 0.25;
const W_CALL_SEQ: f64 = 0.20;
const W_SHINGLE: f64 = 0.15;
#[derive(Debug, Clone)]
pub struct Fingerprint {
pub ast_hash: String,
pub cfg_hash: String,
pub call_seq_hash: String,
pub shingles: Vec<u32>,
pub body_tokens: usize,
pub source_hash: String,
}
impl Fingerprint {
pub fn shingles_to_string(&self) -> String {
let mut s = String::with_capacity(self.shingles.len() * 9);
for (i, h) in self.shingles.iter().enumerate() {
if i > 0 {
s.push(',');
}
let _ = write!(s, "{h:08x}");
}
s
}
pub fn shingles_from_string(s: &str) -> Vec<u32> {
if s.is_empty() {
return Vec::new();
}
s.split(',')
.filter_map(|hex| u32::from_str_radix(hex, 16).ok())
.collect()
}
}
pub fn compute_fingerprint(full_source: &str, body_node: Node<'_>) -> Fingerprint {
let body_text = body_node
.utf8_text(full_source.as_bytes())
.unwrap_or_default();
let body_tokens = tokenize(body_text);
Fingerprint {
ast_hash: hash_kind_walk(body_node, false),
cfg_hash: hash_kind_walk(body_node, true),
call_seq_hash: hash_call_sequence(body_node, full_source.as_bytes()),
shingles: compute_shingles(&body_tokens),
body_tokens: body_tokens.len(),
source_hash: short_sha256(body_text),
}
}
pub fn parse_file(source: &str, language: &tree_sitter::Language) -> Option<Tree> {
let mut parser = Parser::new();
parser.set_language(language).ok()?;
parser.parse(source, None)
}
pub fn find_node_at_lines<'tree>(
tree: &'tree Tree,
start_line_zero_indexed: u32,
end_line_zero_indexed: u32,
) -> Option<Node<'tree>> {
let root = tree.root_node();
let mut best: Option<Node<'tree>> = None;
let mut stack = vec![root];
while let Some(node) = stack.pop() {
let ns = node.start_position().row as u32;
let ne = node.end_position().row as u32;
if ns <= start_line_zero_indexed && ne >= end_line_zero_indexed {
if let Some(b) = best {
let b_span = b.end_position().row - b.start_position().row;
let n_span = ne - ns;
if n_span < u32::try_from(b_span).unwrap_or(u32::MAX) {
best = Some(node);
}
} else {
best = Some(node);
}
let mut cursor = node.walk();
if cursor.goto_first_child() {
loop {
stack.push(cursor.node());
if !cursor.goto_next_sibling() {
break;
}
}
}
}
}
best
}
fn tokenize(body: &str) -> Vec<&str> {
let bytes = body.as_bytes();
let mut tokens: Vec<&str> = Vec::new();
let mut i = 0usize;
while i < bytes.len() {
let b = bytes[i];
if b.is_ascii_alphanumeric() || b == b'_' {
let start = i;
while i < bytes.len() {
let bb = bytes[i];
if bb.is_ascii_alphanumeric() || bb == b'_' {
i += 1;
} else {
break;
}
}
tokens.push(&body[start..i]);
} else {
i += 1;
}
}
tokens
}
fn hash_kind_walk(root: Node<'_>, control_flow_only: bool) -> String {
let mut hasher = Sha256::new();
let mut stack: Vec<(Node<'_>, u32)> = vec![(root, 0)];
while let Some((node, depth)) = stack.pop() {
let kind = node.kind();
let emit = if control_flow_only {
is_control_flow_kind(kind)
} else {
true
};
if emit {
hasher.update(kind.as_bytes());
hasher.update([0x1f]);
hasher.update(depth.to_le_bytes());
hasher.update([0x1e]);
}
let mut cursor = node.walk();
if cursor.goto_first_child() {
let mut children: Vec<Node<'_>> = Vec::new();
loop {
children.push(cursor.node());
if !cursor.goto_next_sibling() {
break;
}
}
for child in children.into_iter().rev() {
stack.push((child, depth + 1));
}
}
}
short_hex(hasher.finalize().as_slice())
}
fn is_control_flow_kind(kind: &str) -> bool {
const MARKERS: [&str; 12] = [
"if", "for", "while", "loop", "switch", "case", "match", "return", "break", "continue",
"try", "catch",
];
MARKERS.iter().any(|m| kind.contains(m))
}
fn hash_call_sequence(root: Node<'_>, source: &[u8]) -> String {
let mut calls: Vec<String> = Vec::new();
let mut stack: Vec<Node<'_>> = vec![root];
while let Some(node) = stack.pop() {
let kind = node.kind();
if is_call_kind(kind) {
if let Some(name) = leftmost_callable_name(node, source) {
calls.push(name);
}
}
let mut cursor = node.walk();
if cursor.goto_first_child() {
let mut children: Vec<Node<'_>> = Vec::new();
loop {
children.push(cursor.node());
if !cursor.goto_next_sibling() {
break;
}
}
for child in children.into_iter().rev() {
stack.push(child);
}
}
}
let mut hasher = Sha256::new();
for name in &calls {
hasher.update(name.as_bytes());
hasher.update([0x1f]);
}
short_hex(hasher.finalize().as_slice())
}
fn is_call_kind(kind: &str) -> bool {
const MARKERS: [&str; 4] = ["call", "invocation", "macro", "apply"];
MARKERS.iter().any(|m| kind.contains(m))
}
fn leftmost_callable_name(node: Node<'_>, source: &[u8]) -> Option<String> {
let mut cursor = node.walk();
if !cursor.goto_first_child() {
return None;
}
loop {
let child = cursor.node();
let kind = child.kind();
if kind == "identifier"
|| kind == "field_identifier"
|| kind == "property_identifier"
|| kind == "scoped_identifier"
{
return child.utf8_text(source).ok().map(str::to_string);
}
if kind.contains("field_expression")
|| kind.contains("member_expression")
|| kind.contains("scoped")
{
let mut inner = child.walk();
if inner.goto_first_child() {
let mut last_id: Option<String> = None;
loop {
let ic = inner.node();
let ik = ic.kind();
if ik.contains("identifier") {
if let Ok(t) = ic.utf8_text(source) {
last_id = Some(t.to_string());
}
}
if !inner.goto_next_sibling() {
break;
}
}
if last_id.is_some() {
return last_id;
}
}
}
if !cursor.goto_next_sibling() {
break;
}
}
None
}
fn compute_shingles(tokens: &[&str]) -> Vec<u32> {
if tokens.len() < SHINGLE_N {
return Vec::new();
}
let mut set: HashSet<u32> = HashSet::new();
for window in tokens.windows(SHINGLE_N) {
let mut hasher = Sha256::new();
for tok in window {
hasher.update(tok.as_bytes());
hasher.update([0x1f]);
}
let digest = hasher.finalize();
let mut acc: u32 = 0;
for chunk in digest.chunks(4) {
let mut b = [0u8; 4];
for (i, v) in chunk.iter().enumerate() {
b[i] = *v;
}
acc ^= u32::from_le_bytes(b);
}
set.insert(acc);
}
let mut out: Vec<u32> = set.into_iter().collect();
out.sort_unstable();
out
}
pub fn jaccard_similarity(a: &[u32], b: &[u32]) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
if a.is_empty() || b.is_empty() {
return 0.0;
}
let (mut i, mut j) = (0usize, 0usize);
let mut inter = 0usize;
while i < a.len() && j < b.len() {
match a[i].cmp(&b[j]) {
std::cmp::Ordering::Equal => {
inter += 1;
i += 1;
j += 1;
}
std::cmp::Ordering::Less => i += 1,
std::cmp::Ordering::Greater => j += 1,
}
}
let union = a.len() + b.len() - inter;
if union == 0 {
return 1.0;
}
inter as f64 / union as f64
}
pub fn composite_similarity(a: &Fingerprint, b: &Fingerprint) -> f64 {
let ast = if a.ast_hash == b.ast_hash { 1.0 } else { 0.0 };
let cfg = if a.cfg_hash == b.cfg_hash { 1.0 } else { 0.0 };
let call = if a.call_seq_hash == b.call_seq_hash {
1.0
} else {
0.0
};
let shingle = jaccard_similarity(&a.shingles, &b.shingles);
W_AST * ast + W_CFG * cfg + W_CALL_SEQ * call + W_SHINGLE * shingle
}
pub fn overlap_kind(a: &Fingerprint, b: &Fingerprint) -> &'static str {
if a.ast_hash == b.ast_hash {
"ast_isomorphic"
} else if a.cfg_hash == b.cfg_hash {
"control_flow"
} else if a.call_seq_hash == b.call_seq_hash {
"algorithmic"
} else if jaccard_similarity(&a.shingles, &b.shingles) >= 0.5 {
"token_overlap"
} else {
"naming"
}
}
pub fn severity_bucket(score: f64, kind: &str) -> &'static str {
if kind == "ast_isomorphic" && score >= 0.80 {
"definite"
} else if kind == "naming" {
"naming_only"
} else if score >= 0.55 {
"likely"
} else {
"naming_only"
}
}
fn short_hex(bytes: &[u8]) -> String {
let mut s = String::with_capacity(16);
for b in bytes.iter().take(8) {
let _ = write!(s, "{b:02x}");
}
s
}
fn short_sha256(s: &str) -> String {
let mut h = Sha256::new();
h.update(s.as_bytes());
short_hex(h.finalize().as_slice())
}
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
use super::*;
fn fingerprint_for_rust_fn(snippet: &str) -> Fingerprint {
let lang = crate::extraction::ts_provider::language("rust");
let tree = parse_file(snippet, &lang).expect("parse failed");
let root = tree.root_node();
let fn_node = find_first_kind(root, "function_item").expect("no function in snippet");
compute_fingerprint(snippet, fn_node)
}
fn find_first_kind<'t>(root: Node<'t>, target: &str) -> Option<Node<'t>> {
let mut stack = vec![root];
while let Some(n) = stack.pop() {
if n.kind() == target {
return Some(n);
}
let mut cursor = n.walk();
if cursor.goto_first_child() {
loop {
stack.push(cursor.node());
if !cursor.goto_next_sibling() {
break;
}
}
}
}
None
}
#[test]
fn identical_functions_have_identical_ast_hash() {
let a =
fingerprint_for_rust_fn("fn a(x: i32) -> i32 { if x > 0 { x + 1 } else { x - 1 } }");
let b =
fingerprint_for_rust_fn("fn b(y: i32) -> i32 { if y > 0 { y + 1 } else { y - 1 } }");
assert_eq!(
a.ast_hash, b.ast_hash,
"renamed identifiers must not change AST hash"
);
let score = composite_similarity(&a, &b);
assert!(score >= 0.85, "expected >= 0.85, got {score}");
assert_eq!(overlap_kind(&a, &b), "ast_isomorphic");
assert_eq!(severity_bucket(score, "ast_isomorphic"), "definite");
}
#[test]
fn different_structure_produces_different_ast_hash() {
let a = fingerprint_for_rust_fn("fn a(x: i32) -> i32 { x + 1 }");
let b =
fingerprint_for_rust_fn("fn b(x: i32) -> i32 { if x > 0 { x + 1 } else { x - 1 } }");
assert_ne!(a.ast_hash, b.ast_hash);
assert_ne!(a.cfg_hash, b.cfg_hash);
}
#[test]
fn cfg_hash_matches_under_renaming_and_inline_changes() {
let a = fingerprint_for_rust_fn(
"fn a(x: i32) -> i32 { if x > 0 { return 1; } else { return 2; } }",
);
let b = fingerprint_for_rust_fn(
"fn b(x: i32) -> i32 { if x > 0 { return 99; } else { return 100; } }",
);
assert_eq!(a.cfg_hash, b.cfg_hash);
}
#[test]
fn jaccard_self_similarity_is_one() {
let a = fingerprint_for_rust_fn(
"fn a() { let x = 1; let y = 2; let z = x + y; println!(\"{}\", z); }",
);
assert!((jaccard_similarity(&a.shingles, &a.shingles) - 1.0).abs() < 1e-9);
}
#[test]
fn jaccard_disjoint_is_zero() {
let a = fingerprint_for_rust_fn(
"fn a() { let aaaa = 1; let bbbb = 2; let cccc = 3; let dddd = 4; let eeee = 5; }",
);
let b = fingerprint_for_rust_fn(
"fn b() { let zzzz = 9; let yyyy = 8; let xxxx = 7; let wwww = 6; let vvvv = 5; }",
);
let j = jaccard_similarity(&a.shingles, &b.shingles);
assert!(j < 0.4, "expected low Jaccard, got {j}");
}
#[test]
fn shingles_roundtrip_through_string_format() {
let original: Vec<u32> = vec![1, 2, 0xdead_beef, 0xffff_ffff];
let fp = Fingerprint {
ast_hash: "x".into(),
cfg_hash: "x".into(),
call_seq_hash: "x".into(),
shingles: original.clone(),
body_tokens: 0,
source_hash: "x".into(),
};
let s = fp.shingles_to_string();
let parsed = Fingerprint::shingles_from_string(&s);
assert_eq!(parsed, original);
}
#[test]
fn call_sequence_captures_order() {
let a = fingerprint_for_rust_fn("fn a() { foo(); bar(); baz(); }");
let b = fingerprint_for_rust_fn("fn b() { foo(); bar(); baz(); }");
let c = fingerprint_for_rust_fn("fn c() { baz(); bar(); foo(); }");
assert_eq!(a.call_seq_hash, b.call_seq_hash);
assert_ne!(a.call_seq_hash, c.call_seq_hash);
}
#[test]
fn severity_naming_only_for_low_score() {
assert_eq!(severity_bucket(0.10, "naming"), "naming_only");
assert_eq!(severity_bucket(0.30, "token_overlap"), "naming_only");
assert_eq!(severity_bucket(0.60, "control_flow"), "likely");
}
}