use serde::{Deserialize, Serialize};
use std::collections::HashSet;
/// Canonical identity of one bypass observation, as built by [`fingerprint`].
///
/// Two observations share a fingerprint when they have the same rule id, the
/// same canonical encoding chain and the same payload skeleton.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct BypassFingerprint {
/// 64-bit hash folded over `rule_id`, every `chain_shape` entry and
/// `skeleton`, in that order, with separator bytes between the parts.
pub hash: u64,
/// Rule identifier, copied verbatim from the caller.
pub rule_id: String,
/// Encoding chain after canonicalization: each entry cut at its first `(`,
/// trimmed, and dropped if that leaves it empty. Order is preserved.
pub chain_shape: Vec<String>,
/// Payload with whitespace runs collapsed and non-keyword words / digit
/// runs replaced by the `<W>` / `<D>` placeholders.
pub skeleton: String,
}
/// Builds the [`BypassFingerprint`] for a bypass of `rule_id` that was reached
/// through `encoding_chain` with `payload`.
///
/// The payload is reduced to its skeleton and the chain to its canonical shape
/// before hashing, so observations that differ only in identifiers, literal
/// numbers, whitespace amounts or encoder parameters hash identically.
///
/// Hash input layout (fed through the `wafrift_types::hash` helpers):
/// `rule_id 0x1F (chain_entry 0x1E)* 0x1F skeleton`.
#[must_use]
pub fn fingerprint(rule_id: &str, encoding_chain: &[String], payload: &str) -> BypassFingerprint {
    let chain_shape = canonicalize_chain(encoding_chain);
    let skeleton = skeletonize(payload);

    let mut digest = FNV_OFFSET;

    // Field 1: the rule id, closed by 0x1F (ASCII unit separator).
    fnv1a_bytes(&mut digest, rule_id.as_bytes());
    digest = fnv1a_byte(digest, 0x1F);

    // Field 2: every chain entry, each closed by 0x1E (ASCII record
    // separator) so that ["ab", "c"] and ["a", "bc"] hash differently.
    for transform in &chain_shape {
        fnv1a_bytes(&mut digest, transform.as_bytes());
        digest = fnv1a_byte(digest, 0x1E);
    }
    digest = fnv1a_byte(digest, 0x1F);

    // Field 3: the payload skeleton.
    fnv1a_bytes(&mut digest, skeleton.as_bytes());

    BypassFingerprint {
        hash: digest,
        rule_id: String::from(rule_id),
        chain_shape,
        skeleton,
    }
}
/// Reduces an encoding chain to the bare encoder names.
///
/// For every entry, everything from the first `(` onwards is discarded (so
/// `url(double)` and `url` are the same encoder), surrounding whitespace is
/// trimmed, and entries that end up empty are skipped. Order is preserved.
fn canonicalize_chain(chain: &[String]) -> Vec<String> {
    let mut shape = Vec::new();
    for entry in chain {
        // Keep only the text before the parameter list, if there is one.
        let name = match entry.find('(') {
            Some(paren) => &entry[..paren],
            None => entry.as_str(),
        };
        let name = name.trim();
        if !name.is_empty() {
            shape.push(name.to_owned());
        }
    }
    shape
}
/// Reduces a payload to its structural skeleton.
///
/// Rules, applied left to right:
/// * a run of whitespace becomes a single `' '`;
/// * a run of ASCII alphanumerics becomes
///   - the lowercased word if it is one of [`KNOWN_KEYWORDS`],
///   - `<D>` if it consists only of ASCII digits,
///   - `<W>` otherwise;
/// * `word_word` runs are emitted as a single keyword when the lowercased
///   joined text is itself a keyword (this is what makes `load_file`
///   matchable); any other underscore is treated as punctuation, so
///   `__class__` still yields `__class__` and `foo_bar` yields `<W>_<W>`;
/// * every other character is copied through unchanged.
///
/// Leading and trailing whitespace is trimmed from the result.
fn skeletonize(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len());
    let mut chars = payload.chars().peekable();
    while let Some(&c) = chars.peek() {
        if c.is_whitespace() {
            out.push(' ');
            // Swallow the whole whitespace run.
            while chars.next_if(|w| w.is_whitespace()).is_some() {}
        } else if c.is_ascii_alphanumeric() {
            let mut word = String::new();
            while let Some(p) = chars.next_if(|p| p.is_ascii_alphanumeric()) {
                word.push(p);
            }
            let lower = word.to_ascii_lowercase();
            // Longest match first: an underscore-joined keyword wins over
            // classifying the leading word on its own.
            if let Some(compound) = take_compound_keyword(&mut chars, &lower) {
                out.push_str(compound);
            } else if is_known_keyword(&lower) {
                out.push_str(&lower);
            } else if word.chars().all(|d| d.is_ascii_digit()) {
                out.push_str("<D>");
            } else {
                out.push_str("<W>");
            }
        } else {
            out.push(c);
            chars.next();
        }
    }
    out.trim().to_string()
}

/// Tries to extend the already-consumed word `first` (lowercased) across
/// single underscores into a keyword such as `load_file`.
///
/// On success the joined keyword is returned and `chars` is advanced past it.
/// On failure `chars` is left untouched and `None` is returned, so the caller
/// falls back to classifying `first` alone.
fn take_compound_keyword(
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
    first: &str,
) -> Option<&'static str> {
    // Fast path: the overwhelmingly common case is "no underscore follows".
    if chars.peek() != Some(&'_') {
        return None;
    }

    let mut probe = chars.clone();
    let mut candidate = String::from(first);
    let mut best = None;

    while probe.next_if_eq(&'_').is_some() {
        candidate.push('_');
        // Prune: stop as soon as no keyword can start with what we have.
        // This also bounds the lookahead, keeping the scan linear overall.
        if !KNOWN_KEYWORDS
            .iter()
            .any(|k| k.starts_with(candidate.as_str()))
        {
            break;
        }

        let joined_from = candidate.len();
        while let Some(p) = probe.next_if(|p| p.is_ascii_alphanumeric()) {
            candidate.push(p.to_ascii_lowercase());
        }
        if candidate.len() == joined_from {
            // `_` not followed by a word (e.g. `load__file`): not a join.
            break;
        }

        if let Some(keyword) = KNOWN_KEYWORDS
            .iter()
            .copied()
            .find(|k| *k == candidate.as_str())
        {
            best = Some(keyword);
            // Commit what has been matched so far; a longer attempt that
            // fails later must not roll this back.
            *chars = probe.clone();
        }
    }
    best
}

/// Tokens that survive skeletonization verbatim (lowercased).
///
/// Entries must be lowercase ASCII: lookups are done against the lowercased
/// token. Entries containing `_` are matched through the compound-keyword
/// lookahead in `skeletonize`. The grouping comments below are informal.
const KNOWN_KEYWORDS: &[&str] = &[
    // SQL-style tokens
    "select",
    "union",
    "where",
    "from",
    "or",
    "and",
    "drop",
    "insert",
    "update",
    "delete",
    "exec",
    "execute",
    "load_file",
    "into",
    "outfile",
    "concat",
    "sleep",
    "benchmark",
    "version",
    "user",
    "true",
    "false",
    "null",
    "limit",
    "having",
    "group",
    "by",
    "order",
    // HTML / JavaScript tokens
    "script",
    "alert",
    "prompt",
    "confirm",
    "onerror",
    "onload",
    "onclick",
    "onfocus",
    "img",
    "svg",
    "iframe",
    "onmouseover",
    "onkeyup",
    "javascript",
    "eval",
    "fromcharcode",
    // Shell command names
    "cat",
    "ls",
    "id",
    "whoami",
    "curl",
    "wget",
    "nc",
    "bash",
    "sh",
    "ping",
    "nslookup",
    // Lookup-injection tokens
    "jndi",
    "ldap",
    "rmi",
    "dns",
    "log4j",
    // Filesystem path components
    "etc",
    "passwd",
    "windows",
    "system32",
    "boot",
    // Object-introspection tokens
    "class",
    "mro",
    "subclasses",
    "globals",
    "import",
    "getattr",
    // URL schemes
    "http",
    "https",
    "file",
    "data",
];

/// Returns `true` when `token` (expected to be already lowercased) is listed
/// in [`KNOWN_KEYWORDS`].
fn is_known_keyword(token: &str) -> bool {
    KNOWN_KEYWORDS.contains(&token)
}
use wafrift_types::hash::{
FNV_OFFSET_64 as FNV_OFFSET, fnv1a_64_extend as fnv1a_bytes, fnv1a_64_step as fnv1a_byte,
};
/// Set of fingerprint hashes that have already been recorded, grouped by the
/// rule id they belong to.
///
/// The archive serializes to JSON; see [`H1Archive::load_or_default`] and
/// [`H1Archive::save_atomic`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct H1Archive {
    /// Rule id → hashes recorded for that rule.
    pub reports: std::collections::BTreeMap<String, HashSet<u64>>,
}

impl H1Archive {
    /// Creates an empty archive.
    #[must_use]
    pub fn new() -> Self {
        Self {
            reports: std::collections::BTreeMap::new(),
        }
    }

    /// Records `fp` under its rule id. Recording the same fingerprint again
    /// is a no-op.
    pub fn add_report(&mut self, fp: &BypassFingerprint) {
        let bucket = self.reports.entry(fp.rule_id.clone()).or_default();
        bucket.insert(fp.hash);
    }

    /// Returns `true` when `fp`'s hash has been recorded under `fp`'s rule id.
    #[must_use]
    pub fn contains(&self, fp: &BypassFingerprint) -> bool {
        match self.reports.get(&fp.rule_id) {
            Some(hashes) => hashes.contains(&fp.hash),
            None => false,
        }
    }

    /// Total number of recorded hashes across all rule ids.
    #[must_use]
    pub fn total_reports(&self) -> usize {
        self.reports.values().map(|hashes| hashes.len()).sum()
    }

    /// Number of distinct rule ids present in the archive.
    #[must_use]
    pub fn rules_seen(&self) -> usize {
        self.reports.len()
    }

    /// Loads an archive from the JSON file at `path`.
    ///
    /// This never fails: if the file cannot be read (within the 64 MiB cap
    /// passed to the reader) or does not parse as an archive, an empty
    /// archive is returned instead.
    #[must_use]
    pub fn load_or_default(path: &std::path::Path) -> Self {
        const H1_ARCHIVE_MAX_BYTES: usize = 64 * 1024 * 1024;
        match crate::safe_io::read_capped_text(path, H1_ARCHIVE_MAX_BYTES) {
            Ok(raw) => serde_json::from_str(&raw).unwrap_or_default(),
            Err(_) => Self::default(),
        }
    }

    /// Serializes the archive as pretty-printed JSON and hands it to
    /// `wafrift_types::loaders::write_atomic` for writing to `path`.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidData` I/O error if serialization fails, or whatever
    /// error the writer reports.
    pub fn save_atomic(&self, path: &std::path::Path) -> std::io::Result<()> {
        let body = match serde_json::to_vec_pretty(self) {
            Ok(bytes) => bytes,
            Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
        };
        wafrift_types::loaders::write_atomic(path, &body)
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for fingerprint canonicalization and the `H1Archive` store.

    use super::*;

    // ---- fingerprint: equivalence classes ---------------------------------

    #[test]
    fn same_payload_same_fingerprint() {
        let a = fingerprint("942100", &["url".into()], "' OR 1=1--");
        let b = fingerprint("942100", &["url".into()], "' OR 1=1--");
        assert_eq!(a.hash, b.hash);
    }

    #[test]
    fn different_identifier_same_fingerprint() {
        // Non-keyword identifiers all collapse to `<W>`.
        let a = fingerprint("942100", &["url".into()], "' OR x=x--");
        let b = fingerprint("942100", &["url".into()], "' OR foo=foo--");
        assert_eq!(a.hash, b.hash);
        assert_eq!(a.skeleton, b.skeleton);
    }

    #[test]
    fn keyword_preserved_in_skeleton() {
        let fp = fingerprint("942100", &[], "' OR 1=1--");
        assert!(fp.skeleton.contains("or"));
        assert!(fp.skeleton.contains("<D>"));
    }

    #[test]
    fn different_rule_different_fingerprint() {
        let a = fingerprint("942100", &["url".into()], "' OR 1=1--");
        let b = fingerprint("942110", &["url".into()], "' OR 1=1--");
        assert_ne!(a.hash, b.hash);
    }

    #[test]
    fn different_chain_different_fingerprint() {
        let a = fingerprint("942100", &["url".into()], "' OR 1=1--");
        let b = fingerprint("942100", &["unicode".into()], "' OR 1=1--");
        assert_ne!(a.hash, b.hash);
    }

    #[test]
    fn chain_order_distinguished() {
        let a = fingerprint("942100", &["url".into(), "unicode".into()], "x");
        let b = fingerprint("942100", &["unicode".into(), "url".into()], "x");
        assert_ne!(a.hash, b.hash);
    }

    #[test]
    fn chain_param_collapsed() {
        // Encoder parameters are not part of the chain shape.
        let a = fingerprint("942100", &["url(double)".into()], "x");
        let b = fingerprint("942100", &["url(triple)".into()], "x");
        assert_eq!(a.hash, b.hash);
        assert_eq!(a.chain_shape, b.chain_shape);
    }

    #[test]
    fn chain_param_equivalent_to_classless() {
        // A parameterised entry canonicalizes to the same shape as the bare
        // encoder name, so the two must hash identically.
        let a = fingerprint("942100", &["url".into()], "x");
        let b = fingerprint("942100", &["url(double)".into()], "x");
        assert_eq!(a.hash, b.hash);
    }

    #[test]
    fn keyword_case_folded() {
        let a = fingerprint("R", &[], "OR");
        let b = fingerprint("R", &[], "or");
        assert_eq!(a.hash, b.hash);
    }

    #[test]
    fn digit_run_distinct_from_word() {
        let a = fingerprint("R", &[], "abc");
        let b = fingerprint("R", &[], "123");
        assert_ne!(a.hash, b.hash);
        assert!(a.skeleton.contains("<W>"));
        assert!(b.skeleton.contains("<D>"));
    }

    #[test]
    fn whitespace_collapsed() {
        // One space, a run of spaces and a run of tabs must all be equal.
        let a = fingerprint("R", &[], "a b");
        let b = fingerprint("R", &[], "a   b");
        let c = fingerprint("R", &[], "a\t\tb");
        assert_eq!(a.hash, b.hash);
        assert_eq!(a.hash, c.hash);
    }

    #[test]
    fn punctuation_preserved() {
        let a = fingerprint("R", &[], "x;y");
        let b = fingerprint("R", &[], "x|y");
        assert_ne!(a.hash, b.hash);
    }

    // ---- fingerprint: edge cases ------------------------------------------

    #[test]
    fn empty_chain_handled() {
        let _ = fingerprint("R", &[], "x");
    }

    #[test]
    fn empty_chain_entries_filtered() {
        let a = fingerprint("R", &["url".into(), "".into(), "unicode".into()], "x");
        let b = fingerprint("R", &["url".into(), "unicode".into()], "x");
        assert_eq!(a.chain_shape, b.chain_shape);
        assert_eq!(a.hash, b.hash);
    }

    #[test]
    fn empty_payload_handled() {
        let fp = fingerprint("R", &["url".into()], "");
        assert_eq!(fp.skeleton, "");
    }

    #[test]
    fn unicode_in_payload_no_panic() {
        let fp = fingerprint("R", &[], "SELECT 中 \u{200B}");
        assert!(!fp.skeleton.is_empty());
    }

    // ---- H1Archive ---------------------------------------------------------

    #[test]
    fn h1_archive_contains_after_add() {
        let mut a = H1Archive::new();
        let fp = fingerprint("R", &["url".into()], "x");
        a.add_report(&fp);
        assert!(a.contains(&fp));
    }

    #[test]
    fn h1_archive_distinguishes_rule_ids() {
        let mut a = H1Archive::new();
        let fp_r1 = fingerprint("R1", &["url".into()], "x");
        a.add_report(&fp_r1);
        let fp_r2 = fingerprint("R2", &["url".into()], "x");
        assert!(!a.contains(&fp_r2));
    }

    #[test]
    fn h1_archive_load_default_on_missing_path() {
        let p = std::path::Path::new("/nonexistent/zzzz.json");
        let a = H1Archive::load_or_default(p);
        assert_eq!(a.total_reports(), 0);
        assert_eq!(a.rules_seen(), 0);
    }

    #[test]
    fn h1_archive_save_load_round_trip() {
        let dir = std::env::temp_dir().join(format!("wafrift-h1-test-{}", std::process::id()));
        let _ = std::fs::create_dir_all(&dir);
        let path = dir.join("h1.json");
        let mut a = H1Archive::new();
        for i in 0..5 {
            let fp = fingerprint(&format!("R{i}"), &["url".into()], "x");
            a.add_report(&fp);
        }
        a.save_atomic(&path).expect("save");
        let b = H1Archive::load_or_default(&path);
        assert_eq!(a.total_reports(), b.total_reports());
        // Counts alone would not catch a rule id or hash being altered.
        assert_eq!(a.reports, b.reports);
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn h1_archive_corrupted_returns_default() {
        let dir = std::env::temp_dir().join(format!("wafrift-h1-corr-{}", std::process::id()));
        let _ = std::fs::create_dir_all(&dir);
        let path = dir.join("h1.json");
        std::fs::write(&path, b"{not json").expect("write");
        let a = H1Archive::load_or_default(&path);
        assert_eq!(a.total_reports(), 0);
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn duplicate_add_is_idempotent() {
        let mut a = H1Archive::new();
        let fp = fingerprint("R", &["url".into()], "x");
        a.add_report(&fp);
        a.add_report(&fp);
        a.add_report(&fp);
        assert_eq!(a.total_reports(), 1);
    }

    #[test]
    fn rules_seen_counts_distinct_keys() {
        let mut a = H1Archive::new();
        a.add_report(&fingerprint("R1", &[], "' OR 1=1--"));
        a.add_report(&fingerprint("R1", &[], "; sleep 5"));
        a.add_report(&fingerprint("R2", &[], "<script>alert(1)</script>"));
        assert_eq!(a.rules_seen(), 2);
        assert_eq!(a.total_reports(), 3);
    }

    // ---- serialization, determinism, scale --------------------------------

    #[test]
    fn fingerprint_serializes_round_trip() {
        let fp = fingerprint("R", &["url".into()], "x");
        let json = serde_json::to_string(&fp).expect("ser");
        let back: BypassFingerprint = serde_json::from_str(&json).expect("de");
        assert_eq!(fp.hash, back.hash);
        assert_eq!(fp.rule_id, back.rule_id);
        assert_eq!(fp.chain_shape, back.chain_shape);
        assert_eq!(fp.skeleton, back.skeleton);
    }

    #[test]
    fn fnv1a_deterministic() {
        let a = fingerprint("R", &["url".into()], "x");
        let b = fingerprint("R", &["url".into()], "x");
        assert_eq!(a.hash, b.hash);
    }

    #[test]
    fn long_payload_no_panic() {
        let big = "A".repeat(100_000);
        let fp = fingerprint("R", &[], &big);
        assert_eq!(fp.skeleton, "<W>");
    }

    #[test]
    fn long_chain_no_panic() {
        let chain: Vec<String> = (0..1000).map(|i| format!("t{i}")).collect();
        let _ = fingerprint("R", &chain, "x");
    }

    #[test]
    fn sql_classic_bypass_canonical_form() {
        // Different digits collapse together; digits vs. words do not.
        let a = fingerprint("942100", &["url".into()], "' OR 1=1--");
        let b = fingerprint("942100", &["url".into()], "' OR 5=5--");
        let c = fingerprint("942100", &["url".into()], "' OR aaa=aaa--");
        assert_eq!(a.hash, b.hash);
        assert_ne!(a.hash, c.hash);
    }

    // ---- keyword table invariants -----------------------------------------

    #[test]
    fn keyword_set_minimum_count() {
        assert!(KNOWN_KEYWORDS.len() >= 30);
    }

    #[test]
    fn keyword_set_lowercase() {
        for k in KNOWN_KEYWORDS {
            assert!(
                k.chars()
                    .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_'),
                "keyword must be lowercase ASCII alnum (underscores OK): {k}"
            );
        }
    }
}