cloakrs_patterns/
crypto.rs1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7static ETHEREUM_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r"\b0x[0-9A-Fa-f]{40}\b"));
8static BITCOIN_LEGACY_REGEX: Lazy<Regex> =
9 Lazy::new(|| compile_regex(r"\b[13][1-9A-HJ-NP-Za-km-z]{25,34}\b"));
10static BITCOIN_BECH32_REGEX: Lazy<Regex> =
11 Lazy::new(|| compile_regex(r"\b(?:bc1|BC1)[ac-hj-np-zAC-HJ-NP-Z02-9]{11,71}\b"));
12
/// Words passed to `context_boost` when scoring a match; see `compute_confidence`.
const CONTEXT_WORDS: &[&str] = &[
    "wallet",
    "crypto",
    "bitcoin",
    "btc",
    "ethereum",
    "eth",
    "address",
    "deposit",
];
16
/// Stateless recognizer for cryptocurrency wallet addresses.
///
/// It is a unit struct: all matching state lives in the module-level regexes,
/// so values are free to copy and can be created with `Default`.
#[derive(Clone, Copy, Debug, Default)]
pub struct CryptoAddressRecognizer;
30
31impl Recognizer for CryptoAddressRecognizer {
32 fn id(&self) -> &str {
33 "crypto_address_regex_v1"
34 }
35
36 fn entity_type(&self) -> EntityType {
37 EntityType::CryptoAddress
38 }
39
40 fn supported_locales(&self) -> &[Locale] {
41 &[]
42 }
43
44 fn scan(&self, text: &str) -> Vec<PiiEntity> {
45 let mut seen = HashSet::new();
46 let mut findings = Vec::new();
47
48 for regex in [
49 &*ETHEREUM_REGEX,
50 &*BITCOIN_LEGACY_REGEX,
51 &*BITCOIN_BECH32_REGEX,
52 ] {
53 for matched in regex.find_iter(text) {
54 if seen.insert((matched.start(), matched.end()))
55 && self.is_valid_match(text, matched.start(), matched.end())
56 {
57 findings.push(PiiEntity {
58 entity_type: self.entity_type(),
59 span: Span::new(matched.start(), matched.end()),
60 text: matched.as_str().to_string(),
61 confidence: self.compute_confidence(
62 text,
63 matched.start(),
64 matched.as_str(),
65 ),
66 recognizer_id: self.id().to_string(),
67 });
68 }
69 }
70 }
71
72 findings.sort_by_key(|finding| finding.span.start);
73 findings
74 }
75
76 fn validate(&self, candidate: &str) -> bool {
77 validate_ethereum(candidate)
78 || validate_bitcoin_legacy(candidate)
79 || validate_bitcoin_bech32(candidate)
80 }
81}
82
83impl CryptoAddressRecognizer {
84 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
85 self.validate(&text[start..end]) && is_crypto_boundary(text, start, end)
86 }
87
88 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
89 let base = if candidate.starts_with("0x") || candidate.starts_with("0X") {
90 0.92
91 } else {
92 0.86
93 };
94 confidence(base + context_boost(text, start, CONTEXT_WORDS))
95 }
96}
97
/// Returns `true` when `candidate` is exactly `0x` or `0X` followed by 40 ASCII
/// hex digits (42 bytes in total).
fn validate_ethereum(candidate: &str) -> bool {
    match candidate.as_bytes() {
        [b'0', b'x' | b'X', digits @ ..] => {
            digits.len() == 40 && digits.iter().all(u8::is_ascii_hexdigit)
        }
        _ => false,
    }
}
105
106fn validate_bitcoin_legacy(candidate: &str) -> bool {
107 (26..=35).contains(&candidate.len())
108 && matches!(candidate.as_bytes().first(), Some(b'1' | b'3'))
109 && candidate.chars().all(is_base58_char)
110}
111
112fn validate_bitcoin_bech32(candidate: &str) -> bool {
113 let has_prefix = candidate
114 .get(..3)
115 .is_some_and(|prefix| prefix.eq_ignore_ascii_case("bc1"));
116 let is_lower = candidate.chars().all(|c| !c.is_ascii_uppercase());
117 let is_upper = candidate.chars().all(|c| !c.is_ascii_lowercase());
118 has_prefix
119 && (14..=74).contains(&candidate.len())
120 && (is_lower || is_upper)
121 && candidate[3..].chars().all(is_bech32_char)
122}
123
/// Returns `true` for ASCII letters and digits other than `0`, `O`, `I` and `l`.
fn is_base58_char(c: char) -> bool {
    matches!(
        c,
        '1'..='9' | 'A'..='H' | 'J'..='N' | 'P'..='Z' | 'a'..='k' | 'm'..='z'
    )
}
127
128fn is_bech32_char(c: char) -> bool {
129 matches!(
130 c.to_ascii_lowercase(),
131 'q' | 'p'
132 | 'z'
133 | 'r'
134 | 'y'
135 | '9'
136 | 'x'
137 | '8'
138 | 'g'
139 | 'f'
140 | '2'
141 | 't'
142 | 'v'
143 | 'd'
144 | 'w'
145 | '0'
146 | 's'
147 | '3'
148 | 'j'
149 | 'n'
150 | '5'
151 | '4'
152 | 'k'
153 | 'h'
154 | 'c'
155 | 'e'
156 | '6'
157 | 'm'
158 | 'u'
159 | 'a'
160 | '7'
161 | 'l'
162 )
163}
164
165fn is_crypto_boundary(text: &str, start: usize, end: usize) -> bool {
166 let before = text[..start].chars().next_back();
167 let after = text[end..].chars().next();
168 !before.is_some_and(is_crypto_continuation) && !after.is_some_and(is_crypto_continuation)
169}
170
/// Returns `true` for ASCII letters, ASCII digits and `_`.
fn is_crypto_continuation(c: char) -> bool {
    matches!(c, '_' | '0'..='9' | 'a'..='z' | 'A'..='Z')
}
174
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;

    /// Runs the recognizer over `input` and returns the matched text of each
    /// finding, in the order `scan` reports them.
    fn detected_texts(input: &str) -> Vec<String> {
        let mut matched = Vec::new();
        for finding in CryptoAddressRecognizer.scan(input) {
            matched.push(finding.text);
        }
        matched
    }

    /// Asserts that scanning `input` yields exactly `expected`, in order.
    fn assert_detected(input: &str, expected: &[&str]) {
        assert_eq!(detected_texts(input), expected);
    }

    /// Asserts that scanning `input` yields no findings at all.
    fn assert_rejected(input: &str) {
        assert!(detected_texts(input).is_empty());
    }

    /// Asserts that the first finding in `with_context` scores strictly higher
    /// than the first finding in `without_context`.
    fn assert_context_raises_confidence(with_context: &str, without_context: &str) {
        let boosted = CryptoAddressRecognizer
            .scan(with_context)
            .into_iter()
            .next()
            .expect("input with context should produce a finding");
        let plain = CryptoAddressRecognizer
            .scan(without_context)
            .into_iter()
            .next()
            .expect("input without context should produce a finding");
        assert!(boosted.confidence > plain.confidence);
    }

    #[test]
    fn test_crypto_address_ethereum_detected() {
        assert_detected(
            "eth 0x52908400098527886E0F7030069857D2E4169EE7",
            &["0x52908400098527886E0F7030069857D2E4169EE7"],
        );
    }

    #[test]
    fn test_crypto_address_ethereum_lowercase_detected() {
        assert_detected(
            "eth 0xde709f2102306220921060314715629080e2fb77",
            &["0xde709f2102306220921060314715629080e2fb77"],
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_legacy_one_detected() {
        assert_detected(
            "btc 1BoatSLRHtKNngkdXEeobR76b53LETtpyT",
            &["1BoatSLRHtKNngkdXEeobR76b53LETtpyT"],
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_script_detected() {
        assert_detected(
            "btc 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy",
            &["3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy"],
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_bech32_detected() {
        assert_detected(
            "wallet bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080",
            &["bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080"],
        );
    }

    #[test]
    fn test_crypto_address_multiple_values_detected() {
        assert_detected(
            "eth 0xde709f2102306220921060314715629080e2fb77 btc 1BoatSLRHtKNngkdXEeobR76b53LETtpyT",
            &[
                "0xde709f2102306220921060314715629080e2fb77",
                "1BoatSLRHtKNngkdXEeobR76b53LETtpyT",
            ],
        );
    }

    #[test]
    fn test_crypto_address_ethereum_short_rejected() {
        assert_rejected("eth 0x52908400098527886E0F7030069857D2E4169EE");
    }

    #[test]
    fn test_crypto_address_ethereum_invalid_hex_rejected() {
        assert_rejected("eth 0x52908400098527886E0F7030069857D2E4169EEZ");
    }

    #[test]
    fn test_crypto_address_bitcoin_base58_zero_rejected() {
        assert_rejected("btc 1BoatSLRHtKNngkdXEeobR76b53LETtpy0");
    }

    #[test]
    fn test_crypto_address_bech32_mixed_case_rejected() {
        assert_rejected("btc bc1Qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080");
    }

    #[test]
    fn test_crypto_address_embedded_in_word_rejected() {
        assert_rejected("id0xde709f2102306220921060314715629080e2fb77");
    }

    #[test]
    fn test_crypto_address_context_boosts_confidence() {
        assert_context_raises_confidence(
            "wallet 0xde709f2102306220921060314715629080e2fb77",
            "value 0xde709f2102306220921060314715629080e2fb77",
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_context_boosts_confidence() {
        assert_context_raises_confidence(
            "bitcoin 1BoatSLRHtKNngkdXEeobR76b53LETtpyT",
            "value 1BoatSLRHtKNngkdXEeobR76b53LETtpyT",
        );
    }

    #[test]
    fn test_crypto_address_supported_locales_are_universal() {
        let locales = CryptoAddressRecognizer.supported_locales();
        assert!(locales.is_empty());
    }

    #[test]
    fn test_crypto_address_default_registry_detects_crypto_address() {
        let reports_crypto_address = default_registry()
            .scan_all("eth 0xde709f2102306220921060314715629080e2fb77")
            .iter()
            .any(|finding| finding.entity_type == EntityType::CryptoAddress);

        assert!(reports_crypto_address);
    }
}