Skip to main content

cloakrs_patterns/
crypto.rs

1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7static ETHEREUM_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r"\b0x[0-9A-Fa-f]{40}\b"));
8static BITCOIN_LEGACY_REGEX: Lazy<Regex> =
9    Lazy::new(|| compile_regex(r"\b[13][1-9A-HJ-NP-Za-km-z]{25,34}\b"));
10static BITCOIN_BECH32_REGEX: Lazy<Regex> =
11    Lazy::new(|| compile_regex(r"\b(?:bc1|BC1)[ac-hj-np-zAC-HJ-NP-Z02-9]{11,71}\b"));
12
/// Nearby words handed to `context_boost` when scoring a match; their presence around a
/// candidate raises the reported confidence.
const CONTEXT_WORDS: &[&str] = &[
    "wallet", "crypto", "bitcoin", "btc", "ethereum", "eth", "address", "deposit",
];
16
/// Recognizes common Bitcoin and Ethereum wallet address shapes.
///
/// Three shapes are covered: Ethereum (`0x` followed by 40 hex digits), legacy Bitcoin
/// (Base58, starting with `1` or `3`) and Bech32 Bitcoin (`bc1…`). Detection is purely
/// shape-based: no address checksum is verified.
///
/// # Examples
///
/// ```
/// use cloakrs_core::{EntityType, Recognizer};
/// use cloakrs_patterns::CryptoAddressRecognizer;
///
/// let findings = CryptoAddressRecognizer.scan("eth 0x52908400098527886E0F7030069857D2E4169EE7");
/// assert_eq!(findings[0].entity_type, EntityType::CryptoAddress);
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct CryptoAddressRecognizer;
30
31impl Recognizer for CryptoAddressRecognizer {
32    fn id(&self) -> &str {
33        "crypto_address_regex_v1"
34    }
35
36    fn entity_type(&self) -> EntityType {
37        EntityType::CryptoAddress
38    }
39
40    fn supported_locales(&self) -> &[Locale] {
41        &[]
42    }
43
44    fn scan(&self, text: &str) -> Vec<PiiEntity> {
45        let mut seen = HashSet::new();
46        let mut findings = Vec::new();
47
48        for regex in [
49            &*ETHEREUM_REGEX,
50            &*BITCOIN_LEGACY_REGEX,
51            &*BITCOIN_BECH32_REGEX,
52        ] {
53            for matched in regex.find_iter(text) {
54                if seen.insert((matched.start(), matched.end()))
55                    && self.is_valid_match(text, matched.start(), matched.end())
56                {
57                    findings.push(PiiEntity {
58                        entity_type: self.entity_type(),
59                        span: Span::new(matched.start(), matched.end()),
60                        text: matched.as_str().to_string(),
61                        confidence: self.compute_confidence(
62                            text,
63                            matched.start(),
64                            matched.as_str(),
65                        ),
66                        recognizer_id: self.id().to_string(),
67                    });
68                }
69            }
70        }
71
72        findings.sort_by_key(|finding| finding.span.start);
73        findings
74    }
75
76    fn validate(&self, candidate: &str) -> bool {
77        validate_ethereum(candidate)
78            || validate_bitcoin_legacy(candidate)
79            || validate_bitcoin_bech32(candidate)
80    }
81}
82
83impl CryptoAddressRecognizer {
84    fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
85        self.validate(&text[start..end]) && is_crypto_boundary(text, start, end)
86    }
87
88    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
89        let base = if candidate.starts_with("0x") || candidate.starts_with("0X") {
90            0.92
91        } else {
92            0.86
93        };
94        confidence(base + context_boost(text, start, CONTEXT_WORDS))
95    }
96}
97
/// Returns `true` when `candidate` has the shape of an Ethereum address.
///
/// The shape is exactly 42 bytes: a `0x` prefix (either case) followed by 40 ASCII hex
/// digits. Only the shape is checked; no address checksum is verified.
fn validate_ethereum(candidate: &str) -> bool {
    // Working on bytes avoids string slicing, so arbitrary (including non-ASCII)
    // input can never hit a char-boundary panic.
    match candidate.as_bytes() {
        [b'0', b'x' | b'X', digits @ ..] => {
            digits.len() == 40 && digits.iter().all(u8::is_ascii_hexdigit)
        }
        _ => false,
    }
}
105
106fn validate_bitcoin_legacy(candidate: &str) -> bool {
107    (26..=35).contains(&candidate.len())
108        && matches!(candidate.as_bytes().first(), Some(b'1' | b'3'))
109        && candidate.chars().all(is_base58_char)
110}
111
112fn validate_bitcoin_bech32(candidate: &str) -> bool {
113    let has_prefix = candidate
114        .get(..3)
115        .is_some_and(|prefix| prefix.eq_ignore_ascii_case("bc1"));
116    let is_lower = candidate.chars().all(|c| !c.is_ascii_uppercase());
117    let is_upper = candidate.chars().all(|c| !c.is_ascii_lowercase());
118    has_prefix
119        && (14..=74).contains(&candidate.len())
120        && (is_lower || is_upper)
121        && candidate[3..].chars().all(is_bech32_char)
122}
123
/// Returns `true` when `c` belongs to the Base58 alphabet: ASCII digits and letters
/// except the look-alike characters `0`, `O`, `I` and `l`.
fn is_base58_char(c: char) -> bool {
    // Same ranges as the character class in `BITCOIN_LEGACY_REGEX`.
    matches!(c, '1'..='9' | 'A'..='H' | 'J'..='N' | 'P'..='Z' | 'a'..='k' | 'm'..='z')
}
127
/// Returns `true` when `c` is a Bech32 data character, compared case-insensitively.
///
/// The alphabet is every ASCII digit and letter except `1`, `b`, `i` and `o`.
fn is_bech32_char(c: char) -> bool {
    /// The 32-character Bech32 data alphabet, in encoding order.
    const BECH32_CHARSET: &str = "qpzry9x8gf2tvdw0s3jn54khce6mua7l";
    BECH32_CHARSET.contains(c.to_ascii_lowercase())
}
164
165fn is_crypto_boundary(text: &str, start: usize, end: usize) -> bool {
166    let before = text[..start].chars().next_back();
167    let after = text[end..].chars().next();
168    !before.is_some_and(is_crypto_continuation) && !after.is_some_and(is_crypto_continuation)
169}
170
/// Returns `true` when `c` could extend an identifier-like token: an ASCII letter,
/// an ASCII digit or an underscore.
fn is_crypto_continuation(c: char) -> bool {
    matches!(c, '_' | '0'..='9' | 'A'..='Z' | 'a'..='z')
}
174
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;

    /// Runs the recognizer over `input` and returns only the matched substrings.
    fn texts(input: &str) -> Vec<String> {
        CryptoAddressRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    #[test]
    fn test_crypto_address_ethereum_detected() {
        assert_eq!(
            texts("eth 0x52908400098527886E0F7030069857D2E4169EE7"),
            ["0x52908400098527886E0F7030069857D2E4169EE7"]
        );
    }

    #[test]
    fn test_crypto_address_ethereum_lowercase_detected() {
        assert_eq!(
            texts("eth 0xde709f2102306220921060314715629080e2fb77"),
            ["0xde709f2102306220921060314715629080e2fb77"]
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_legacy_one_detected() {
        assert_eq!(
            texts("btc 1BoatSLRHtKNngkdXEeobR76b53LETtpyT"),
            ["1BoatSLRHtKNngkdXEeobR76b53LETtpyT"]
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_script_detected() {
        assert_eq!(
            texts("btc 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy"),
            ["3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy"]
        );
    }

    #[test]
    fn test_crypto_address_bitcoin_bech32_detected() {
        assert_eq!(
            texts("wallet bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080"),
            ["bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080"]
        );
    }

    #[test]
    fn test_crypto_address_multiple_values_detected() {
        assert_eq!(
            texts("eth 0xde709f2102306220921060314715629080e2fb77 btc 1BoatSLRHtKNngkdXEeobR76b53LETtpyT"),
            [
                "0xde709f2102306220921060314715629080e2fb77",
                "1BoatSLRHtKNngkdXEeobR76b53LETtpyT"
            ]
        );
    }

    #[test]
    fn test_crypto_address_ethereum_short_rejected() {
        assert!(texts("eth 0x52908400098527886E0F7030069857D2E4169EE").is_empty());
    }

    #[test]
    fn test_crypto_address_ethereum_invalid_hex_rejected() {
        assert!(texts("eth 0x52908400098527886E0F7030069857D2E4169EEZ").is_empty());
    }

    #[test]
    fn test_crypto_address_bitcoin_base58_zero_rejected() {
        assert!(texts("btc 1BoatSLRHtKNngkdXEeobR76b53LETtpy0").is_empty());
    }

    #[test]
    fn test_crypto_address_bech32_mixed_case_rejected() {
        assert!(texts("btc bc1Qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080").is_empty());
    }

    #[test]
    fn test_crypto_address_embedded_in_word_rejected() {
        assert!(texts("id0xde709f2102306220921060314715629080e2fb77").is_empty());
    }

    #[test]
    fn test_crypto_address_context_boosts_confidence() {
        let with_context =
            CryptoAddressRecognizer.scan("wallet 0xde709f2102306220921060314715629080e2fb77");
        let without_context =
            CryptoAddressRecognizer.scan("value 0xde709f2102306220921060314715629080e2fb77");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_crypto_address_bitcoin_context_boosts_confidence() {
        let with_context =
            CryptoAddressRecognizer.scan("bitcoin 1BoatSLRHtKNngkdXEeobR76b53LETtpyT");
        let without_context =
            CryptoAddressRecognizer.scan("value 1BoatSLRHtKNngkdXEeobR76b53LETtpyT");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_crypto_address_supported_locales_are_universal() {
        assert!(CryptoAddressRecognizer.supported_locales().is_empty());
    }

    #[test]
    fn test_crypto_address_default_registry_detects_crypto_address() {
        let findings =
            default_registry().scan_all("eth 0xde709f2102306220921060314715629080e2fb77");

        assert!(findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::CryptoAddress));
    }
}