cloakrs_patterns/
mac_address.rs1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7static MAC_COLON_HYPHEN_REGEX: Lazy<Regex> =
8 Lazy::new(|| compile_regex(r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b"));
9static MAC_DOTTED_REGEX: Lazy<Regex> =
10 Lazy::new(|| compile_regex(r"\b[0-9A-Fa-f]{4}\.[0-9A-Fa-f]{4}\.[0-9A-Fa-f]{4}\b"));
11
/// Keywords handed to `context_boost` when a match is scored.
///
/// The order is kept stable; how the entries are compared against the
/// surrounding text is decided by `context_boost`, not here.
const CONTEXT_WORDS: &[&str] = &[
    // Direct names for the identifier.
    "mac",
    "mac address",
    "hardware address",
    // Networking terms that usually accompany one.
    "ethernet",
    "bssid",
    "adapter",
    "nic",
];
21
/// Recognizer that finds MAC addresses written in colon-, hyphen-, or
/// dot-separated notation.
///
/// This is a field-less unit type: it carries no configuration, so copies are
/// interchangeable and `Default` yields the only possible value.
#[derive(Clone, Copy, Debug, Default)]
pub struct MacAddressRecognizer;
35
36impl Recognizer for MacAddressRecognizer {
37 fn id(&self) -> &str {
38 "mac_address_regex_v1"
39 }
40
41 fn entity_type(&self) -> EntityType {
42 EntityType::MacAddress
43 }
44
45 fn supported_locales(&self) -> &[Locale] {
46 &[]
47 }
48
49 fn scan(&self, text: &str) -> Vec<PiiEntity> {
50 let mut seen = HashSet::new();
51 let mut findings = Vec::new();
52
53 for regex in [&*MAC_COLON_HYPHEN_REGEX, &*MAC_DOTTED_REGEX] {
54 for matched in regex.find_iter(text) {
55 if seen.insert((matched.start(), matched.end()))
56 && self.is_valid_match(text, matched.start(), matched.end())
57 {
58 findings.push(PiiEntity {
59 entity_type: self.entity_type(),
60 span: Span::new(matched.start(), matched.end()),
61 text: matched.as_str().to_string(),
62 confidence: self.compute_confidence(text, matched.start()),
63 recognizer_id: self.id().to_string(),
64 });
65 }
66 }
67 }
68
69 findings.sort_by_key(|finding| finding.span.start);
70 findings
71 }
72
73 fn validate(&self, candidate: &str) -> bool {
74 let normalized = normalized_hex(candidate);
75 normalized.len() == 12 && normalized.chars().all(|c| c.is_ascii_hexdigit())
76 }
77}
78
79impl MacAddressRecognizer {
80 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
81 self.validate(&text[start..end]) && is_mac_boundary(text, start, end)
82 }
83
84 fn compute_confidence(&self, text: &str, start: usize) -> Confidence {
85 confidence(0.82 + context_boost(text, start, CONTEXT_WORDS))
86 }
87}
88
/// Returns `candidate` with the MAC separators (`:`, `-`, `.`) removed.
///
/// Only separators are stripped. Every other character is kept on purpose:
/// the caller checks that the result is exactly twelve hex digits, and that
/// check can only reject stray characters if they are still present. Filtering
/// down to hex digits here would make inputs such as `"00:1G:2B:3C:4D:5E0"`
/// look like a valid twelve-digit address.
fn normalized_hex(candidate: &str) -> String {
    candidate
        .chars()
        .filter(|&c| !matches!(c, ':' | '-' | '.'))
        .collect()
}
95
/// Reports whether the match at `text[start..end]` stands on its own rather
/// than being a fragment of a longer hex/separator sequence.
///
/// `start` and `end` are byte offsets on character boundaries (as produced by
/// a regex match). The character on each side is inspected together with the
/// one beyond it:
///
/// * an adjacent hex digit always means the match is part of something longer;
/// * an adjacent separator (`:`, `-`, `.`) only counts when more MAC-like
///   content follows it. A lone separator is ordinary punctuation, so
///   `"... 00:1A:2B:3C:4D:5E."` at the end of a sentence is still accepted,
///   while `"00:1A:2B:3C:4D:5E:6F"` and `"fe80::12:34:56:78:9a:bc"` are not.
fn is_mac_boundary(text: &str, start: usize, end: usize) -> bool {
    // Walk outwards from the match on both sides.
    let mut preceding = text[..start].chars().rev();
    let before = preceding.next();
    let before_outer = preceding.next();

    let mut following = text[end..].chars();
    let after = following.next();
    let after_outer = following.next();

    !extends_mac(before, before_outer) && !extends_mac(after, after_outer)
}

/// Returns `true` when `adjacent` (the character touching the match) together
/// with `beyond` (the next one further out) shows the match continues.
fn extends_mac(adjacent: Option<char>, beyond: Option<char>) -> bool {
    match adjacent {
        Some(c) if c.is_ascii_hexdigit() => true,
        // A separator needs something MAC-like behind it; otherwise it is
        // punctuation (or the end of the text) and the match is complete.
        Some(c) if is_mac_continuation(c) => beyond.is_some_and(is_mac_continuation),
        _ => false,
    }
}

/// Returns `true` for characters that can appear inside a MAC-like sequence:
/// ASCII hex digits and the separators `:`, `-`, `.`.
fn is_mac_continuation(c: char) -> bool {
    c.is_ascii_hexdigit() || matches!(c, ':' | '-' | '.')
}
105
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;

    /// Scans `input` and returns the matched substrings in the order the
    /// recognizer reports them.
    fn texts(input: &str) -> Vec<String> {
        let mut matched = Vec::new();
        for finding in MacAddressRecognizer.scan(input) {
            matched.push(finding.text);
        }
        matched
    }

    /// Returns the first finding for `input`; panics when the scan is empty.
    fn first_finding(input: &str) -> PiiEntity {
        MacAddressRecognizer
            .scan(input)
            .into_iter()
            .next()
            .expect("expected at least one MAC address finding")
    }

    // --- accepted notations -------------------------------------------------

    #[test]
    fn test_mac_address_colon_uppercase_detected() {
        let found = texts("mac 00:1A:2B:3C:4D:5E");
        assert_eq!(found, vec!["00:1A:2B:3C:4D:5E"]);
    }

    #[test]
    fn test_mac_address_colon_lowercase_detected() {
        let found = texts("mac aa:bb:cc:dd:ee:ff");
        assert_eq!(found, vec!["aa:bb:cc:dd:ee:ff"]);
    }

    #[test]
    fn test_mac_address_hyphen_detected() {
        let found = texts("mac 00-1A-2B-3C-4D-5E");
        assert_eq!(found, vec!["00-1A-2B-3C-4D-5E"]);
    }

    #[test]
    fn test_mac_address_dotted_detected() {
        let found = texts("mac 001A.2B3C.4D5E");
        assert_eq!(found, vec!["001A.2B3C.4D5E"]);
    }

    #[test]
    fn test_mac_address_multiple_values_detected() {
        let found = texts("a 00:1A:2B:3C:4D:5E b 001A.2B3C.4D5E");
        assert_eq!(found, vec!["00:1A:2B:3C:4D:5E", "001A.2B3C.4D5E"]);
    }

    // --- rejected inputs ----------------------------------------------------

    #[test]
    fn test_mac_address_invalid_hex_rejected() {
        assert_eq!(texts("mac 00:1G:2B:3C:4D:5E"), Vec::<String>::new());
    }

    #[test]
    fn test_mac_address_too_short_rejected() {
        assert_eq!(texts("mac 00:1A:2B:3C:4D"), Vec::<String>::new());
    }

    #[test]
    fn test_mac_address_too_long_rejected() {
        assert_eq!(texts("mac 00:1A:2B:3C:4D:5E:6F"), Vec::<String>::new());
    }

    #[test]
    fn test_mac_address_embedded_in_word_rejected() {
        assert_eq!(texts("id00:1A:2B:3C:4D:5E"), Vec::<String>::new());
    }

    #[test]
    fn test_mac_address_embedded_in_longer_dotted_rejected() {
        assert_eq!(texts("001A.2B3C.4D5E.6F70"), Vec::<String>::new());
    }

    // --- confidence scoring -------------------------------------------------

    #[test]
    fn test_mac_address_context_boosts_confidence() {
        let boosted = first_finding("mac address 00:1A:2B:3C:4D:5E");
        let baseline = first_finding("value 00:1A:2B:3C:4D:5E");
        assert!(boosted.confidence > baseline.confidence);
    }

    #[test]
    fn test_mac_address_bssid_context_boosts_confidence() {
        let boosted = first_finding("bssid 00:1A:2B:3C:4D:5E");
        let baseline = first_finding("value 00:1A:2B:3C:4D:5E");
        assert!(boosted.confidence > baseline.confidence);
    }

    // --- trait surface ------------------------------------------------------

    #[test]
    fn test_mac_address_supported_locales_are_universal() {
        assert_eq!(MacAddressRecognizer.supported_locales().len(), 0);
    }

    #[test]
    fn test_mac_address_validate_accepts_dotted() {
        let recognizer = MacAddressRecognizer;
        assert!(recognizer.validate("001A.2B3C.4D5E"));
    }

    #[test]
    fn test_mac_address_default_registry_detects_mac_address() {
        let findings = default_registry().scan_all("mac 00:1A:2B:3C:4D:5E");

        let mac_hits = findings
            .iter()
            .filter(|finding| finding.entity_type == EntityType::MacAddress)
            .count();
        assert!(mac_hits > 0);
    }
}