Skip to main content

redact_core/recognizers/
validation.rs

// Copyright (c) 2026 Censgate LLC.
// Licensed under the Business Source License 1.1 (BUSL-1.1).
// See the LICENSE file in the project root for license details,
// including the Additional Use Grant, Change Date, and Change License.

//! Validation functions for PII patterns.
//!
//! These functions provide additional validation beyond regex matching
//! to reduce false positives. For example, credit card numbers must
//! pass the Luhn checksum, and IBANs have country-specific formats.
11
12use crate::types::EntityType;
13
14/// Validate a detected entity value based on its type.
15///
16/// Returns a confidence adjustment factor:
17/// - 1.0: Validation passed or not applicable
18/// - 0.0-0.99: Validation partially passed (reduces confidence)
19/// - 0.0: Validation failed (entity should be rejected)
20pub fn validate_entity(entity_type: &EntityType, value: &str) -> f32 {
21    match entity_type {
22        EntityType::CreditCard => validate_credit_card(value),
23        EntityType::IbanCode | EntityType::Iban => validate_iban(value),
24        EntityType::UsSsn => validate_us_ssn(value),
25        EntityType::UkNino => validate_uk_nino(value),
26        EntityType::UkNhs => validate_uk_nhs(value),
27        EntityType::Isbn => validate_isbn(value),
28        EntityType::IpAddress => validate_ip_address(value),
29        _ => 1.0, // No validation available
30    }
31}
32
33/// Validate credit card number using Luhn algorithm.
34///
35/// The Luhn algorithm (mod 10) is used by most credit card issuers.
36pub fn validate_credit_card(value: &str) -> f32 {
37    let digits: Vec<u32> = value
38        .chars()
39        .filter(|c| c.is_ascii_digit())
40        .filter_map(|c| c.to_digit(10))
41        .collect();
42
43    if digits.len() < 13 || digits.len() > 19 {
44        return 0.0;
45    }
46
47    if luhn_check(&digits) {
48        1.0
49    } else {
50        0.0
51    }
52}
53
/// Return `true` when `digits` satisfies the Luhn checksum.
///
/// Walking from the rightmost digit, every second digit is doubled (and
/// reduced by 9 when the doubled value exceeds 9); the digits pass when the
/// grand total is a multiple of 10. An empty slice sums to 0 and passes.
fn luhn_check(digits: &[u32]) -> bool {
    let total: u32 = digits
        .iter()
        .rev()
        .enumerate()
        .map(|(position, &digit)| {
            // Positions are counted from the right; odd ones get doubled.
            if position % 2 == 1 {
                let doubled = digit * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            } else {
                digit
            }
        })
        .sum();

    total.is_multiple_of(10)
}
73
74/// Validate IBAN format and checksum.
75///
76/// IBAN validation:
77/// 1. Check length matches country-specific requirements
78/// 2. Verify mod-97 checksum
79pub fn validate_iban(value: &str) -> f32 {
80    let cleaned: String = value.chars().filter(|c| c.is_alphanumeric()).collect();
81
82    if cleaned.len() < 15 || cleaned.len() > 34 {
83        return 0.0;
84    }
85
86    // Check country code (first 2 chars must be letters)
87    let country_code: String = cleaned.chars().take(2).collect();
88    if !country_code.chars().all(|c| c.is_ascii_alphabetic()) {
89        return 0.0;
90    }
91
92    // Validate length for known countries
93    let expected_length = get_iban_length(&country_code);
94    if expected_length > 0 && cleaned.len() != expected_length {
95        return 0.5; // Partial match - wrong length for country
96    }
97
98    // Mod-97 checksum validation
99    if validate_iban_checksum(&cleaned) {
100        1.0
101    } else {
102        0.0
103    }
104}
105
/// Look up the total IBAN length required for a two-letter country code.
///
/// The code is upper-cased before the lookup, so matching is
/// case-insensitive. Returns `0` when the country is not in the table.
fn get_iban_length(country_code: &str) -> usize {
    const KNOWN_LENGTHS: [(&str, usize); 16] = [
        ("GB", 22),
        ("DE", 22),
        ("FR", 27),
        ("ES", 24),
        ("IT", 27),
        ("NL", 18),
        ("BE", 16),
        ("AT", 20),
        ("CH", 21),
        ("IE", 22),
        ("PL", 28),
        ("PT", 25),
        ("SE", 24),
        ("NO", 15),
        ("DK", 18),
        ("FI", 18),
    ];

    let wanted = country_code.to_uppercase();
    KNOWN_LENGTHS
        .iter()
        .find(|(code, _)| *code == wanted)
        .map_or(0, |&(_, length)| length)
}
128
/// Validate the IBAN mod-97 checksum.
///
/// The first four characters are conceptually moved to the end, letters are
/// replaced by two-digit numbers (A=10, B=11, ... Z=35, case-insensitive),
/// and the resulting decimal number must leave remainder 1 when divided
/// by 97. Characters that are neither ASCII digits nor ASCII letters are
/// skipped.
///
/// Returns `false` (instead of panicking) for input that is shorter than
/// four bytes or contains non-ASCII characters.
fn validate_iban_checksum(iban: &str) -> bool {
    // Byte-offset splitting is only sound on ASCII text of sufficient length;
    // anything else cannot be a valid IBAN anyway.
    if !iban.is_ascii() || iban.len() < 4 {
        return false;
    }
    let (head, tail) = iban.split_at(4);

    // Fold the digits into a running remainder so the (potentially 60+ digit)
    // number never has to be materialised. The remainder stays below 97, so
    // `remainder * 100 + 35` comfortably fits in a u32.
    let mut remainder: u32 = 0;
    for byte in tail.bytes().chain(head.bytes()) {
        remainder = match byte {
            b'0'..=b'9' => (remainder * 10 + u32::from(byte - b'0')) % 97,
            b'A'..=b'Z' | b'a'..=b'z' => {
                // A letter contributes two decimal digits (10..=35).
                let value = u32::from(byte.to_ascii_uppercase() - b'A') + 10;
                (remainder * 100 + value) % 97
            }
            _ => continue,
        };
    }

    remainder == 1
}
155
/// Check the structural rules of a US Social Security Number.
///
/// All characters other than ASCII digits are ignored; exactly nine digits
/// must remain, split as area (3), group (2) and serial (4).
///
/// Returns `0.0` when:
/// - the area is 000, 666, or anywhere in 900-999
/// - the group is 00
/// - the serial is 0000
/// - the digit count is not nine
///
/// Returns `1.0` otherwise.
pub fn validate_us_ssn(value: &str) -> f32 {
    let digits: Vec<u32> = value.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if digits.len() != 9 {
        return 0.0;
    }

    // Fold a run of decimal digits into the number it spells.
    let to_number = |part: &[u32]| part.iter().fold(0u32, |acc, &digit| acc * 10 + digit);

    let (area_digits, rest) = digits.split_at(3);
    let (group_digits, serial_digits) = rest.split_at(2);

    let area_ok = matches!(to_number(area_digits), 1..=665 | 667..=899);
    let group_ok = to_number(group_digits) != 0;
    let serial_ok = to_number(serial_digits) != 0;

    if area_ok && group_ok && serial_ok {
        1.0
    } else {
        0.0
    }
}
185
/// Validate UK National Insurance Number format.
///
/// Non-alphanumeric characters are stripped and letters are compared
/// case-insensitively. The remaining nine characters must follow:
///
/// - 2 letters + 6 digits + 1 letter (A-D)
/// - First letter cannot be D, F, I, Q, U, V
/// - Second letter cannot be D, F, I, O, Q, U, V
/// - Prefixes BG, GB, NK, KN, TN, NT, ZZ are invalid
///
/// Returns `1.0` when every rule holds and `0.0` otherwise, including for
/// any input that contains non-ASCII letters or digits.
pub fn validate_uk_nino(value: &str) -> f32 {
    const INVALID_PREFIXES: [[u8; 2]; 7] = [*b"BG", *b"GB", *b"NK", *b"KN", *b"TN", *b"NT", *b"ZZ"];

    let cleaned: String = value.chars().filter(|c| c.is_alphanumeric()).collect();

    // A NINO is ASCII-only. Rejecting other scripts up front also avoids
    // Unicode case mapping turning e.g. 'ſ' into 'S' and validating it.
    if !cleaned.is_ascii() {
        return 0.0;
    }
    let cleaned = cleaned.to_ascii_uppercase();

    // Split into prefix letters, middle digits and suffix letter.
    let [first, second, middle @ .., suffix] = cleaned.as_bytes() else {
        return 0.0;
    };

    // Shape: exactly LL DDDDDD L. The prefix must really be letters;
    // merely "not a forbidden letter" would let digits through.
    let well_formed = middle.len() == 6
        && first.is_ascii_uppercase()
        && second.is_ascii_uppercase()
        && middle.iter().all(u8::is_ascii_digit)
        && (b'A'..=b'D').contains(suffix);
    if !well_formed {
        return 0.0;
    }

    // Prefix restrictions: whole-prefix ban list plus per-letter bans.
    let prefix_banned = INVALID_PREFIXES.contains(&[*first, *second])
        || b"DFIQUV".contains(first)
        || b"DFIOQUV".contains(second);

    if prefix_banned {
        0.0
    } else {
        1.0
    }
}
237
/// Check a UK NHS Number against its mod-11 check digit.
///
/// All characters other than ASCII digits are ignored; exactly ten digits
/// must remain. The first nine are weighted 10 down to 2 and summed; the
/// expected check digit is `11 - (sum % 11)`, with 11 mapped to 0. A computed
/// check digit of 10 can never match, so such numbers are invalid.
///
/// Returns `1.0` when the tenth digit equals the expected check digit and
/// `0.0` otherwise.
pub fn validate_uk_nhs(value: &str) -> f32 {
    let digits: Vec<u32> = value.chars().filter_map(|ch| ch.to_digit(10)).collect();

    // Separate the trailing check digit from the nine payload digits.
    let Some((&check_digit, payload)) = digits.split_last() else {
        return 0.0;
    };
    if payload.len() != 9 {
        return 0.0;
    }

    let weighted_sum: u32 = payload
        .iter()
        .zip((2..=10u32).rev())
        .map(|(digit, weight)| digit * weight)
        .sum();

    let expected = match weighted_sum % 11 {
        0 => 0,            // 11 - 0 = 11, which is written as 0
        1 => return 0.0,   // 11 - 1 = 10: no single digit can represent it
        rest => 11 - rest,
    };

    if check_digit == expected {
        1.0
    } else {
        0.0
    }
}
272
273/// Validate ISBN-10 or ISBN-13 checksum.
274pub fn validate_isbn(value: &str) -> f32 {
275    let cleaned: String = value
276        .chars()
277        .filter(|c| c.is_ascii_digit() || *c == 'X' || *c == 'x')
278        .collect();
279
280    match cleaned.len() {
281        10 => validate_isbn10(&cleaned),
282        13 => validate_isbn13(&cleaned),
283        _ => 0.0,
284    }
285}
286
/// Validate the ISBN-10 checksum of an already-cleaned ISBN string.
///
/// The ten characters are weighted 10 down to 1 and summed; the ISBN is valid
/// when the sum is a multiple of 11. `X`/`x` stands for the value 10 and is
/// accepted only as the final (check) character.
///
/// Returns `1.0` for a valid ISBN-10 and `0.0` otherwise, including when the
/// input is not exactly ten bytes long or contains any other character.
fn validate_isbn10(isbn: &str) -> f32 {
    let bytes = isbn.as_bytes();
    if bytes.len() != 10 {
        return 0.0;
    }

    let mut sum: u32 = 0;
    for (&byte, weight) in bytes.iter().zip((1..=10u32).rev()) {
        let value = match byte {
            b'0'..=b'9' => u32::from(byte - b'0'),
            // Weight 1 marks the last position, the only place 'X' may appear.
            b'X' | b'x' if weight == 1 => 10,
            _ => return 0.0,
        };
        sum += value * weight;
    }

    if sum.is_multiple_of(11) {
        1.0
    } else {
        0.0
    }
}
304
/// Validate the ISBN-13 checksum of an already-cleaned ISBN string.
///
/// Characters that are not decimal digits are dropped; exactly thirteen
/// digits must remain. Digits are weighted 1, 3, 1, 3, ... from the left and
/// the ISBN is valid when the weighted sum is a multiple of 10.
///
/// Returns `1.0` for a valid ISBN-13 and `0.0` otherwise.
fn validate_isbn13(isbn: &str) -> f32 {
    let digits: Vec<u32> = isbn.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if digits.len() != 13 {
        return 0.0;
    }

    let weighted_sum: u32 = digits
        .iter()
        .zip([1u32, 3].into_iter().cycle())
        .map(|(digit, weight)| digit * weight)
        .sum();

    if weighted_sum.is_multiple_of(10) {
        1.0
    } else {
        0.0
    }
}
324
/// Validate a dotted-decimal IPv4 address.
///
/// The value must consist of exactly four `.`-separated octets, each made of
/// one to three ASCII digits with a numeric value of at most 255. Leading
/// zeros within those three digits are tolerated. Signs, whitespace and empty
/// octets are rejected.
///
/// Returns `1.0` for a valid address and `0.0` otherwise.
///
/// NOTE(review): anything that is not dotted-decimal IPv4 (for example an
/// IPv6 address) scores `0.0` here - confirm that callers only pass IPv4
/// candidates.
pub fn validate_ip_address(value: &str) -> f32 {
    let octets: Vec<&str> = value.split('.').collect();

    if octets.len() != 4 {
        return 0.0;
    }

    // Integer parsing alone is too lenient: it accepts a leading '+', so the
    // digits-only shape is checked explicitly before parsing. Parsing as `u8`
    // then enforces the 0..=255 range.
    let is_valid_octet = |octet: &str| {
        (1..=3).contains(&octet.len())
            && octet.bytes().all(|b| b.is_ascii_digit())
            && octet.parse::<u8>().is_ok()
    };

    if octets.into_iter().all(is_valid_octet) {
        1.0
    } else {
        0.0
    }
}
342
// Unit tests for the validators above. Scores are compared exactly because
// the validators only ever return the literals 0.0, 0.5 and 1.0.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_validate_entity_dispatch() {
        assert_eq!(validate_entity(&EntityType::UsSsn, "123-45-6789"), 1.0);
        assert_eq!(validate_entity(&EntityType::UsSsn, "000-12-3456"), 0.0);
        assert_eq!(
            validate_entity(&EntityType::CreditCard, "4532015112830367"),
            0.0
        );
    }

    #[test]
    fn test_luhn_valid_cards() {
        // Valid test card numbers
        assert_eq!(validate_credit_card("4532015112830366"), 1.0);
        assert_eq!(validate_credit_card("5425233430109903"), 1.0);
        assert_eq!(validate_credit_card("374245455400126"), 1.0);
    }

    #[test]
    fn test_luhn_invalid_cards() {
        assert_eq!(validate_credit_card("4532015112830367"), 0.0);
        assert_eq!(validate_credit_card("1234567890123456"), 0.0);
    }

    #[test]
    fn test_credit_card_separators_and_length() {
        // Separators are ignored.
        assert_eq!(validate_credit_card("4532 0151 1283 0366"), 1.0);
        assert_eq!(validate_credit_card("4532-0151-1283-0366"), 1.0);
        // Fewer than 13 or more than 19 digits is never a card number.
        assert_eq!(validate_credit_card("1234"), 0.0);
        assert_eq!(validate_credit_card("00000000000000000000"), 0.0);
    }

    #[test]
    fn test_valid_ssn() {
        assert_eq!(validate_us_ssn("123-45-6789"), 1.0);
        assert_eq!(validate_us_ssn("123456789"), 1.0);
    }

    #[test]
    fn test_invalid_ssn() {
        assert_eq!(validate_us_ssn("000-12-3456"), 0.0); // Invalid area
        assert_eq!(validate_us_ssn("666-12-3456"), 0.0); // Invalid area
        assert_eq!(validate_us_ssn("900-12-3456"), 0.0); // Invalid area
        assert_eq!(validate_us_ssn("123-00-3456"), 0.0); // Invalid group
        assert_eq!(validate_us_ssn("123-45-0000"), 0.0); // Invalid serial
        assert_eq!(validate_us_ssn("12-345"), 0.0); // Too few digits
    }

    #[test]
    fn test_valid_uk_nino() {
        assert_eq!(validate_uk_nino("AB123456C"), 1.0);
        assert_eq!(validate_uk_nino("JG103759A"), 1.0);
    }

    #[test]
    fn test_invalid_uk_nino() {
        assert_eq!(validate_uk_nino("BG123456A"), 0.0); // Invalid prefix
        assert_eq!(validate_uk_nino("DA123456A"), 0.0); // Invalid first letter
        assert_eq!(validate_uk_nino("AO123456A"), 0.0); // Invalid second letter
        assert_eq!(validate_uk_nino("AB123456E"), 0.0); // Invalid suffix
        assert_eq!(validate_uk_nino("AB12345C"), 0.0); // Too short
    }

    #[test]
    fn test_valid_uk_nhs() {
        assert_eq!(validate_uk_nhs("9434765919"), 1.0);
        assert_eq!(validate_uk_nhs("943 476 5919"), 1.0);
        assert_eq!(validate_uk_nhs("4010232137"), 1.0);
    }

    #[test]
    fn test_invalid_uk_nhs() {
        assert_eq!(validate_uk_nhs("9434765918"), 0.0); // Bad check digit
        assert_eq!(validate_uk_nhs("0000000060"), 0.0); // Check digit would be 10
        assert_eq!(validate_uk_nhs("12345"), 0.0); // Too few digits
    }

    #[test]
    fn test_valid_iban() {
        assert_eq!(validate_iban("GB82WEST12345698765432"), 1.0);
        assert_eq!(validate_iban("DE89370400440532013000"), 1.0);
        // Grouping spaces and lower case are tolerated.
        assert_eq!(validate_iban("GB82 WEST 1234 5698 7654 32"), 1.0);
        assert_eq!(validate_iban("gb82west12345698765432"), 1.0);
    }

    #[test]
    fn test_invalid_iban() {
        assert_eq!(validate_iban("GB82WEST12345698765433"), 0.0); // Bad checksum
        assert_eq!(validate_iban("XX00000000000000"), 0.0);
        assert_eq!(validate_iban("GB82"), 0.0); // Too short
        assert_eq!(validate_iban("1282WEST12345698765432"), 0.0); // No country code
    }

    #[test]
    fn test_iban_wrong_length_for_country() {
        // Known country, wrong length: partial score.
        assert_eq!(validate_iban("GB82WEST1234569876543"), 0.5);
    }

    #[test]
    fn test_valid_isbn() {
        assert_eq!(validate_isbn("0-306-40615-2"), 1.0); // ISBN-10
        assert_eq!(validate_isbn("0-8044-2957-X"), 1.0); // ISBN-10, X check digit
        assert_eq!(validate_isbn("978-0-306-40615-7"), 1.0); // ISBN-13
    }

    #[test]
    fn test_invalid_isbn() {
        assert_eq!(validate_isbn("0-306-40615-3"), 0.0); // Bad ISBN-10 checksum
        assert_eq!(validate_isbn("978-0-306-40615-8"), 0.0); // Bad ISBN-13 checksum
        assert_eq!(validate_isbn("12345"), 0.0); // Neither 10 nor 13 characters
    }

    #[test]
    fn test_valid_ip() {
        assert_eq!(validate_ip_address("192.168.1.1"), 1.0);
        assert_eq!(validate_ip_address("0.0.0.0"), 1.0);
        assert_eq!(validate_ip_address("255.255.255.255"), 1.0);
    }

    #[test]
    fn test_invalid_ip() {
        assert_eq!(validate_ip_address("256.1.1.1"), 0.0);
        assert_eq!(validate_ip_address("1.1.1"), 0.0);
        assert_eq!(validate_ip_address("1.1.1.1.1"), 0.0);
        assert_eq!(validate_ip_address("1..1.1"), 0.0);
        assert_eq!(validate_ip_address("a.b.c.d"), 0.0);
        assert_eq!(validate_ip_address("-1.1.1.1"), 0.0);
    }
}