Skip to main content

redact_core/recognizers/
validation.rs

1// Copyright 2026 Censgate LLC.
2// Licensed under the Apache License, Version 2.0. See the LICENSE file
3// in the project root for license information.
4
5//! Validation functions for PII patterns.
6//!
7//! These functions provide additional validation beyond regex matching
8//! to reduce false positives. For example, credit card numbers must
9//! pass the Luhn checksum, and IBANs have country-specific formats.
10
11use crate::types::EntityType;
12
13/// Validate a detected entity value based on its type.
14///
15/// Returns a confidence adjustment factor:
16/// - 1.0: Validation passed or not applicable
17/// - 0.0-0.99: Validation partially passed (reduces confidence)
18/// - 0.0: Validation failed (entity should be rejected)
19pub fn validate_entity(entity_type: &EntityType, value: &str) -> f32 {
20    match entity_type {
21        EntityType::CreditCard => validate_credit_card(value),
22        EntityType::IbanCode | EntityType::Iban => validate_iban(value),
23        EntityType::UsSsn => validate_us_ssn(value),
24        EntityType::UkNino => validate_uk_nino(value),
25        EntityType::UkNhs => validate_uk_nhs(value),
26        EntityType::Isbn => validate_isbn(value),
27        EntityType::IpAddress => validate_ip_address(value),
28        _ => 1.0, // No validation available
29    }
30}
31
32/// Validate credit card number using Luhn algorithm.
33///
34/// The Luhn algorithm (mod 10) is used by most credit card issuers.
35pub fn validate_credit_card(value: &str) -> f32 {
36    let digits: Vec<u32> = value
37        .chars()
38        .filter(|c| c.is_ascii_digit())
39        .filter_map(|c| c.to_digit(10))
40        .collect();
41
42    if digits.len() < 13 || digits.len() > 19 {
43        return 0.0;
44    }
45
46    if luhn_check(&digits) {
47        1.0
48    } else {
49        0.0
50    }
51}
52
/// Luhn (mod 10) checksum over a slice of decimal digits.
///
/// Walking from the rightmost digit, every second digit is doubled (with 9
/// subtracted when the doubled value exceeds 9). The digits pass when the
/// total is divisible by 10. An empty slice sums to zero and therefore passes.
fn luhn_check(digits: &[u32]) -> bool {
    let total: u32 = digits
        .iter()
        .rev()
        .enumerate()
        .map(|(position, &digit)| {
            // Position 0 is the rightmost digit; odd positions get doubled.
            if position % 2 == 1 {
                let doubled = digit * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            } else {
                digit
            }
        })
        .sum();

    total % 10 == 0
}
72
73/// Validate IBAN format and checksum.
74///
75/// IBAN validation:
76/// 1. Check length matches country-specific requirements
77/// 2. Verify mod-97 checksum
78pub fn validate_iban(value: &str) -> f32 {
79    let cleaned: String = value.chars().filter(|c| c.is_alphanumeric()).collect();
80
81    if cleaned.len() < 15 || cleaned.len() > 34 {
82        return 0.0;
83    }
84
85    // Check country code (first 2 chars must be letters)
86    let country_code: String = cleaned.chars().take(2).collect();
87    if !country_code.chars().all(|c| c.is_ascii_alphabetic()) {
88        return 0.0;
89    }
90
91    // Validate length for known countries
92    let expected_length = get_iban_length(&country_code);
93    if expected_length > 0 && cleaned.len() != expected_length {
94        return 0.5; // Partial match - wrong length for country
95    }
96
97    // Mod-97 checksum validation
98    if validate_iban_checksum(&cleaned) {
99        1.0
100    } else {
101        0.0
102    }
103}
104
/// Look up the total IBAN length for a two-letter country code.
///
/// The code is upper-cased with `to_uppercase` before the lookup, so `"gb"`
/// and `"GB"` are treated alike. Returns `0` when the country is not in the
/// table.
fn get_iban_length(country_code: &str) -> usize {
    const LENGTHS: [(&str, usize); 16] = [
        ("GB", 22),
        ("DE", 22),
        ("FR", 27),
        ("ES", 24),
        ("IT", 27),
        ("NL", 18),
        ("BE", 16),
        ("AT", 20),
        ("CH", 21),
        ("IE", 22),
        ("PL", 28),
        ("PT", 25),
        ("SE", 24),
        ("NO", 15),
        ("DK", 18),
        ("FI", 18),
    ];

    let wanted = country_code.to_uppercase();
    LENGTHS
        .iter()
        .find(|(code, _)| *code == wanted)
        .map_or(0, |&(_, length)| length)
}
127
/// Validate the IBAN mod-97 checksum.
///
/// The first four characters are moved to the end, every letter is expanded
/// to a two-digit number (A=10, B=11, ..., Z=35, case-insensitive), and the
/// resulting decimal number must leave a remainder of 1 when divided by 97.
///
/// Returns `false` instead of panicking when `iban` is shorter than four
/// bytes, when byte offset 4 does not fall on a character boundary, or when
/// it contains anything other than ASCII letters and digits.
fn validate_iban_checksum(iban: &str) -> bool {
    // Checked slicing: `get` yields `None` for out-of-range or non-boundary
    // offsets, where plain indexing would panic.
    let (Some(head), Some(tail)) = (iban.get(..4), iban.get(4..)) else {
        return false;
    };

    // Fold the remainder one character at a time so the (potentially
    // 60+ digit) number never has to be materialised.
    let mut remainder: u32 = 0;
    for c in tail.chars().chain(head.chars()) {
        // Radix 36 maps '0'-'9' to 0-9 and 'A'-'Z' / 'a'-'z' to 10-35,
        // which is exactly the IBAN letter-to-number table.
        let Some(value) = c.to_digit(36) else {
            return false;
        };
        // A digit appends one decimal place, a letter appends two.
        let scale = if value < 10 { 10 } else { 100 };
        remainder = (remainder * scale + value) % 97;
    }

    remainder == 1
}
154
/// Validate the structure of a US Social Security Number.
///
/// Non-digit characters are ignored; exactly nine digits must remain. They
/// are read as area (3 digits), group (2 digits) and serial (4 digits):
/// - the area cannot be 000, 666, or 900-999
/// - the group cannot be 00
/// - the serial cannot be 0000
///
/// Returns `1.0` when all rules hold and `0.0` otherwise.
pub fn validate_us_ssn(value: &str) -> f32 {
    let digits: Vec<u32> = value.chars().filter_map(|c| c.to_digit(10)).collect();
    if digits.len() != 9 {
        return 0.0;
    }

    // Combine a run of decimal digits into the number they spell.
    let to_number = |part: &[u32]| part.iter().fold(0u32, |acc, &d| acc * 10 + d);
    let area = to_number(&digits[..3]);
    let group = to_number(&digits[3..5]);
    let serial = to_number(&digits[5..]);

    let area_rejected = matches!(area, 0 | 666) || area >= 900;
    if area_rejected || group == 0 || serial == 0 {
        0.0
    } else {
        1.0
    }
}
184
/// Validate UK National Insurance Number format.
///
/// Non-alphanumeric characters are stripped and the rest is upper-cased
/// before checking.
///
/// NINO format: 2 letters + 6 digits + 1 letter (A-D)
/// - First letter cannot be D, F, I, Q, U, V
/// - Second letter cannot be D, F, I, O, Q, U, V
/// - Prefixes BG, GB, NK, KN, TN, NT, ZZ are invalid
///
/// Returns `1.0` when the value matches the format and `0.0` otherwise.
pub fn validate_uk_nino(value: &str) -> f32 {
    const INVALID_PREFIXES: [[u8; 2]; 7] =
        [*b"BG", *b"GB", *b"NK", *b"KN", *b"TN", *b"NT", *b"ZZ"];

    let cleaned: String = value
        .chars()
        .filter(|c| c.is_alphanumeric())
        .collect::<String>()
        .to_uppercase();

    // A NINO is nine ASCII characters; requiring ASCII also makes the
    // byte-wise inspection below equivalent to a per-character one.
    if cleaned.len() != 9 || !cleaned.is_ascii() {
        return 0.0;
    }

    let [first, second, middle @ .., suffix] = cleaned.as_bytes() else {
        return 0.0;
    };

    // The prefix must really be two letters. Without this check a value such
    // as "12123456A" slips through, because a digit is trivially "not one of
    // the forbidden letters".
    if !first.is_ascii_uppercase() || !second.is_ascii_uppercase() {
        return 0.0;
    }

    // Check invalid prefixes
    if INVALID_PREFIXES.contains(&[*first, *second]) {
        return 0.0;
    }

    // Check per-position letter restrictions
    if b"DFIQUV".contains(first) || b"DFIOQUV".contains(second) {
        return 0.0;
    }

    // Check suffix is A-D
    if !b"ABCD".contains(suffix) {
        return 0.0;
    }

    // Check the six characters in between are digits
    if !middle.iter().all(u8::is_ascii_digit) {
        return 0.0;
    }

    1.0
}
236
/// Validate a UK NHS Number with its mod-11 check digit.
///
/// Non-digit characters are ignored; exactly ten digits must remain. The
/// first nine are multiplied by the weights 10 down to 2 and summed, and the
/// expected check digit is `11 - (sum % 11)`, with 11 mapped to 0. A result of
/// 10 marks the number as invalid.
///
/// Returns `1.0` when the tenth digit equals the expected check digit and
/// `0.0` otherwise.
pub fn validate_uk_nhs(value: &str) -> f32 {
    let digits: Vec<u32> = value.chars().filter_map(|c| c.to_digit(10)).collect();

    // Split into the nine payload digits and the trailing check digit.
    let [payload @ .., check] = digits.as_slice() else {
        return 0.0;
    };
    if payload.len() != 9 {
        return 0.0;
    }

    let weighted: u32 = payload
        .iter()
        .zip((2..=10u32).rev())
        .map(|(digit, weight)| digit * weight)
        .sum();

    let expected = match 11 - weighted % 11 {
        11 => 0,
        other => other,
    };

    // An expected value of 10 cannot be written as one digit: invalid number.
    if expected != 10 && *check == expected {
        1.0
    } else {
        0.0
    }
}
271
272/// Validate ISBN-10 or ISBN-13 checksum.
273pub fn validate_isbn(value: &str) -> f32 {
274    let cleaned: String = value
275        .chars()
276        .filter(|c| c.is_ascii_digit() || *c == 'X' || *c == 'x')
277        .collect();
278
279    match cleaned.len() {
280        10 => validate_isbn10(&cleaned),
281        13 => validate_isbn13(&cleaned),
282        _ => 0.0,
283    }
284}
285
/// Validate an ISBN-10 checksum.
///
/// `isbn` must be exactly ten characters: nine ASCII digits followed by a
/// check character that is either a digit or `X`/`x` (standing for 10). The
/// characters are weighted 10 down to 1 from left to right, and the weighted
/// sum must be divisible by 11.
///
/// Returns `1.0` for a valid checksum and `0.0` for anything else, including
/// input of the wrong length, non-digit characters, or an `X` anywhere but
/// the final position.
fn validate_isbn10(isbn: &str) -> f32 {
    if isbn.chars().count() != 10 {
        return 0.0;
    }

    let mut sum = 0u32;
    for (weight, c) in (1..=10u32).rev().zip(isbn.chars()) {
        let digit = match c {
            // `X` is only meaningful as the check character (weight 1).
            'X' | 'x' if weight == 1 => 10,
            _ => match c.to_digit(10) {
                Some(d) => d,
                None => return 0.0,
            },
        };
        sum += digit * weight;
    }

    if sum % 11 == 0 {
        1.0
    } else {
        0.0
    }
}
303
/// Validate an ISBN-13 checksum.
///
/// Characters that are not ASCII digits are skipped; exactly thirteen digits
/// must remain. The digits are weighted 1, 3, 1, 3, ... from left to right
/// and the weighted sum must be divisible by 10.
///
/// Returns `1.0` for a valid checksum and `0.0` otherwise.
fn validate_isbn13(isbn: &str) -> f32 {
    let digits: Vec<u32> = isbn.chars().filter_map(|c| c.to_digit(10)).collect();
    if digits.len() != 13 {
        return 0.0;
    }

    // Pair each digit with the repeating 1, 3 weight pattern.
    let total: u32 = digits
        .iter()
        .zip([1u32, 3].iter().cycle())
        .map(|(digit, weight)| digit * weight)
        .sum();

    match total % 10 {
        0 => 1.0,
        _ => 0.0,
    }
}
323
/// Validate a dotted-quad IPv4 address.
///
/// The value must consist of exactly four `.`-separated octets. Each octet
/// must be non-empty, contain ASCII decimal digits only, and have a value in
/// `0..=255`. Leading zeros are accepted. Any other text, including IPv6
/// notation, yields `0.0`.
///
/// Returns `1.0` when the address is well-formed and `0.0` otherwise.
pub fn validate_ip_address(value: &str) -> f32 {
    fn is_valid_octet(octet: &str) -> bool {
        // Integer `str::parse` accepts a leading `+`, so "+1" would parse
        // successfully; require plain digits before parsing. Parsing as `u8`
        // enforces the 0..=255 range.
        !octet.is_empty()
            && octet.bytes().all(|b| b.is_ascii_digit())
            && octet.parse::<u8>().is_ok()
    }

    let octets: Vec<&str> = value.split('.').collect();
    if octets.len() == 4 && octets.iter().all(|octet| is_valid_octet(octet)) {
        1.0
    } else {
        0.0
    }
}
341
// Unit tests for the validators. Expected values are the confidence
// adjustment factors returned by each function (1.0 = valid, 0.0 = rejected,
// 0.5 = IBAN with the wrong length for its country).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_luhn_valid_cards() {
        // Valid test card numbers
        assert_eq!(validate_credit_card("4532015112830366"), 1.0);
        assert_eq!(validate_credit_card("5425233430109903"), 1.0);
        assert_eq!(validate_credit_card("374245455400126"), 1.0);
        // Separators are ignored
        assert_eq!(validate_credit_card("4111 1111 1111 1111"), 1.0);
    }

    #[test]
    fn test_luhn_invalid_cards() {
        assert_eq!(validate_credit_card("4532015112830367"), 0.0);
        assert_eq!(validate_credit_card("1234567890123456"), 0.0);
        assert_eq!(validate_credit_card("123456789012"), 0.0); // Too short
    }

    #[test]
    fn test_valid_ssn() {
        assert_eq!(validate_us_ssn("123-45-6789"), 1.0);
        assert_eq!(validate_us_ssn("123456789"), 1.0);
    }

    #[test]
    fn test_invalid_ssn() {
        assert_eq!(validate_us_ssn("000-12-3456"), 0.0); // Invalid area
        assert_eq!(validate_us_ssn("666-12-3456"), 0.0); // Invalid area
        assert_eq!(validate_us_ssn("900-12-3456"), 0.0); // Invalid area
        assert_eq!(validate_us_ssn("123-00-3456"), 0.0); // Invalid group
        assert_eq!(validate_us_ssn("123-45-0000"), 0.0); // Invalid serial
    }

    #[test]
    fn test_valid_uk_nino() {
        assert_eq!(validate_uk_nino("AB123456C"), 1.0);
        assert_eq!(validate_uk_nino("JG103759A"), 1.0);
    }

    #[test]
    fn test_invalid_uk_nino() {
        assert_eq!(validate_uk_nino("BG123456A"), 0.0); // Invalid prefix
        assert_eq!(validate_uk_nino("DA123456A"), 0.0); // Invalid first letter
        assert_eq!(validate_uk_nino("AB123456E"), 0.0); // Invalid suffix
    }

    #[test]
    fn test_valid_uk_nhs() {
        assert_eq!(validate_uk_nhs("943 476 5919"), 1.0);
        assert_eq!(validate_uk_nhs("9434765919"), 1.0);
    }

    #[test]
    fn test_invalid_uk_nhs() {
        assert_eq!(validate_uk_nhs("9434765918"), 0.0); // Bad check digit
        assert_eq!(validate_uk_nhs("0000000060"), 0.0); // Check value of 10
        assert_eq!(validate_uk_nhs("943476591"), 0.0); // Too short
    }

    #[test]
    fn test_valid_iban() {
        assert_eq!(validate_iban("GB82WEST12345698765432"), 1.0);
        assert_eq!(validate_iban("DE89370400440532013000"), 1.0);
        assert_eq!(validate_iban("GB82 WEST 1234 5698 7654 32"), 1.0);
    }

    #[test]
    fn test_invalid_iban() {
        assert_eq!(validate_iban("GB82WEST12345698765433"), 0.0); // Bad checksum
        assert_eq!(validate_iban("XX00000000000000"), 0.0);
    }

    #[test]
    fn test_iban_wrong_length_for_country() {
        // 21 characters, but GB IBANs are 22 long
        assert_eq!(validate_iban("GB82WEST1234569876543"), 0.5);
    }

    #[test]
    fn test_valid_isbn() {
        assert_eq!(validate_isbn("0-306-40615-2"), 1.0); // ISBN-10
        assert_eq!(validate_isbn("0-8044-2957-X"), 1.0); // ISBN-10, X check digit
        assert_eq!(validate_isbn("978-0-306-40615-7"), 1.0); // ISBN-13
    }

    #[test]
    fn test_invalid_isbn() {
        assert_eq!(validate_isbn("0-306-40615-3"), 0.0); // Bad ISBN-10 checksum
        assert_eq!(validate_isbn("978-0-306-40615-8"), 0.0); // Bad ISBN-13 checksum
        assert_eq!(validate_isbn("12345"), 0.0); // Wrong length
    }

    #[test]
    fn test_valid_ip() {
        assert_eq!(validate_ip_address("192.168.1.1"), 1.0);
        assert_eq!(validate_ip_address("0.0.0.0"), 1.0);
        assert_eq!(validate_ip_address("255.255.255.255"), 1.0);
    }

    #[test]
    fn test_invalid_ip() {
        assert_eq!(validate_ip_address("256.1.1.1"), 0.0);
        assert_eq!(validate_ip_address("1.1.1"), 0.0);
        assert_eq!(validate_ip_address("1.1.1.1.1"), 0.0);
    }
}