redact-ner 0.8.3

Named Entity Recognition for PII detection using ONNX Runtime
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
/// End-to-End NER Testing with ONNX Models
///
/// This test suite validates complete NER functionality with actual ONNX models.
/// Tests are ignored by default since they require downloading models.
///
/// # Setup Instructions
///
/// 1. Download and export an NER model to ONNX (the `dslim/bert-base-NER` model used below is ~420MB):
///    ```bash
///    python scripts/export_ner_model.py \
///        --model dslim/bert-base-NER \
///        --output tests/fixtures/models/bert-base-ner
///    ```
///
/// 2. Run tests with the ignored flag:
///    ```bash
///    cargo test --package redact-ner --test ner_e2e -- --ignored
///    ```
///
/// # Test Models
///
/// Recommended models for testing:
/// - `dslim/bert-base-NER` (~420MB) - Excellent accuracy, CoNLL-2003 trained
/// - `dbmdz/bert-large-cased-finetuned-conll03-english` (~1.2GB) - High accuracy
/// - `Davlan/distilbert-base-multilingual-cased-ner-hrl` (~500MB) - Multilingual
///
/// For faster CI testing, use quantized or distilled models (~50-100MB).
///
/// # Model Directory Structure
///
/// The export script creates a directory with the following structure:
/// ```text
/// models/bert-base-ner/
/// ├── model.onnx          # ONNX model file (required)
/// ├── tokenizer.json      # HuggingFace tokenizer (required)
/// ├── config.json         # Model configuration with label mappings
/// ├── special_tokens_map.json
/// └── tokenizer_config.json
/// ```
///
/// Only `model.onnx` and `tokenizer.json` are required for inference.
use anyhow::Result;
use redact_core::{
    anonymizers::{AnonymizationStrategy, AnonymizerConfig},
    AnalyzerEngine, EntityType, Recognizer,
};
use redact_ner::{NerConfig, NerRecognizer};
use std::path::Path;
use std::sync::Arc;

/// Test fixture with expected entities
///
/// Pairs one input sentence with the entities that are expected to be found in it.
struct NerTestCase {
    /// Input text handed to the recognizer.
    text: &'static str,
    /// `(entity type, exact matched text)` pairs expected among the results.
    expected_entities: Vec<(EntityType, &'static str)>,
}

/// Get test cases for NER validation
fn get_test_cases() -> Vec<NerTestCase> {
    vec![
        NerTestCase {
            text: "John Doe works at Microsoft in Seattle.",
            expected_entities: vec![
                (EntityType::Person, "John Doe"),
                (EntityType::Organization, "Microsoft"),
                (EntityType::Location, "Seattle"),
            ],
        },
        NerTestCase {
            text: "Marie Curie conducted research in Paris.",
            expected_entities: vec![
                (EntityType::Person, "Marie Curie"),
                (EntityType::Location, "Paris"),
            ],
        },
        NerTestCase {
            text: "Apple Inc. was founded by Steve Jobs in California.",
            expected_entities: vec![
                (EntityType::Organization, "Apple Inc."),
                (EntityType::Person, "Steve Jobs"),
                (EntityType::Location, "California"),
            ],
        },
    ]
}

/// Report whether `path` contains both files needed for inference.
///
/// Returns `true` only when `model.onnx` and `tokenizer.json` both exist
/// directly inside `path`.
fn model_exists(path: &str) -> bool {
    let dir = Path::new(path);
    ["model.onnx", "tokenizer.json"]
        .iter()
        .all(|file_name| dir.join(file_name).exists())
}

/// Check that the BERT-base model finds every expected entity in the shared fixtures.
///
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires downloading model first
fn test_ner_with_bert_base() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        eprintln!("Model not found at: {}", model_dir);
        eprintln!(
            "Run: python scripts/export_ner_model.py --model dslim/bert-base-NER --output {}",
            model_dir
        );
        return Ok(()); // Skip test if model not available
    }

    let recognizer = NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    })?;
    assert!(
        recognizer.is_available(),
        "NER should be available with model"
    );

    // Every fixture must yield all of its expected (type, text) pairs.
    for NerTestCase {
        text,
        expected_entities,
    } in get_test_cases()
    {
        let detected = recognizer.analyze(text, "en")?;

        for (wanted_type, wanted_text) in &expected_entities {
            let hit = detected.iter().any(|entity| {
                entity.entity_type == *wanted_type
                    && entity.text.as_deref() == Some(*wanted_text)
            });

            assert!(
                hit,
                "Expected to find {:?} '{}' in text: '{}'\nDetected: {:?}",
                wanted_type, wanted_text, text, detected
            );
        }
    }

    Ok(())
}

/// Check that the multilingual model detects at least one entity per language sample.
///
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires downloading model first
fn test_ner_multilingual() -> Result<()> {
    let model_dir = "tests/fixtures/models/multilingual-ner";

    if !model_exists(model_dir) {
        eprintln!("Multilingual model not found");
        return Ok(());
    }

    let recognizer = NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    })?;

    // One (language code, sentence) sample per language under test.
    let samples = [
        ("en", "Barack Obama visited London."),
        ("es", "Gabriel García Márquez nació en Colombia."),
        ("fr", "Emmanuel Macron est le président de la France."),
        ("de", "Angela Merkel war Bundeskanzlerin von Deutschland."),
    ];

    for &(lang, text) in &samples {
        let detected = recognizer.analyze(text, lang)?;
        assert!(
            !detected.is_empty(),
            "Should detect entities in {}: {}",
            lang,
            text
        );
    }

    Ok(())
}

/// Test NER character offset accuracy
///
/// For every detected entity, slicing the input with the reported
/// `start..end` range must reproduce the entity's reported text.
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_character_offsets() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let model_path = format!("{}/model.onnx", model_dir);
    let config = NerConfig {
        model_path,
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    };

    let recognizer = NerRecognizer::from_config(config)?;

    let text = "John Doe works at Microsoft.";
    let results = recognizer.analyze(text, "en")?;

    // Without this guard the loop below would pass vacuously if nothing was detected.
    assert!(
        !results.is_empty(),
        "Should detect entities in: '{}'",
        text
    );

    // Verify character offsets are accurate
    for result in &results {
        // Checked slicing: a bad range fails with a descriptive message instead of
        // an opaque slice-index panic.
        let extracted = text.get(result.start..result.end).unwrap_or_else(|| {
            panic!(
                "Offsets {}..{} are out of range or not on a char boundary in '{}'",
                result.start, result.end, text
            )
        });
        let reported = result
            .text
            .as_deref()
            .expect("NER result should carry the matched text");

        assert_eq!(
            extracted, reported,
            "Character offsets should extract exact text"
        );
    }

    Ok(())
}

/// Test NER with long text (max sequence length handling)
///
/// Feeds the recognizer an input that is much longer than the configured
/// `max_seq_length` and checks that analysis still succeeds and reports entities.
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_long_text() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let model_path = format!("{}/model.onnx", model_dir);
    let config = NerConfig {
        model_path,
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        max_seq_length: 128, // Test with smaller sequence length
        ..Default::default()
    };

    let recognizer = NerRecognizer::from_config(config)?;

    // One paragraph holds 30 words. Assuming the tokenizer emits at least one token
    // per word, 20 repetitions give 600+ tokens, well past the 128-token limit
    // configured above, so the over-length path is actually exercised.
    const PARAGRAPH: &str = "John Smith works at Microsoft. Jane Doe works at Apple. \
                             Bob Johnson works at Google. Alice Williams works at Amazon. \
                             Charlie Brown works at Facebook. Diana Prince works at Tesla. ";
    const REPETITIONS: usize = 20;
    let long_text = PARAGRAPH.repeat(REPETITIONS);

    let results = recognizer.analyze(&long_text, "en")?;

    // Should detect at least the entities within max_seq_length
    assert!(
        !results.is_empty(),
        "Should detect entities even in long text"
    );

    Ok(())
}

/// Check that an `AnalyzerEngine` with the NER recognizer registered reports
/// email, SSN and person entities for one mixed sentence.
///
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_with_analyzer_engine() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let ner = NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    })?;

    // Register the NER recognizer on a freshly created engine.
    let mut engine = AnalyzerEngine::new();
    engine
        .recognizer_registry_mut()
        .add_recognizer(Arc::new(ner));

    let text = "Contact John Doe at john@example.com or visit Microsoft.com. SSN: 123-45-6789.";
    let analysis = engine.analyze(text, None)?;

    // True when at least one detected entity has the given type.
    let detected = |kind: EntityType| {
        analysis
            .detected_entities
            .iter()
            .any(|entity| entity.entity_type == kind)
    };

    let has_email = detected(EntityType::EmailAddress);
    let has_ssn = detected(EntityType::UsSsn);
    let has_person = detected(EntityType::Person);

    assert!(has_email, "Should detect email (pattern-based)");
    assert!(has_ssn, "Should detect SSN (pattern-based)");
    assert!(has_person, "Should detect person (NER-based)");

    Ok(())
}

/// Benchmark NER inference latency
///
/// Runs one warm-up call, times 100 further calls on a short sentence, prints the
/// average latency and throughput, and fails when the average reaches 100ms.
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model and --ignored flag
fn test_ner_performance() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let model_path = format!("{}/model.onnx", model_dir);
    let config = NerConfig {
        model_path,
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    };

    let recognizer = NerRecognizer::from_config(config)?;

    let text = "John Doe works at Microsoft in Seattle.";

    // Warm-up
    let _ = recognizer.analyze(text, "en")?;

    // Measure inference time
    let iterations: u32 = 100;
    let start = std::time::Instant::now();

    for _ in 0..iterations {
        let _ = recognizer.analyze(text, "en")?;
    }

    let elapsed = start.elapsed();
    let avg_latency = elapsed / iterations;

    println!("Average NER inference latency: {:?}", avg_latency);
    // Use fractional seconds: `as_millis()` truncates to whole milliseconds, so a
    // sub-millisecond average would divide by zero and report `inf`.
    println!(
        "Throughput: {:.2} req/s",
        f64::from(iterations) / elapsed.as_secs_f64()
    );

    // Assert reasonable performance (adjust based on hardware)
    assert!(
        avg_latency < std::time::Duration::from_millis(100),
        "NER inference should be < 100ms on average (was {:?})",
        avg_latency
    );

    Ok(())
}

/// Run the same recognizer from four threads at once, ten analyses per thread.
///
/// Each thread panics if an analysis fails or finds no entities; joining the
/// threads propagates that panic into the test.
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_thread_safety() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let recognizer = Arc::new(NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    })?);

    // Start every worker before joining any of them.
    let workers: Vec<_> = (0..4)
        .map(|i| {
            let rec = Arc::clone(&recognizer);
            std::thread::spawn(move || {
                let text = format!("Thread {} analyzing John Doe at Microsoft.", i);
                for _ in 0..10 {
                    let results = rec.analyze(&text, "en").unwrap();
                    assert!(!results.is_empty(), "Thread {} should detect entities", i);
                }
            })
        })
        .collect();

    // Wait for all threads
    for worker in workers {
        worker.join().unwrap();
    }

    Ok(())
}

/// Exercise the recognizer with degenerate inputs.
///
/// Empty, whitespace-only and punctuation-only inputs must produce no entities;
/// a very short sentence only has to be analysed without error.
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_edge_cases() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let recognizer = NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    })?;

    // (input, failure message) pairs that must yield zero entities:
    // empty string, only whitespace, special characters.
    let must_be_empty = [
        ("", "Empty text should return no entities"),
        ("   \n\t  ", "Whitespace should return no entities"),
        ("!@#$%^&*()", "Special chars should return no entities"),
    ];

    for &(input, message) in &must_be_empty {
        let results = recognizer.analyze(input, "en")?;
        assert_eq!(results.len(), 0, "{}", message);
    }

    // Very short text
    let _results = recognizer.analyze("Hi.", "en")?;
    // May or may not detect entities - just ensure it doesn't crash

    Ok(())
}

/// End-to-end check of detection followed by anonymization.
///
/// Pipeline under test:
/// 1. Load the NER model and register it on an `AnalyzerEngine`
/// 2. Detect person, organization, location and email entities
/// 3. Anonymize the text with the replace strategy
/// 4. Verify the originals are gone and placeholders are present
///
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_with_redaction() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        eprintln!("Model not found at: {}", model_dir);
        eprintln!(
            "Run: python scripts/export_ner_model.py --model dslim/bert-base-NER --output {}",
            model_dir
        );
        return Ok(()); // Skip test if model not available
    }

    let ner = NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.7,
        ..Default::default()
    })?;

    // Create analyzer engine with NER
    let mut engine = AnalyzerEngine::new();
    engine
        .recognizer_registry_mut()
        .add_recognizer(Arc::new(ner));

    // Test text with multiple entity types
    let text = "John Doe works at Microsoft in Seattle. Contact him at john@example.com.";

    // Step 1: Analyze and verify detection
    let analysis = engine.analyze(text, None)?;

    // True when at least one detected entity has the given type.
    let detected = |kind: EntityType| {
        analysis
            .detected_entities
            .iter()
            .any(|entity| entity.entity_type == kind)
    };

    let has_person = detected(EntityType::Person);
    let has_org = detected(EntityType::Organization);
    let has_location = detected(EntityType::Location);
    let has_email = detected(EntityType::EmailAddress);

    assert!(has_person, "Should detect PERSON (John Doe)");
    assert!(has_org, "Should detect ORGANIZATION (Microsoft)");
    assert!(has_location, "Should detect LOCATION (Seattle)");
    assert!(has_email, "Should detect EMAIL (john@example.com)");

    // Step 2: Anonymize with replace strategy
    let anonymized = engine.anonymize(
        text,
        None,
        &AnonymizerConfig {
            strategy: AnonymizationStrategy::Replace,
            ..Default::default()
        },
    )?;
    let redacted = &anonymized.text;

    // Step 3: Verify redaction
    assert!(
        !redacted.contains("John Doe"),
        "Person name should be redacted. Got: {}",
        redacted
    );
    assert!(
        !redacted.contains("Microsoft"),
        "Organization should be redacted. Got: {}",
        redacted
    );
    assert!(
        !redacted.contains("Seattle"),
        "Location should be redacted. Got: {}",
        redacted
    );
    assert!(
        !redacted.contains("john@example.com"),
        "Email should be redacted. Got: {}",
        redacted
    );

    // Verify placeholders are present
    assert!(
        redacted.contains("[PERSON]"),
        "Should have [PERSON] placeholder"
    );
    assert!(
        redacted.contains("[EMAIL_ADDRESS]"),
        "Should have [EMAIL_ADDRESS] placeholder"
    );

    println!("Original: {}", text);
    println!("Redacted: {}", redacted);

    Ok(())
}

/// Check that person, organization and location entities are each detected
/// in a sentence chosen for that type.
///
/// Returns `Ok(())` without running any assertion when the model files are absent.
#[test]
#[ignore] // Requires model
fn test_ner_entity_types() -> Result<()> {
    let model_dir = "tests/fixtures/models/bert-base-ner";

    if !model_exists(model_dir) {
        return Ok(());
    }

    let recognizer = NerRecognizer::from_config(NerConfig {
        model_path: format!("{}/model.onnx", model_dir),
        tokenizer_path: Some(format!("{}/tokenizer.json", model_dir)),
        min_confidence: 0.5, // Lower threshold to catch all entities
        ..Default::default()
    })?;

    // One (sentence, entity type that must be found) sample per entity type.
    let samples = [
        ("Marie Curie won the Nobel Prize.", EntityType::Person),
        (
            "Apple Inc. is based in Cupertino.",
            EntityType::Organization,
        ),
        ("The Eiffel Tower is in Paris.", EntityType::Location),
    ];

    for (text, wanted_type) in samples.iter() {
        let detected = recognizer.analyze(*text, "en")?;

        assert!(
            detected
                .iter()
                .any(|entity| entity.entity_type == *wanted_type),
            "Should detect {:?} in: '{}'\nDetected: {:?}",
            wanted_type,
            text,
            detected
        );
    }

    Ok(())
}