1use serde::{Deserialize, Serialize};
31
32use super::candidate::Candidate;
33
34#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
36pub enum NilReason {
37 NoCandidates,
39 LowConfidence,
41 TypeMismatch,
43 NoisyMention,
45 LargeMargin,
47 ExplicitNil,
49 OutOfKnowledgebase,
54 EmergingEntity,
56}
57
58impl std::fmt::Display for NilReason {
59 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60 match self {
61 Self::NoCandidates => write!(f, "no_candidates"),
62 Self::LowConfidence => write!(f, "low_confidence"),
63 Self::TypeMismatch => write!(f, "type_mismatch"),
64 Self::NoisyMention => write!(f, "noisy_mention"),
65 Self::LargeMargin => write!(f, "large_margin"),
66 Self::ExplicitNil => write!(f, "explicit_nil"),
67 Self::OutOfKnowledgebase => write!(f, "out_of_kb"),
68 Self::EmergingEntity => write!(f, "emerging_entity"),
69 }
70 }
71}
72
/// Heuristic detector that decides whether a mention should be treated as NIL
/// (not linkable to a knowledge-base entry) based on its text and its candidates.
#[derive(Debug, Clone)]
pub struct NilDetector {
    /// A best candidate score below this value yields `NilReason::LowConfidence`.
    score_threshold: f64,
    /// Configurable margin value.
    ///
    /// NOTE(review): the checks visible in this file compare the top-two score gap against a
    /// hard-coded 0.1 and never read this field — confirm whether it should drive that check.
    margin_threshold: f64,
    /// Minimum length of the trimmed mention; it is compared against `str::len`, i.e. a
    /// length in UTF-8 bytes rather than characters.
    min_mention_length: usize,
    /// Fewer candidates than this yields `NilReason::NoCandidates`.
    min_candidates: usize,
    /// A best embedding similarity below this value yields `NilReason::OutOfKnowledgebase`.
    out_of_kb_threshold: f64,
    /// When set, the embedding-aware analysis recommends creating a KB entry instead of a
    /// manual review for unlinked mentions that look like entity names.
    prefer_create_over_skip: bool,
}
104
105impl Default for NilDetector {
106 fn default() -> Self {
107 Self {
108 score_threshold: 0.3,
109 margin_threshold: 0.8,
110 min_mention_length: 2,
111 min_candidates: 1,
112 out_of_kb_threshold: 0.5, prefer_create_over_skip: false,
114 }
115 }
116}
117
118impl NilDetector {
119 pub fn new() -> Self {
121 Self::default()
122 }
123
124 pub fn with_score_threshold(mut self, threshold: f64) -> Self {
126 self.score_threshold = threshold;
127 self
128 }
129
130 pub fn with_margin_threshold(mut self, threshold: f64) -> Self {
132 self.margin_threshold = threshold;
133 self
134 }
135
136 pub fn with_out_of_kb_threshold(mut self, threshold: f64) -> Self {
143 self.out_of_kb_threshold = threshold;
144 self
145 }
146
147 pub fn with_prefer_create(mut self, prefer: bool) -> Self {
152 self.prefer_create_over_skip = prefer;
153 self
154 }
155
156 pub fn check_nil(
160 &self,
161 mention: &str,
162 candidates: &[Candidate],
163 ner_type: Option<&str>,
164 ) -> Option<NilReason> {
165 if self.is_noisy_mention(mention) {
167 return Some(NilReason::NoisyMention);
168 }
169
170 if candidates.len() < self.min_candidates {
172 return Some(NilReason::NoCandidates);
173 }
174
175 if let Some(ner_t) = ner_type {
177 let has_compatible = candidates.iter().any(|c| {
178 c.kb_type
179 .as_ref()
180 .map(|kt| super::candidate::type_compatibility(Some(ner_t), Some(kt)) > 0.5)
181 .unwrap_or(true) });
183 if !has_compatible {
184 return Some(NilReason::TypeMismatch);
185 }
186 }
187
188 let top_score = candidates
190 .iter()
191 .map(|c| c.score)
192 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
193 .unwrap_or(0.0);
194
195 if top_score < self.score_threshold {
197 return Some(NilReason::LowConfidence);
198 }
199
200 if candidates.len() >= 2 {
202 let mut scores: Vec<f64> = candidates.iter().map(|c| c.score).collect();
203 scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
204
205 let margin = scores[0] - scores[1];
206 if margin < 0.1 && top_score < 0.6 {
208 return Some(NilReason::LargeMargin);
210 }
211 }
212
213 None }
215
216 fn is_noisy_mention(&self, mention: &str) -> bool {
218 let trimmed = mention.trim();
219
220 if trimmed.len() < self.min_mention_length {
222 return true;
223 }
224
225 if trimmed.chars().all(|c| c.is_numeric() || c.is_whitespace()) {
227 return true;
228 }
229
230 if trimmed
232 .chars()
233 .all(|c| c.is_ascii_punctuation() || c.is_whitespace())
234 {
235 return true;
236 }
237
238 if trimmed.chars().count() == 1 && !trimmed.chars().next().map(is_cjk).unwrap_or(false) {
240 return true;
241 }
242
243 false
244 }
245}
246
/// Returns `true` when `c` lies in one of the CJK ideograph ranges listed below.
///
/// Kana, Hangul and any other ideograph ranges are not covered.
fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'         // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}'   // CJK Unified Ideographs Extension A
            | '\u{20000}'..='\u{2A6DF}' // CJK Unified Ideographs Extension B
            | '\u{F900}'..='\u{FAFF}'   // CJK Compatibility Ideographs
            | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
    )
}
257
258#[derive(Debug, Clone)]
262pub struct CandidateWithEmbedding<'a> {
263 pub candidate: &'a Candidate,
265 pub embedding_similarity: f64,
267}
268
269impl NilDetector {
270 pub fn check_out_of_kb(&self, candidates: &[CandidateWithEmbedding]) -> Option<NilReason> {
279 if candidates.is_empty() {
280 return None; }
282
283 let best_similarity = candidates
284 .iter()
285 .map(|c| c.embedding_similarity)
286 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
287 .unwrap_or(0.0);
288
289 if best_similarity < self.out_of_kb_threshold {
290 Some(NilReason::OutOfKnowledgebase)
291 } else {
292 None
293 }
294 }
295
296 pub fn check_nil_with_embeddings(
301 &self,
302 mention: &str,
303 candidates: &[CandidateWithEmbedding],
304 ner_type: Option<&str>,
305 ) -> Option<NilReason> {
306 if self.is_noisy_mention(mention) {
308 return Some(NilReason::NoisyMention);
309 }
310
311 if candidates.is_empty() {
313 return Some(NilReason::NoCandidates);
314 }
315
316 if let Some(reason) = self.check_out_of_kb(candidates) {
319 return Some(reason);
320 }
321
322 let base_candidates: Vec<&Candidate> = candidates.iter().map(|c| c.candidate).collect();
324
325 if let Some(ner_t) = ner_type {
327 let has_compatible = base_candidates.iter().any(|c| {
328 c.kb_type
329 .as_ref()
330 .map(|kt| super::candidate::type_compatibility(Some(ner_t), Some(kt)) > 0.5)
331 .unwrap_or(true)
332 });
333 if !has_compatible {
334 return Some(NilReason::TypeMismatch);
335 }
336 }
337
338 let top_score = base_candidates
340 .iter()
341 .map(|c| c.score)
342 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
343 .unwrap_or(0.0);
344
345 if top_score < self.score_threshold {
347 return Some(NilReason::LowConfidence);
348 }
349
350 if base_candidates.len() >= 2 {
352 let mut scores: Vec<f64> = base_candidates.iter().map(|c| c.score).collect();
353 scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
354
355 let margin = scores[0] - scores[1];
356 if margin < 0.1 && top_score < 0.6 {
357 return Some(NilReason::LargeMargin);
358 }
359 }
360
361 None
362 }
363
364 pub fn analyze_with_embeddings(
368 &self,
369 mention: &str,
370 candidates: &[CandidateWithEmbedding],
371 ner_type: Option<&str>,
372 ) -> NilAnalysis {
373 let nil_result = self.check_nil_with_embeddings(mention, candidates, ner_type);
374
375 match nil_result {
376 None => {
377 let best_sim = candidates
378 .iter()
379 .map(|c| c.embedding_similarity)
380 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
381 .unwrap_or(0.0);
382
383 NilAnalysis {
384 is_nil: false,
385 reason: None,
386 confidence: best_sim,
387 action: NilAction::Link,
388 }
389 }
390 Some(reason) => {
391 let (confidence, action) = match &reason {
392 NilReason::NoCandidates => {
393 if is_likely_entity_name(mention) {
394 (
395 0.7,
396 if self.prefer_create_over_skip {
397 NilAction::CreateEntry
398 } else {
399 NilAction::Review
400 },
401 )
402 } else {
403 (0.9, NilAction::Skip)
404 }
405 }
406 NilReason::OutOfKnowledgebase => {
407 if is_likely_entity_name(mention) {
410 (
411 0.8,
412 if self.prefer_create_over_skip {
413 NilAction::CreateEntry
414 } else {
415 NilAction::Review
416 },
417 )
418 } else {
419 (0.6, NilAction::Review)
420 }
421 }
422 NilReason::EmergingEntity => (0.7, NilAction::CreateEntry),
423 NilReason::LowConfidence => (0.6, NilAction::Review),
424 NilReason::TypeMismatch => (0.8, NilAction::Review),
425 NilReason::NoisyMention => (0.95, NilAction::Skip),
426 NilReason::LargeMargin => (0.5, NilAction::Review),
427 NilReason::ExplicitNil => (1.0, NilAction::Skip),
428 };
429
430 NilAnalysis {
431 is_nil: true,
432 reason: Some(reason),
433 confidence,
434 action,
435 }
436 }
437 }
438 }
439}
440
441#[derive(Debug, Clone, Serialize, Deserialize)]
443pub struct NilAnalysis {
444 pub is_nil: bool,
446 pub reason: Option<NilReason>,
448 pub confidence: f64,
450 pub action: NilAction,
452}
453
454#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
456pub enum NilAction {
457 Link,
459 Skip,
461 Review,
463 CreateEntry,
465}
466
467impl NilDetector {
468 pub fn analyze(
470 &self,
471 mention: &str,
472 candidates: &[Candidate],
473 ner_type: Option<&str>,
474 ) -> NilAnalysis {
475 let nil_result = self.check_nil(mention, candidates, ner_type);
476
477 match nil_result {
478 None => {
479 let top_score = candidates
481 .iter()
482 .map(|c| c.score)
483 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
484 .unwrap_or(0.0);
485
486 NilAnalysis {
487 is_nil: false,
488 reason: None,
489 confidence: top_score,
490 action: NilAction::Link,
491 }
492 }
493 Some(reason) => {
494 let (confidence, action) = match &reason {
495 NilReason::NoCandidates => {
496 if is_likely_entity_name(mention) {
498 (0.7, NilAction::CreateEntry)
499 } else {
500 (0.9, NilAction::Skip)
501 }
502 }
503 NilReason::LowConfidence => (0.6, NilAction::Review),
504 NilReason::TypeMismatch => (0.8, NilAction::Review),
505 NilReason::NoisyMention => (0.95, NilAction::Skip),
506 NilReason::LargeMargin => (0.5, NilAction::Review),
507 NilReason::ExplicitNil => (1.0, NilAction::Skip),
508 NilReason::OutOfKnowledgebase => (0.85, NilAction::CreateEntry),
509 NilReason::EmergingEntity => (0.75, NilAction::CreateEntry),
510 };
511
512 NilAnalysis {
513 is_nil: true,
514 reason: Some(reason),
515 confidence,
516 action,
517 }
518 }
519 }
520 }
521}
522
/// Returns `true` when the trimmed mention starts with an uppercase character.
///
/// Empty and whitespace-only mentions, and mentions starting with a character that has no
/// uppercase form (digits, CJK ideographs, ...), yield `false`.
fn is_likely_entity_name(mention: &str) -> bool {
    // An uppercase first character already means the first whitespace-separated word is
    // capitalised, so no separate scan for capitalised words is needed.
    mention
        .trim()
        .chars()
        .next()
        .map_or(false, char::is_uppercase)
}
542
#[cfg(test)]
mod tests {
    use super::super::candidate::{Candidate, CandidateSource};
    use super::*;

    /// Builds a Wikidata candidate with the given id, label and score.
    fn wikidata_candidate(id: &'static str, label: &'static str, score: f64) -> Candidate {
        let mut candidate = Candidate::new(id, CandidateSource::Wikidata, label);
        candidate.score = score;
        candidate
    }

    #[test]
    fn test_nil_no_candidates() {
        let detector = NilDetector::new();
        let result = detector.check_nil("Unknown Entity", &[], None);
        assert_eq!(result, Some(NilReason::NoCandidates));
    }

    #[test]
    fn test_nil_noisy_mention() {
        let detector = NilDetector::new();
        assert_eq!(
            detector.check_nil("123", &[], None),
            Some(NilReason::NoisyMention)
        );
        assert_eq!(
            detector.check_nil(".", &[], None),
            Some(NilReason::NoisyMention)
        );
    }

    #[test]
    fn test_single_cjk_char_is_not_noisy() {
        // A lone CJK ideograph is a valid mention, so the next check (no candidates) fires.
        let detector = NilDetector::new();
        assert_eq!(
            detector.check_nil("中", &[], None),
            Some(NilReason::NoCandidates)
        );
    }

    #[test]
    fn test_linkable() {
        let detector = NilDetector::new();
        let candidate = wikidata_candidate("Q937", "Albert Einstein", 0.8);

        let result = detector.check_nil("Einstein", &[candidate], Some("PERSON"));
        assert_eq!(result, None);
    }

    #[test]
    fn test_nil_analysis() {
        let detector = NilDetector::new();
        let analysis = detector.analyze("Unknown Entity", &[], None);

        assert!(analysis.is_nil);
        assert!(matches!(analysis.reason, Some(NilReason::NoCandidates)));
    }

    #[test]
    fn test_nil_analysis_action_for_no_candidates() {
        let detector = NilDetector::new();

        // Capitalised mention: looks like an entity that is missing from the KB.
        let named = detector.analyze("Unknown Entity", &[], None);
        assert_eq!(named.action, NilAction::CreateEntry);

        // Lowercase mention: not entity-like, so it is skipped.
        let plain = detector.analyze("unknown", &[], None);
        assert_eq!(plain.reason, Some(NilReason::NoCandidates));
        assert_eq!(plain.action, NilAction::Skip);
    }

    #[test]
    fn test_is_cjk() {
        assert!(is_cjk('中'));
        assert!(is_cjk('日'));
        assert!(!is_cjk('A'));
    }

    #[test]
    fn test_is_likely_entity_name() {
        assert!(is_likely_entity_name("Mayor Thomas Jenkins"));
        assert!(is_likely_entity_name("  Einstein "));
        assert!(!is_likely_entity_name("the mayor"));
        assert!(!is_likely_entity_name(""));
    }

    #[test]
    fn test_out_of_kb_detection() {
        let detector = NilDetector::new().with_out_of_kb_threshold(0.5);
        let candidate = wikidata_candidate("Q12345", "John Smith", 0.6);

        let candidates_with_embeddings = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.3,
        }];

        let result = detector.check_out_of_kb(&candidates_with_embeddings);
        assert_eq!(result, Some(NilReason::OutOfKnowledgebase));
    }

    #[test]
    fn test_out_of_kb_above_threshold() {
        let detector = NilDetector::new().with_out_of_kb_threshold(0.5);
        let candidate = wikidata_candidate("Q937", "Albert Einstein", 0.9);

        let candidates_with_embeddings = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.85,
        }];

        let result = detector.check_out_of_kb(&candidates_with_embeddings);
        assert_eq!(result, None);
    }

    #[test]
    fn test_prefer_create_over_skip() {
        let detector = NilDetector::new()
            .with_out_of_kb_threshold(0.5)
            .with_prefer_create(true);
        let candidate = wikidata_candidate("Q99999", "Unknown Person", 0.4);

        let candidates = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.3,
        }];

        let analysis =
            detector.analyze_with_embeddings("Mayor Thomas Jenkins", &candidates, Some("PERSON"));

        assert!(analysis.is_nil);
        assert_eq!(analysis.reason, Some(NilReason::OutOfKnowledgebase));
        assert_eq!(analysis.action, NilAction::CreateEntry);
    }

    #[test]
    fn test_nil_reason_display() {
        assert_eq!(NilReason::OutOfKnowledgebase.to_string(), "out_of_kb");
        assert_eq!(NilReason::EmergingEntity.to_string(), "emerging_entity");
    }
}