anno/backends/ensemble/weights.rs
1//! Backend weighting, type weights, and candidate/span-key helpers.
2
3use super::*;
4
5// =============================================================================
6// Backend Weights
7// =============================================================================
8
9/// Reliability weight for a backend (0.0 to 1.0).
10///
11/// Higher weight = more trusted when resolving conflicts.
12#[derive(Debug, Clone, Copy)]
13pub struct BackendWeight {
14 /// Overall reliability of this backend
15 pub overall: f64,
16 /// Type-specific weights (optional overrides)
17 pub per_type: Option<TypeWeights>,
18}
19
20impl Default for BackendWeight {
21 fn default() -> Self {
22 Self {
23 overall: 0.5,
24 per_type: None,
25 }
26 }
27}
28
/// Per-entity-type reliability weights for a backend.
///
/// A backend can be more accurate on some entity types than on others; this
/// struct stores one weight per supported type plus a catch-all.
///
/// Note: the derived `Default` sets every field to `0.0`, so struct-update
/// syntax (`..Default::default()`) leaves unspecified types at zero weight.
#[derive(Debug, Clone, Copy, Default)]
pub struct TypeWeights {
    /// Weight applied to Person entities.
    pub person: f64,
    /// Weight applied to Organization entities.
    pub organization: f64,
    /// Weight applied to Location entities.
    pub location: f64,
    /// Weight applied to Date entities.
    pub date: f64,
    /// Weight applied to Money entities.
    pub money: f64,
    /// Weight applied to every entity type not listed above.
    pub other: f64,
}
48
49impl TypeWeights {
50 pub(super) fn get(&self, entity_type: &EntityType) -> f64 {
51 match entity_type {
52 EntityType::Person => self.person,
53 EntityType::Organization => self.organization,
54 EntityType::Location => self.location,
55 EntityType::Date => self.date,
56 EntityType::Money => self.money,
57 _ => self.other,
58 }
59 }
60}
61
62/// Default weights based on empirical observations.
63pub(super) fn default_backend_weights() -> HashMap<&'static str, BackendWeight> {
64 let mut weights = HashMap::new();
65
66 // Pattern backends: very high precision when they fire
67 weights.insert(
68 "regex",
69 BackendWeight {
70 overall: 0.98,
71 per_type: Some(TypeWeights {
72 date: 0.99,
73 money: 0.99,
74 person: 0.50, // Pattern doesn't do NER
75 organization: 0.50,
76 location: 0.50,
77 other: 0.95, // URLs, emails, etc.
78 }),
79 },
80 );
81
82 // GLiNER: good ML-based NER
83 weights.insert(
84 "gliner",
85 BackendWeight {
86 overall: 0.85,
87 per_type: Some(TypeWeights {
88 person: 0.90,
89 organization: 0.85,
90 location: 0.80,
91 date: 0.75,
92 money: 0.70,
93 other: 0.75,
94 }),
95 },
96 );
97 weights.insert(
98 "GLiNER-ONNX",
99 BackendWeight {
100 overall: 0.85,
101 per_type: Some(TypeWeights {
102 person: 0.90,
103 organization: 0.85,
104 location: 0.80,
105 date: 0.75,
106 money: 0.70,
107 other: 0.75,
108 }),
109 },
110 );
111
112 // GLiNER Candle
113 weights.insert(
114 "gliner-candle",
115 BackendWeight {
116 overall: 0.85,
117 per_type: None,
118 },
119 );
120
121 // BERT NER
122 weights.insert(
123 "bert-ner-onnx",
124 BackendWeight {
125 overall: 0.80,
126 per_type: None,
127 },
128 );
129
130 // Heuristic: reasonable but noisy
131 weights.insert(
132 "heuristic",
133 BackendWeight {
134 overall: 0.60,
135 per_type: Some(TypeWeights {
136 person: 0.65, // Title + Name pattern works well
137 organization: 0.70, // "Inc", "Corp" patterns
138 location: 0.55, // Context-dependent
139 date: 0.40, // Better to use pattern
140 money: 0.40,
141 other: 0.50,
142 }),
143 },
144 );
145
146 weights
147}
148
149// =============================================================================
150// Candidate Entity (with source tracking)
151// =============================================================================
152
153/// An entity candidate from a specific backend.
154#[derive(Debug, Clone)]
155pub(super) struct Candidate {
156 pub(super) entity: Entity,
157 pub(super) source: String,
158 pub(super) backend_weight: f64,
159}
160
161// =============================================================================
162// Span Key (for grouping overlapping entities)
163// =============================================================================
164
165/// Key for grouping entities by span.
166///
167/// Two entities are considered "same span" if they significantly overlap.
168#[derive(Debug, Clone, PartialEq, Eq, Hash)]
169pub(super) struct SpanKey {
170 pub(super) start: usize,
171 pub(super) end: usize,
172}
173
174impl SpanKey {
175 pub(super) fn from_entity(e: &Entity) -> Self {
176 Self {
177 start: e.start,
178 end: e.end,
179 }
180 }
181
182 /// Check if two spans overlap significantly (>50% of smaller span).
183 pub(super) fn overlaps(&self, other: &SpanKey) -> bool {
184 let overlap_start = self.start.max(other.start);
185 let overlap_end = self.end.min(other.end);
186
187 if overlap_start >= overlap_end {
188 return false;
189 }
190
191 let overlap = overlap_end - overlap_start;
192 let smaller_span = (self.end - self.start).min(other.end - other.start);
193
194 // Overlap if >50% of smaller span is covered
195 (overlap as f64 / smaller_span as f64) > 0.5
196 }
197}
198
199// =============================================================================
200// EnsembleNER
201// =============================================================================