Skip to main content

anno/backends/ensemble/
weights.rs

1//! Backend weighting, type weights, and candidate/span-key helpers.
2
3use super::*;
4
5// =============================================================================
6// Backend Weights
7// =============================================================================
8
9/// Reliability weight for a backend (0.0 to 1.0).
10///
11/// Higher weight = more trusted when resolving conflicts.
12#[derive(Debug, Clone, Copy)]
13pub struct BackendWeight {
14    /// Overall reliability of this backend
15    pub overall: f64,
16    /// Type-specific weights (optional overrides)
17    pub per_type: Option<TypeWeights>,
18}
19
20impl Default for BackendWeight {
21    fn default() -> Self {
22        Self {
23            overall: 0.5,
24            per_type: None,
25        }
26    }
27}
28
29/// Type-specific reliability weights.
30///
31/// Different backends may have different accuracy profiles for different entity types.
32/// These weights adjust confidence scores based on the entity type being extracted.
33#[derive(Debug, Clone, Copy, Default)]
34pub struct TypeWeights {
35    /// Weight multiplier for Person entities
36    pub person: f64,
37    /// Weight multiplier for Organization entities
38    pub organization: f64,
39    /// Weight multiplier for Location entities
40    pub location: f64,
41    /// Weight multiplier for Date entities
42    pub date: f64,
43    /// Weight multiplier for Money entities
44    pub money: f64,
45    /// Weight multiplier for other/misc entity types
46    pub other: f64,
47}
48
49impl TypeWeights {
50    pub(super) fn get(&self, entity_type: &EntityType) -> f64 {
51        match entity_type {
52            EntityType::Person => self.person,
53            EntityType::Organization => self.organization,
54            EntityType::Location => self.location,
55            EntityType::Date => self.date,
56            EntityType::Money => self.money,
57            _ => self.other,
58        }
59    }
60}
61
62/// Default weights based on empirical observations.
63pub(super) fn default_backend_weights() -> HashMap<&'static str, BackendWeight> {
64    let mut weights = HashMap::new();
65
66    // Pattern backends: very high precision when they fire
67    weights.insert(
68        "regex",
69        BackendWeight {
70            overall: 0.98,
71            per_type: Some(TypeWeights {
72                date: 0.99,
73                money: 0.99,
74                person: 0.50, // Pattern doesn't do NER
75                organization: 0.50,
76                location: 0.50,
77                other: 0.95, // URLs, emails, etc.
78            }),
79        },
80    );
81
82    // GLiNER: good ML-based NER
83    weights.insert(
84        "gliner",
85        BackendWeight {
86            overall: 0.85,
87            per_type: Some(TypeWeights {
88                person: 0.90,
89                organization: 0.85,
90                location: 0.80,
91                date: 0.75,
92                money: 0.70,
93                other: 0.75,
94            }),
95        },
96    );
97    weights.insert(
98        "GLiNER-ONNX",
99        BackendWeight {
100            overall: 0.85,
101            per_type: Some(TypeWeights {
102                person: 0.90,
103                organization: 0.85,
104                location: 0.80,
105                date: 0.75,
106                money: 0.70,
107                other: 0.75,
108            }),
109        },
110    );
111
112    // GLiNER Candle
113    weights.insert(
114        "gliner-candle",
115        BackendWeight {
116            overall: 0.85,
117            per_type: None,
118        },
119    );
120
121    // BERT NER
122    weights.insert(
123        "bert-ner-onnx",
124        BackendWeight {
125            overall: 0.80,
126            per_type: None,
127        },
128    );
129
130    // Heuristic: reasonable but noisy
131    weights.insert(
132        "heuristic",
133        BackendWeight {
134            overall: 0.60,
135            per_type: Some(TypeWeights {
136                person: 0.65,       // Title + Name pattern works well
137                organization: 0.70, // "Inc", "Corp" patterns
138                location: 0.55,     // Context-dependent
139                date: 0.40,         // Better to use pattern
140                money: 0.40,
141                other: 0.50,
142            }),
143        },
144    );
145
146    weights
147}
148
149// =============================================================================
150// Candidate Entity (with source tracking)
151// =============================================================================
152
153/// An entity candidate from a specific backend.
154#[derive(Debug, Clone)]
155pub(super) struct Candidate {
156    pub(super) entity: Entity,
157    pub(super) source: String,
158    pub(super) backend_weight: f64,
159}
160
161// =============================================================================
162// Span Key (for grouping overlapping entities)
163// =============================================================================
164
165/// Key for grouping entities by span.
166///
167/// Two entities are considered "same span" if they significantly overlap.
168#[derive(Debug, Clone, PartialEq, Eq, Hash)]
169pub(super) struct SpanKey {
170    pub(super) start: usize,
171    pub(super) end: usize,
172}
173
174impl SpanKey {
175    pub(super) fn from_entity(e: &Entity) -> Self {
176        Self {
177            start: e.start,
178            end: e.end,
179        }
180    }
181
182    /// Check if two spans overlap significantly (>50% of smaller span).
183    pub(super) fn overlaps(&self, other: &SpanKey) -> bool {
184        let overlap_start = self.start.max(other.start);
185        let overlap_end = self.end.min(other.end);
186
187        if overlap_start >= overlap_end {
188            return false;
189        }
190
191        let overlap = overlap_end - overlap_start;
192        let smaller_span = (self.end - self.start).min(other.end - other.start);
193
194        // Overlap if >50% of smaller span is covered
195        (overlap as f64 / smaller_span as f64) > 0.5
196    }
197}
198
199// =============================================================================
200// EnsembleNER
201// =============================================================================