1use super::confidence::Confidence;
90use super::entity::{
91 DiscontinuousSpan, Entity, EntityType, HierarchicalConfidence, Provenance, Span,
92};
93use serde::{Deserialize, Serialize};
94use std::collections::HashMap;
95
/// The modality in which a signal is expressed.
///
/// `Symbolic` is the `Default` variant and is what `Location::modality`
/// reports for both text-based location variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum Modality {
    /// Iconic modality; `GroundedDocument::visual_signals` selects signals
    /// tagged with this variant.
    Iconic,
    /// Symbolic modality (the default); `GroundedDocument::text_signals`
    /// selects signals tagged with this variant.
    #[default]
    Symbolic,
    /// Hybrid modality.
    // NOTE(review): presumably a signal with both a textual and a visual
    // grounding — confirm against the producers of this value.
    Hybrid,
}
134
/// Where a signal sits in its source.
///
/// Offsets are compared as half-open ranges (`start..end`) by
/// `Location::overlaps` and `Location::iou`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Location {
    /// A single contiguous range.
    Text {
        /// Offset of the first covered unit (inclusive).
        start: usize,
        /// Offset one past the last covered unit (exclusive).
        end: usize,
    },
    /// A location made of several separate `(start, end)` ranges.
    Discontinuous {
        /// The individual `(start, end)` segments.
        segments: Vec<(usize, usize)>,
    },
}
163
164impl Location {
165 #[must_use]
167 pub const fn text(start: usize, end: usize) -> Self {
168 Self::Text { start, end }
169 }
170
171 #[must_use]
173 pub const fn modality(&self) -> Modality {
174 match self {
175 Self::Text { .. } | Self::Discontinuous { .. } => Modality::Symbolic,
176 }
177 }
178
179 #[must_use]
181 pub fn text_offsets(&self) -> Option<(usize, usize)> {
182 match self {
183 Self::Text { start, end } => Some((*start, *end)),
184 Self::Discontinuous { segments } => {
185 let start = segments.iter().map(|(s, _)| *s).min()?;
186 let end = segments.iter().map(|(_, e)| *e).max()?;
187 Some((start, end))
188 }
189 }
190 }
191
192 #[must_use]
194 pub fn overlaps(&self, other: &Self) -> bool {
195 match (self, other) {
196 (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
197 s1 < e2 && s2 < e1
198 }
199 _ => false, }
201 }
202
203 #[must_use]
207 pub fn iou(&self, other: &Self) -> Option<f64> {
208 match (self, other) {
209 (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
210 let intersection_start = (*s1).max(*s2);
211 let intersection_end = (*e1).min(*e2);
212 if intersection_start >= intersection_end {
213 return Some(0.0);
214 }
215 let intersection = (intersection_end - intersection_start) as f64;
216 let union = ((*e1).max(*e2) - (*s1).min(*s2)) as f64;
217 if union == 0.0 {
218 Some(0.0)
219 } else {
220 Some(intersection / union)
221 }
222 }
223 _ => None,
224 }
225 }
226}
227
228impl Default for Location {
229 fn default() -> Self {
230 Self::Text { start: 0, end: 0 }
231 }
232}
233
234impl From<&Span> for Location {
235 fn from(span: &Span) -> Self {
236 match span {
237 Span::Text { start, end } => Self::Text {
238 start: *start,
239 end: *end,
240 },
241 Span::BoundingBox { .. } => Self::Text { start: 0, end: 0 },
244 Span::Hybrid { start, end, .. } => Self::Text {
245 start: *start,
246 end: *end,
247 },
248 }
249 }
250}
251
252impl From<Span> for Location {
253 fn from(span: Span) -> Self {
254 Self::from(&span)
255 }
256}
257
258impl Location {
263 #[must_use]
267 pub fn to_span(&self) -> Option<Span> {
268 match self {
269 Self::Text { start, end } => Some(Span::Text {
270 start: *start,
271 end: *end,
272 }),
273 Self::Discontinuous { .. } => None,
274 }
275 }
276}
277
278pub use super::types::SignalId;
284
/// A single labelled detection grounded at a location.
///
/// The location type `L` defaults to [`Location`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Signal<L = Location> {
    /// Identifier of this signal; `GroundedDocument::add_signal` overwrites
    /// it with a document-assigned ID.
    pub id: SignalId,
    /// Where the signal was found.
    pub location: L,
    /// The surface text of the signal; `validate_against` compares it with
    /// the source text at `location`.
    pub surface: String,
    /// The type label assigned to the signal.
    pub label: super::types::TypeLabel,
    /// Confidence attached to this signal.
    pub confidence: Confidence,
    /// Optional hierarchical confidence, copied to/from
    /// `Entity::hierarchical_confidence` during conversion.
    pub hierarchical: Option<HierarchicalConfidence>,
    /// Optional provenance information.
    pub provenance: Option<Provenance>,
    /// Modality tag; `Signal::new` sets `Modality::default()`.
    pub modality: Modality,
    /// Optional normalized form of the surface text.
    pub normalized: Option<String>,
    /// Whether the signal is negated; `Signal::new` sets `false`.
    pub negated: bool,
    /// Optional quantifier attached to the signal.
    pub quantifier: Option<Quantifier>,
}
341
/// Quantifier attached to a signal.
///
/// Marked `#[non_exhaustive]`, so downstream matches need a wildcard arm.
// NOTE(review): the per-variant glosses below are inferred from the variant
// names only — confirm against the annotation guidelines.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Quantifier {
    /// Universal quantification (presumably "all" / "every").
    Universal,
    /// Existential quantification (presumably "some" / "a").
    Existential,
    /// Negative quantification (presumably "no" / "none"). This is a real
    /// variant and distinct from `Option::None` in `Signal::quantifier`.
    None,
    /// Definite reference (presumably "the").
    Definite,
    /// Approximate quantity (presumably "about" / "roughly").
    Approximate,
    /// Lower bound (presumably "at least").
    MinBound,
    /// Upper bound (presumably "at most").
    MaxBound,
    /// Bare form (presumably no explicit quantifier or determiner).
    Bare,
}
366
367impl<L> Signal<L> {
368 #[must_use]
378 pub fn new(
379 id: impl Into<SignalId>,
380 location: L,
381 surface: impl Into<String>,
382 label: impl Into<super::types::TypeLabel>,
383 confidence: f32,
384 ) -> Self {
385 Self {
386 id: id.into(),
387 location,
388 surface: surface.into(),
389 label: label.into(),
390 confidence: Confidence::new(confidence as f64),
391 hierarchical: None,
392 provenance: None,
393 modality: Modality::default(),
394 normalized: None,
395 negated: false,
396 quantifier: None,
397 }
398 }
399
400 #[must_use]
402 pub fn label(&self) -> &str {
403 self.label.as_str()
404 }
405
406 #[must_use]
408 pub fn type_label(&self) -> super::types::TypeLabel {
409 self.label.clone()
410 }
411
412 #[must_use]
414 pub fn surface(&self) -> &str {
415 &self.surface
416 }
417
418 #[must_use]
420 pub fn is_confident(&self, threshold: Confidence) -> bool {
421 self.confidence >= threshold
422 }
423
424 #[must_use]
426 pub fn with_modality(mut self, modality: Modality) -> Self {
427 self.modality = modality;
428 self
429 }
430
431 #[must_use]
433 pub fn negated(mut self) -> Self {
434 self.negated = true;
435 self
436 }
437
438 #[must_use]
440 pub fn with_quantifier(mut self, q: Quantifier) -> Self {
441 self.quantifier = Some(q);
442 self
443 }
444
445 #[must_use]
447 pub fn with_provenance(mut self, p: Provenance) -> Self {
448 self.provenance = Some(p);
449 self
450 }
451}
452
453impl Signal<Location> {
454 #[must_use]
456 pub fn text_offsets(&self) -> Option<(usize, usize)> {
457 self.location.text_offsets()
458 }
459
460 #[must_use]
477 pub fn validate_against(&self, source_text: &str) -> Option<SignalValidationError> {
478 let (start, end) = self.location.text_offsets()?;
479
480 let char_count = source_text.chars().count();
481
482 if end > char_count {
484 return Some(SignalValidationError::OutOfBounds {
485 signal_id: self.id,
486 end,
487 text_len: char_count,
488 });
489 }
490
491 if start >= end {
492 return Some(SignalValidationError::InvalidSpan {
493 signal_id: self.id,
494 start,
495 end,
496 });
497 }
498
499 let actual: String = source_text.chars().skip(start).take(end - start).collect();
501
502 if actual != self.surface {
503 return Some(SignalValidationError::TextMismatch {
504 signal_id: self.id,
505 expected: self.surface.clone(),
506 actual,
507 start,
508 end,
509 });
510 }
511
512 None
513 }
514
515 #[must_use]
517 pub fn is_valid(&self, source_text: &str) -> bool {
518 self.validate_against(source_text).is_none()
519 }
520
521 #[must_use]
536 pub fn from_text(
537 source: &str,
538 surface: &str,
539 label: impl Into<super::types::TypeLabel>,
540 confidence: f32,
541 ) -> Option<Self> {
542 Self::from_text_nth(source, surface, label, confidence, 0)
543 }
544
545 #[must_use]
547 pub fn from_text_nth(
548 source: &str,
549 surface: &str,
550 label: impl Into<super::types::TypeLabel>,
551 confidence: f32,
552 occurrence: usize,
553 ) -> Option<Self> {
554 for (count, (byte_idx, _)) in source.match_indices(surface).enumerate() {
556 if count == occurrence {
557 let start = source[..byte_idx].chars().count();
559 let end = start + surface.chars().count();
560
561 return Some(Self::new(
562 SignalId::ZERO,
563 Location::text(start, end),
564 surface,
565 label,
566 confidence,
567 ));
568 }
569 }
570
571 None
572 }
573}
574
/// A problem found when checking a signal against its source text
/// (see `Signal::validate_against`).
#[derive(Debug, Clone, PartialEq)]
pub enum SignalValidationError {
    /// The signal's end offset lies beyond the end of the text.
    OutOfBounds {
        /// ID of the offending signal.
        signal_id: SignalId,
        /// The signal's end offset.
        end: usize,
        /// Length of the text, counted in characters.
        text_len: usize,
    },
    /// The span is empty or inverted (`start >= end`).
    InvalidSpan {
        /// ID of the offending signal.
        signal_id: SignalId,
        /// The signal's start offset.
        start: usize,
        /// The signal's end offset.
        end: usize,
    },
    /// The text at the signal's offsets differs from its surface form.
    TextMismatch {
        /// ID of the offending signal.
        signal_id: SignalId,
        /// The surface text stored on the signal.
        expected: String,
        /// The text actually found at `start..end`.
        actual: String,
        /// The signal's start offset.
        start: usize,
        /// The signal's end offset.
        end: usize,
    },
}
610
611impl std::fmt::Display for SignalValidationError {
612 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
613 match self {
614 Self::OutOfBounds {
615 signal_id,
616 end,
617 text_len,
618 } => {
619 write!(
620 f,
621 "S{}: end offset {} exceeds text length {}",
622 signal_id, end, text_len
623 )
624 }
625 Self::InvalidSpan {
626 signal_id,
627 start,
628 end,
629 } => {
630 write!(f, "S{}: invalid span [{}, {})", signal_id, start, end)
631 }
632 Self::TextMismatch {
633 signal_id,
634 expected,
635 actual,
636 start,
637 end,
638 } => {
639 write!(
640 f,
641 "S{}: text mismatch at [{}, {}): expected '{}', found '{}'",
642 signal_id, start, end, expected, actual
643 )
644 }
645 }
646 }
647}
648
// Marker impl: the type already has `Debug` (derived) and `Display`, so the
// default `Error` method implementations are used as-is.
impl std::error::Error for SignalValidationError {}
650
651impl From<&Entity> for Signal<Location> {
657 fn from(e: &Entity) -> Self {
658 let mut signal = Signal::new(
659 SignalId::ZERO,
660 Location::text(e.start(), e.end()),
661 &e.text,
662 e.entity_type.as_label(),
663 f32::from(e.confidence),
664 );
665 signal.normalized = e.normalized.clone();
666 signal.provenance = e.provenance.clone();
667 signal.hierarchical = e.hierarchical_confidence;
668 signal
669 }
670}
671
672pub use super::types::TrackId;
678
/// A reference from a track to one of its member signals.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SignalRef {
    /// ID of the referenced signal.
    pub signal_id: SignalId,
    /// Position value supplied by the caller of `Track::add_signal`.
    // NOTE(review): `GroundedDocument::from_entities` passes the signal's
    // index within the track here — confirm this is the intended meaning.
    pub position: u32,
}
687
/// A document-qualified reference to a track, usable across documents.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TrackRef {
    /// ID of the document that owns the track.
    pub doc_id: String,
    /// ID of the track within that document.
    pub track_id: TrackId,
}
700
/// A group of signals within one document, optionally linked to an identity.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Track {
    /// Identifier of this track; `GroundedDocument::add_track` overwrites it
    /// with a document-assigned ID.
    pub id: TrackId,
    /// References to the member signals, in insertion order.
    pub signals: Vec<SignalRef>,
    /// Optional type label for the whole track.
    pub entity_type: Option<super::types::TypeLabel>,
    /// The surface form chosen to represent the track.
    pub canonical_surface: String,
    /// The identity this track is linked to, if any.
    pub identity_id: Option<IdentityId>,
    /// Confidence in the grouping; `Track::new` sets `Confidence::ONE`.
    pub cluster_confidence: Confidence,
    /// Optional embedding vector for the track.
    pub embedding: Option<Vec<f32>>,
}
740
741impl Track {
742 #[must_use]
744 pub fn new(id: impl Into<TrackId>, canonical_surface: impl Into<String>) -> Self {
745 Self {
746 id: id.into(),
747 signals: Vec::new(),
748 entity_type: None,
749 canonical_surface: canonical_surface.into(),
750 identity_id: None,
751 cluster_confidence: Confidence::ONE,
752 embedding: None,
753 }
754 }
755
756 pub fn add_signal(&mut self, signal_id: impl Into<SignalId>, position: u32) {
758 let signal_id = signal_id.into();
759 self.signals.push(SignalRef {
760 signal_id,
761 position,
762 });
763 }
764
765 #[must_use]
767 pub fn len(&self) -> usize {
768 self.signals.len()
769 }
770
771 #[must_use]
773 pub fn is_empty(&self) -> bool {
774 self.signals.is_empty()
775 }
776
777 #[must_use]
779 pub fn is_singleton(&self) -> bool {
780 self.signals.len() == 1
781 }
782
783 #[must_use]
785 pub const fn id(&self) -> TrackId {
786 self.id
787 }
788
789 #[must_use]
791 pub fn signals(&self) -> &[SignalRef] {
792 &self.signals
793 }
794
795 #[must_use]
797 pub fn canonical_surface(&self) -> &str {
798 &self.canonical_surface
799 }
800
801 #[must_use]
803 pub const fn identity_id(&self) -> Option<IdentityId> {
804 self.identity_id
805 }
806
807 #[must_use]
809 pub const fn cluster_confidence(&self) -> Confidence {
810 self.cluster_confidence
811 }
812
813 pub fn set_cluster_confidence(&mut self, confidence: f32) {
815 self.cluster_confidence = Confidence::new(confidence as f64);
816 }
817
818 pub fn set_identity_id(&mut self, identity_id: IdentityId) {
820 self.identity_id = Some(identity_id);
821 }
822
823 pub fn clear_identity_id(&mut self) {
825 self.identity_id = None;
826 }
827
828 #[must_use]
830 pub fn with_identity(mut self, identity_id: IdentityId) -> Self {
831 self.identity_id = Some(identity_id);
832 self
833 }
834
835 #[must_use]
839 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
840 let s = entity_type.into();
841 self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
842 self
843 }
844
845 #[must_use]
859 pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
860 self.entity_type = Some(label);
861 self
862 }
863
864 #[must_use]
869 pub fn type_label(&self) -> Option<super::types::TypeLabel> {
870 self.entity_type.clone()
871 }
872
873 #[must_use]
875 pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
876 self.embedding = Some(embedding);
877 self
878 }
879
880 pub fn compute_spread(&self, doc: &GroundedDocument) -> Option<usize> {
884 if self.signals.is_empty() {
885 return Some(0);
886 }
887
888 let positions: Vec<usize> = self
889 .signals
890 .iter()
891 .filter_map(|sr| {
892 doc.signals
893 .iter()
894 .find(|s| s.id == sr.signal_id)
895 .and_then(|s| s.location.text_offsets())
896 .map(|(start, _)| start)
897 })
898 .collect();
899
900 if positions.is_empty() {
901 return None;
902 }
903
904 let min_pos = *positions.iter().min().expect("positions non-empty");
905 let max_pos = *positions.iter().max().expect("positions non-empty");
906 Some(max_pos.saturating_sub(min_pos))
907 }
908
909 pub fn collect_variations(&self, doc: &GroundedDocument) -> Vec<String> {
913 let mut variations: std::collections::HashSet<String> = std::collections::HashSet::new();
914
915 for sr in &self.signals {
916 if let Some(signal) = doc.signals.iter().find(|s| s.id == sr.signal_id) {
917 variations.insert(signal.surface.clone());
918 }
919 }
920
921 variations.into_iter().collect()
922 }
923
924 pub fn confidence_stats(&self, doc: &GroundedDocument) -> Option<(f32, f32, f32)> {
928 let confidences: Vec<f32> = self
929 .signals
930 .iter()
931 .filter_map(|sr| {
932 doc.signals
933 .iter()
934 .find(|s| s.id == sr.signal_id)
935 .map(|s| s.confidence.value() as f32)
936 })
937 .collect();
938
939 if confidences.is_empty() {
940 return None;
941 }
942
943 let min = confidences.iter().cloned().fold(f32::INFINITY, f32::min);
944 let max = confidences
945 .iter()
946 .cloned()
947 .fold(f32::NEG_INFINITY, f32::max);
948 let mean = confidences.iter().sum::<f32>() / confidences.len() as f32;
949
950 Some((min, max, mean))
951 }
952
953 pub fn compute_stats(&self, doc: &GroundedDocument, text_len: usize) -> TrackStats {
957 let chain_length = self.signals.len();
958 let spread = self.compute_spread(doc).unwrap_or(0);
959 let variations = self.collect_variations(doc);
960 let (min_conf, max_conf, mean_conf) = self.confidence_stats(doc).unwrap_or((0.0, 0.0, 0.0));
961
962 let positions: Vec<usize> = self
964 .signals
965 .iter()
966 .filter_map(|sr| {
967 doc.signals
968 .iter()
969 .find(|s| s.id == sr.signal_id)
970 .and_then(|s| s.location.text_offsets())
971 .map(|(start, _)| start)
972 })
973 .collect();
974
975 let first_position = positions.iter().min().copied().unwrap_or(0);
976 let last_position = positions.iter().max().copied().unwrap_or(0);
977 let relative_spread = if text_len > 0 {
978 spread as f64 / text_len as f64
979 } else {
980 0.0
981 };
982
983 TrackStats {
984 chain_length,
985 variation_count: variations.len(),
986 variations,
987 spread,
988 relative_spread,
989 first_position,
990 last_position,
991 min_confidence: Confidence::new(min_conf as f64),
992 max_confidence: Confidence::new(max_conf as f64),
993 mean_confidence: Confidence::new(mean_conf as f64),
994 has_embedding: self.embedding.is_some(),
995 }
996 }
997}
998
/// Summary statistics for a track, as produced by `Track::compute_stats`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TrackStats {
    /// Number of signal references in the track.
    pub chain_length: usize,
    /// Number of distinct surface forms (`variations.len()`).
    pub variation_count: usize,
    /// The distinct surface forms found for the track's signals.
    pub variations: Vec<String>,
    /// Distance between the smallest and largest signal start offset.
    pub spread: usize,
    /// `spread` divided by the text length passed to `compute_stats`
    /// (`0.0` when that length is zero).
    pub relative_spread: f64,
    /// Smallest start offset among the track's signals (`0` if none resolve).
    pub first_position: usize,
    /// Largest start offset among the track's signals (`0` if none resolve).
    pub last_position: usize,
    /// Lowest signal confidence in the track.
    pub min_confidence: Confidence,
    /// Highest signal confidence in the track.
    pub max_confidence: Confidence,
    /// Mean signal confidence in the track.
    pub mean_confidence: Confidence,
    /// Whether the track carries an embedding.
    pub has_embedding: bool,
}
1025
1026pub use super::types::IdentityId;
1032
/// How an identity was established.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum IdentitySource {
    /// Established by linking tracks across documents.
    CrossDocCoref {
        /// The tracks that were linked together.
        track_refs: Vec<TrackRef>,
    },
    /// Established from a knowledge-base entry; `Identity::from_kb` produces
    /// this variant.
    KnowledgeBase {
        /// Name of the knowledge base.
        kb_name: String,
        /// Identifier of the entry within that knowledge base.
        kb_id: String,
    },
    /// Carries both linked tracks and a knowledge-base entry.
    Hybrid {
        /// The tracks that were linked together.
        track_refs: Vec<TrackRef>,
        /// Name of the knowledge base.
        kb_name: String,
        /// Identifier of the entry within that knowledge base.
        kb_id: String,
    },
}
1064
/// An identity that tracks can be linked to.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Identity {
    /// Identifier of this identity; `GroundedDocument::add_identity`
    /// overwrites it with a document-assigned ID.
    pub id: IdentityId,
    /// Canonical name of the identity.
    pub canonical_name: String,
    /// Optional type label.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Optional knowledge-base entry identifier.
    pub kb_id: Option<String>,
    /// Optional knowledge-base name.
    pub kb_name: Option<String>,
    /// Optional free-text description.
    pub description: Option<String>,
    /// Optional embedding vector.
    pub embedding: Option<Vec<f32>>,
    /// Alternative names, in insertion order.
    pub aliases: Vec<String>,
    /// Confidence attached to this identity; constructors set
    /// `Confidence::ONE`.
    pub confidence: Confidence,
    /// How the identity was established. Omitted from serialized output when
    /// `None`, and defaults to `None` when absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<IdentitySource>,
}
1113
1114impl Identity {
1115 #[must_use]
1117 pub fn new(id: impl Into<IdentityId>, canonical_name: impl Into<String>) -> Self {
1118 Self {
1119 id: id.into(),
1120 canonical_name: canonical_name.into(),
1121 entity_type: None,
1122 kb_id: None,
1123 kb_name: None,
1124 description: None,
1125 embedding: None,
1126 aliases: Vec::new(),
1127 confidence: Confidence::ONE,
1128 source: None,
1129 }
1130 }
1131
1132 #[must_use]
1134 pub fn from_kb(
1135 id: impl Into<IdentityId>,
1136 canonical_name: impl Into<String>,
1137 kb_name: impl Into<String>,
1138 kb_id: impl Into<String>,
1139 ) -> Self {
1140 let kb_name_str = kb_name.into();
1141 let kb_id_str = kb_id.into();
1142 Self {
1143 id: id.into(),
1144 canonical_name: canonical_name.into(),
1145 entity_type: None,
1146 kb_id: Some(kb_id_str.clone()),
1147 kb_name: Some(kb_name_str.clone()),
1148 description: None,
1149 embedding: None,
1150 aliases: Vec::new(),
1151 confidence: Confidence::ONE,
1152 source: Some(IdentitySource::KnowledgeBase {
1153 kb_name: kb_name_str,
1154 kb_id: kb_id_str,
1155 }),
1156 }
1157 }
1158
1159 pub fn add_alias(&mut self, alias: impl Into<String>) {
1161 self.aliases.push(alias.into());
1162 }
1163
1164 #[must_use]
1166 pub const fn id(&self) -> IdentityId {
1167 self.id
1168 }
1169
1170 #[must_use]
1172 pub fn canonical_name(&self) -> &str {
1173 &self.canonical_name
1174 }
1175
1176 #[must_use]
1178 pub fn kb_id(&self) -> Option<&str> {
1179 self.kb_id.as_deref()
1180 }
1181
1182 #[must_use]
1184 pub fn kb_name(&self) -> Option<&str> {
1185 self.kb_name.as_deref()
1186 }
1187
1188 #[must_use]
1190 pub fn aliases(&self) -> &[String] {
1191 &self.aliases
1192 }
1193
1194 #[must_use]
1196 pub const fn confidence(&self) -> Confidence {
1197 self.confidence
1198 }
1199
1200 pub fn set_confidence(&mut self, confidence: f32) {
1202 self.confidence = Confidence::new(confidence as f64);
1203 }
1204
1205 #[must_use]
1207 pub fn source(&self) -> Option<&IdentitySource> {
1208 self.source.as_ref()
1209 }
1210
1211 #[must_use]
1213 pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1214 self.embedding = Some(embedding);
1215 self
1216 }
1217
1218 #[must_use]
1222 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1223 let s = entity_type.into();
1224 self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1225 self
1226 }
1227
1228 #[must_use]
1233 pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1234 self.entity_type = Some(label);
1235 self
1236 }
1237
1238 #[must_use]
1243 pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1244 self.entity_type.clone()
1245 }
1246
1247 #[must_use]
1249 pub fn with_description(mut self, description: impl Into<String>) -> Self {
1250 self.description = Some(description.into());
1251 self
1252 }
1253
1254 }
1256
/// Deserialization shape of `GroundedDocument`.
///
/// Holds only the persisted fields; the `From` conversion into
/// `GroundedDocument` fills in the derived indexes and ID counters.
#[derive(Deserialize)]
struct GroundedDocumentWire {
    /// Document identifier.
    id: String,
    /// Full document text.
    text: String,
    /// All signals of the document.
    signals: Vec<Signal<Location>>,
    /// Tracks keyed by their ID.
    tracks: HashMap<TrackId, Track>,
    /// Identities keyed by their ID.
    identities: HashMap<IdentityId, Identity>,
}
1272
1273impl From<GroundedDocumentWire> for GroundedDocument {
1274 fn from(wire: GroundedDocumentWire) -> Self {
1275 let mut doc = Self {
1276 id: wire.id,
1277 text: wire.text,
1278 signals: wire.signals,
1279 tracks: wire.tracks,
1280 identities: wire.identities,
1281 signal_to_track: HashMap::new(),
1282 track_to_identity: HashMap::new(),
1283 next_signal_id: SignalId::ZERO,
1284 next_track_id: TrackId::ZERO,
1285 next_identity_id: IdentityId::ZERO,
1286 };
1287 doc.rebuild_indexes();
1288 doc
1289 }
1290}
1291
/// A document together with its signals, tracks and identities.
///
/// Deserialization goes through `GroundedDocumentWire` (`#[serde(from)]`),
/// whose conversion rebuilds the fields marked `#[serde(skip)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(from = "GroundedDocumentWire")]
pub struct GroundedDocument {
    /// Document identifier.
    id: String,
    /// Full document text.
    text: String,
    /// All signals, in insertion order.
    signals: Vec<Signal<Location>>,
    /// Tracks keyed by their ID.
    tracks: HashMap<TrackId, Track>,
    /// Identities keyed by their ID.
    identities: HashMap<IdentityId, Identity>,
    /// Derived index: signal ID -> ID of the track containing it.
    #[serde(skip)]
    signal_to_track: HashMap<SignalId, TrackId>,
    /// Derived index: track ID -> ID of the identity it is linked to.
    #[serde(skip)]
    track_to_identity: HashMap<TrackId, IdentityId>,
    /// ID that the next added signal will receive.
    #[serde(skip)]
    next_signal_id: SignalId,
    /// ID that the next added track will receive.
    #[serde(skip)]
    next_track_id: TrackId,
    /// ID that the next added identity will receive.
    #[serde(skip)]
    next_identity_id: IdentityId,
}
1389
1390impl GroundedDocument {
1391 #[must_use]
1393 pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
1394 Self {
1395 id: id.into(),
1396 text: text.into(),
1397 signals: Vec::new(),
1398 tracks: HashMap::new(),
1399 identities: HashMap::new(),
1400 signal_to_track: HashMap::new(),
1401 track_to_identity: HashMap::new(),
1402 next_signal_id: SignalId::ZERO,
1403 next_track_id: TrackId::ZERO,
1404 next_identity_id: IdentityId::ZERO,
1405 }
1406 }
1407
    /// Returns the document identifier.
    #[must_use]
    pub fn id(&self) -> &str {
        &self.id
    }
1413
    /// Returns the full document text.
    #[must_use]
    pub fn text(&self) -> &str {
        &self.text
    }
1419
    /// Returns mutable access to the signal list.
    ///
    /// Changes made through this reference bypass the ID counter and the
    /// derived indexes; call `rebuild_indexes` afterwards if signals were
    /// added, removed or re-identified.
    pub fn signals_mut(&mut self) -> &mut Vec<Signal<Location>> {
        &mut self.signals
    }
1424
    /// Returns the tracks keyed by their ID.
    #[must_use]
    pub fn tracks_map(&self) -> &HashMap<TrackId, Track> {
        &self.tracks
    }
1430
    /// Returns mutable access to the track map.
    ///
    /// Changes made through this reference are not reflected in the derived
    /// indexes until `rebuild_indexes` is called.
    pub fn tracks_map_mut(&mut self) -> &mut HashMap<TrackId, Track> {
        &mut self.tracks
    }
1438
    /// Returns the identities keyed by their ID.
    #[must_use]
    pub fn identities_map(&self) -> &HashMap<IdentityId, Identity> {
        &self.identities
    }
1444
    /// Returns mutable access to the identity map.
    ///
    /// Changes made through this reference are not reflected in the
    /// next-identity-ID counter until `rebuild_indexes` is called.
    pub fn identities_map_mut(&mut self) -> &mut HashMap<IdentityId, Identity> {
        &mut self.identities
    }
1452
1453 pub fn rebuild_indexes(&mut self) {
1461 self.signal_to_track.clear();
1462 self.track_to_identity.clear();
1463
1464 for (&track_id, track) in &self.tracks {
1465 for sig_ref in &track.signals {
1466 self.signal_to_track.insert(sig_ref.signal_id, track_id);
1467 }
1468 if let Some(identity_id) = track.identity_id {
1469 self.track_to_identity.insert(track_id, identity_id);
1470 }
1471 }
1472
1473 self.next_signal_id = self
1474 .signals
1475 .iter()
1476 .map(|s| s.id)
1477 .max()
1478 .map_or(SignalId::ZERO, |id| id + 1);
1479 self.next_track_id = self
1480 .tracks
1481 .keys()
1482 .copied()
1483 .max()
1484 .map_or(TrackId::ZERO, |id| id + 1);
1485 self.next_identity_id = self
1486 .identities
1487 .keys()
1488 .copied()
1489 .max()
1490 .map_or(IdentityId::ZERO, |id| id + 1);
1491 }
1492
1493 pub fn add_signal(&mut self, mut signal: Signal<Location>) -> SignalId {
1499 let id = self.next_signal_id;
1500 signal.id = id;
1501 self.signals.push(signal);
1502 self.next_signal_id += 1;
1503 id
1504 }
1505
1506 #[must_use]
1508 pub fn get_signal(&self, id: impl Into<SignalId>) -> Option<&Signal<Location>> {
1509 let id = id.into();
1510 self.signals.iter().find(|s| s.id == id)
1511 }
1512
    /// Returns all signals in insertion order.
    pub fn signals(&self) -> &[Signal<Location>] {
        &self.signals
    }
1517
1518 pub fn add_track(&mut self, mut track: Track) -> TrackId {
1524 let id = self.next_track_id;
1525 track.id = id;
1526
1527 for signal_ref in &track.signals {
1529 self.signal_to_track.insert(signal_ref.signal_id, id);
1530 }
1531
1532 self.tracks.insert(id, track);
1533 self.next_track_id += 1;
1534 id
1535 }
1536
    /// Returns the track with the given ID, if any.
    #[must_use]
    pub fn get_track(&self, id: impl Into<TrackId>) -> Option<&Track> {
        self.tracks.get(&id.into())
    }
1542
    /// Returns mutable access to the track with the given ID, if any.
    ///
    /// Changes to the track's signals or identity made through this
    /// reference are not reflected in the derived indexes until
    /// `rebuild_indexes` is called.
    #[must_use]
    pub fn get_track_mut(&mut self, id: impl Into<TrackId>) -> Option<&mut Track> {
        self.tracks.get_mut(&id.into())
    }
1548
1549 pub fn add_signal_to_track(
1554 &mut self,
1555 signal_id: impl Into<SignalId>,
1556 track_id: impl Into<TrackId>,
1557 position: u32,
1558 ) -> bool {
1559 let signal_id = signal_id.into();
1560 let track_id = track_id.into();
1561 if let Some(track) = self.tracks.get_mut(&track_id) {
1562 track.add_signal(signal_id, position);
1563 self.signal_to_track.insert(signal_id, track_id);
1564 true
1565 } else {
1566 false
1567 }
1568 }
1569
1570 #[must_use]
1572 pub fn track_for_signal(&self, signal_id: SignalId) -> Option<&Track> {
1573 let track_id = self.signal_to_track.get(&signal_id)?;
1574 self.tracks.get(track_id)
1575 }
1576
    /// Iterates over all tracks; the order is that of the underlying
    /// `HashMap` and therefore unspecified.
    pub fn tracks(&self) -> impl Iterator<Item = &Track> {
        self.tracks.values()
    }
1581
1582 pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
1588 let id = self.next_identity_id;
1589 identity.id = id;
1590 self.identities.insert(id, identity);
1591 self.next_identity_id += 1;
1592 id
1593 }
1594
1595 pub fn link_track_to_identity(
1597 &mut self,
1598 track_id: impl Into<TrackId>,
1599 identity_id: impl Into<IdentityId>,
1600 ) {
1601 let track_id = track_id.into();
1602 let identity_id = identity_id.into();
1603 if let Some(track) = self.tracks.get_mut(&track_id) {
1604 track.identity_id = Some(identity_id);
1605 self.track_to_identity.insert(track_id, identity_id);
1606 }
1607 }
1608
    /// Returns the identity with the given ID, if any.
    #[must_use]
    pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
        self.identities.get(&id)
    }
1614
1615 #[must_use]
1617 pub fn identity_for_track(&self, track_id: TrackId) -> Option<&Identity> {
1618 let identity_id = self.track_to_identity.get(&track_id)?;
1619 self.identities.get(identity_id)
1620 }
1621
1622 #[must_use]
1624 pub fn identity_for_signal(&self, signal_id: SignalId) -> Option<&Identity> {
1625 let track_id = self.signal_to_track.get(&signal_id)?;
1626 self.identity_for_track(*track_id)
1627 }
1628
    /// Iterates over all identities; the order is that of the underlying
    /// `HashMap` and therefore unspecified.
    pub fn identities(&self) -> impl Iterator<Item = &Identity> {
        self.identities.values()
    }
1633
1634 #[must_use]
1639 pub fn track_ref(&self, track_id: TrackId) -> Option<TrackRef> {
1640 if self.tracks.contains_key(&track_id) {
1642 Some(TrackRef {
1643 doc_id: self.id.clone(),
1644 track_id,
1645 })
1646 } else {
1647 None
1648 }
1649 }
1650
1651 #[must_use]
1657 pub fn to_entities(&self) -> Vec<Entity> {
1658 self.signals
1659 .iter()
1660 .map(|signal| {
1661 let (start, end) = signal.location.text_offsets().unwrap_or((0, 0));
1662 let track = self.track_for_signal(signal.id);
1663 let identity = track.and_then(|t| self.identity_for_track(t.id));
1664
1665 {
1666 let mut entity = Entity::new(
1667 signal.surface.clone(),
1668 EntityType::from_label(signal.label.as_str()),
1669 start,
1670 end,
1671 signal.confidence,
1672 );
1673 entity.normalized = signal.normalized.clone();
1674 entity.provenance = signal.provenance.clone();
1675 entity.kb_id = identity.and_then(|i| i.kb_id.clone());
1676 entity.canonical_id = track.map(|t| super::types::CanonicalId::new(t.id.get()));
1677 entity.hierarchical_confidence = signal.hierarchical;
1678 if let Location::Discontinuous { segments } = &signal.location {
1679 entity.set_discontinuous_span(DiscontinuousSpan::new(
1680 segments.iter().map(|(s, e)| (*s)..(*e)).collect(),
1681 ));
1682 }
1683 entity
1684 }
1685 })
1686 .collect()
1687 }
1688
1689 #[must_use]
1691 pub fn from_entities(
1692 id: impl Into<String>,
1693 text: impl Into<String>,
1694 entities: &[Entity],
1695 ) -> Self {
1696 let mut doc = Self::new(id, text);
1697
1698 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1704 enum TrackKey {
1705 Canonical(super::types::CanonicalId),
1706 Singleton(usize),
1707 }
1708
1709 let mut tracks_map: HashMap<TrackKey, Vec<SignalId>> = HashMap::new();
1710 let mut signal_to_entity_idx: HashMap<SignalId, usize> = HashMap::new();
1711
1712 for (idx, entity) in entities.iter().enumerate() {
1713 let location = if let Some(disc) = &entity.discontinuous_span {
1714 Location::Discontinuous {
1715 segments: disc.segments().iter().map(|r| (r.start, r.end)).collect(),
1716 }
1717 } else if let Some(visual) = &entity.visual_span {
1718 Location::from(visual)
1719 } else {
1720 Location::text(entity.start(), entity.end())
1721 };
1722
1723 let mut signal = Signal::new(
1724 SignalId::new(idx as u64),
1725 location,
1726 &entity.text,
1727 entity.entity_type.as_label(),
1728 f32::from(entity.confidence),
1729 );
1730 signal.normalized = entity.normalized.clone();
1731 signal.provenance = entity.provenance.clone();
1732 signal.hierarchical = entity.hierarchical_confidence;
1733
1734 let signal_id = doc.add_signal(signal);
1735 signal_to_entity_idx.insert(signal_id, idx);
1736
1737 let key = match entity.canonical_id {
1738 Some(cid) => TrackKey::Canonical(cid),
1739 None => TrackKey::Singleton(idx),
1740 };
1741 tracks_map.entry(key).or_default().push(signal_id);
1742 }
1743
1744 for (_key, signal_ids) in tracks_map {
1746 if let Some(first_signal) = signal_ids.first().and_then(|id| doc.get_signal(*id)) {
1747 let mut track = Track::new(doc.next_track_id, &first_signal.surface);
1748 track.entity_type =
1749 Some(super::types::TypeLabel::from(first_signal.label.as_str()));
1750
1751 for (pos, &signal_id) in signal_ids.iter().enumerate() {
1752 track.add_signal(signal_id, pos as u32);
1753 }
1754
1755 let kb_id = signal_ids.iter().find_map(|sid| {
1758 let ent_idx = signal_to_entity_idx.get(sid).copied()?;
1759 entities.get(ent_idx)?.kb_id.clone()
1760 });
1761 if let Some(kb_id) = kb_id {
1762 let identity = Identity::from_kb(
1763 doc.next_identity_id,
1764 &track.canonical_surface,
1765 "unknown",
1766 kb_id,
1767 );
1768 let identity_id = doc.add_identity(identity);
1769 track = track.with_identity(identity_id);
1770 }
1771
1772 doc.add_track(track);
1773 }
1774 }
1775
1776 doc
1777 }
1778
1779 #[must_use]
1781 pub fn signals_with_label(&self, label: &str) -> Vec<&Signal<Location>> {
1782 let want = super::types::TypeLabel::from(label);
1783 self.signals.iter().filter(|s| s.label == want).collect()
1784 }
1785
1786 #[must_use]
1788 pub fn confident_signals(&self, threshold: Confidence) -> Vec<&Signal<Location>> {
1789 self.signals
1790 .iter()
1791 .filter(|s| s.confidence >= threshold)
1792 .collect()
1793 }
1794
1795 pub fn linked_tracks(&self) -> impl Iterator<Item = &Track> {
1797 self.tracks.values().filter(|t| t.identity_id.is_some())
1798 }
1799
1800 pub fn unlinked_tracks(&self) -> impl Iterator<Item = &Track> {
1802 self.tracks.values().filter(|t| t.identity_id.is_none())
1803 }
1804
1805 #[must_use]
1807 pub fn untracked_signal_count(&self) -> usize {
1808 self.signals
1809 .iter()
1810 .filter(|s| !self.signal_to_track.contains_key(&s.id))
1811 .count()
1812 }
1813
1814 #[must_use]
1816 pub fn untracked_signals(&self) -> Vec<&Signal<Location>> {
1817 self.signals
1818 .iter()
1819 .filter(|s| !self.signal_to_track.contains_key(&s.id))
1820 .collect()
1821 }
1822
1823 #[must_use]
1829 pub fn signals_by_modality(&self, modality: Modality) -> Vec<&Signal<Location>> {
1830 self.signals
1831 .iter()
1832 .filter(|s| s.modality == modality)
1833 .collect()
1834 }
1835
    /// Returns the signals tagged `Modality::Symbolic`.
    #[must_use]
    pub fn text_signals(&self) -> Vec<&Signal<Location>> {
        self.signals_by_modality(Modality::Symbolic)
    }
1841
    /// Returns the signals tagged `Modality::Iconic`.
    #[must_use]
    pub fn visual_signals(&self) -> Vec<&Signal<Location>> {
        self.signals_by_modality(Modality::Iconic)
    }
1847
1848 #[must_use]
1850 pub fn overlapping_signals(&self, location: &Location) -> Vec<&Signal<Location>> {
1851 self.signals
1852 .iter()
1853 .filter(|s| s.location.overlaps(location))
1854 .collect()
1855 }
1856
1857 #[must_use]
1859 pub fn signals_in_range(&self, start: usize, end: usize) -> Vec<&Signal<Location>> {
1860 self.signals
1861 .iter()
1862 .filter(|s| {
1863 if let Some((s_start, s_end)) = s.location.text_offsets() {
1864 s_start >= start && s_end <= end
1865 } else {
1866 false
1867 }
1868 })
1869 .collect()
1870 }
1871
1872 #[must_use]
1874 pub fn negated_signals(&self) -> Vec<&Signal<Location>> {
1875 self.signals.iter().filter(|s| s.negated).collect()
1876 }
1877
1878 #[must_use]
1880 pub fn quantified_signals(&self, quantifier: Quantifier) -> Vec<&Signal<Location>> {
1881 self.signals
1882 .iter()
1883 .filter(|s| s.quantifier == Some(quantifier))
1884 .collect()
1885 }
1886
1887 #[must_use]
1909 pub fn validate(&self) -> Vec<SignalValidationError> {
1910 self.signals
1911 .iter()
1912 .filter_map(|s| s.validate_against(&self.text))
1913 .collect()
1914 }
1915
1916 #[must_use]
1940 pub fn validate_invariants(&self) -> Vec<String> {
1941 let mut errors = Vec::new();
1942
1943 let mut seen_ids = std::collections::HashSet::new();
1945 for signal in &self.signals {
1946 if !seen_ids.insert(signal.id) {
1947 errors.push(format!("Duplicate signal ID: {}", signal.id));
1948 }
1949 }
1950
1951 let signal_ids: std::collections::HashSet<_> = self.signals.iter().map(|s| s.id).collect();
1953
1954 for (track_id, track) in &self.tracks {
1956 for signal_ref in &track.signals {
1957 if !signal_ids.contains(&signal_ref.signal_id) {
1958 errors.push(format!(
1959 "Track {} references non-existent signal {}",
1960 track_id, signal_ref.signal_id
1961 ));
1962 }
1963 }
1964 }
1965
1966 for (signal_id, track_id) in &self.signal_to_track {
1968 if let Some(track) = self.tracks.get(track_id) {
1970 if !track.signals.iter().any(|r| r.signal_id == *signal_id) {
1972 errors.push(format!(
1973 "signal_to_track[{}] = {} but track doesn't contain signal",
1974 signal_id, track_id
1975 ));
1976 }
1977 } else {
1978 errors.push(format!(
1979 "signal_to_track[{}] = {} but track doesn't exist",
1980 signal_id, track_id
1981 ));
1982 }
1983 }
1984
1985 for (track_id, identity_id) in &self.track_to_identity {
1987 if let Some(track) = self.tracks.get(track_id) {
1989 if track.identity_id != Some(*identity_id) {
1990 errors.push(format!(
1991 "track_to_identity[{}] = {} but track.identity_id = {:?}",
1992 track_id, identity_id, track.identity_id
1993 ));
1994 }
1995 } else {
1996 errors.push(format!(
1997 "track_to_identity[{}] = {} but track doesn't exist",
1998 track_id, identity_id
1999 ));
2000 }
2001
2002 if !self.identities.contains_key(identity_id) {
2004 errors.push(format!(
2005 "track_to_identity[{}] = {} but identity doesn't exist",
2006 track_id, identity_id
2007 ));
2008 }
2009 }
2010
2011 for (track_id, track) in &self.tracks {
2013 if let Some(identity_id) = track.identity_id {
2014 if !self.identities.contains_key(&identity_id) {
2015 errors.push(format!(
2016 "Track {} references non-existent identity {}",
2017 track_id, identity_id
2018 ));
2019 }
2020 }
2021 }
2022
2023 errors
2024 }
2025
2026 #[must_use]
2028 pub fn invariants_hold(&self) -> bool {
2029 self.validate_invariants().is_empty()
2030 }
2031
2032 #[must_use]
2034 pub fn is_valid(&self) -> bool {
2035 self.signals.iter().all(|s| s.is_valid(&self.text))
2036 }
2037
2038 pub fn add_signal_validated(
2042 &mut self,
2043 signal: Signal<Location>,
2044 ) -> Result<SignalId, SignalValidationError> {
2045 if let Some(err) = signal.validate_against(&self.text) {
2046 return Err(err);
2047 }
2048 Ok(self.add_signal(signal))
2049 }
2050
2051 pub fn add_signal_from_text(
2065 &mut self,
2066 surface: &str,
2067 label: impl Into<super::types::TypeLabel>,
2068 confidence: f32,
2069 ) -> Option<SignalId> {
2070 let signal = Signal::from_text(&self.text, surface, label, confidence)?;
2071 Some(self.add_signal(signal))
2072 }
2073
2074 pub fn add_signal_from_text_nth(
2076 &mut self,
2077 surface: &str,
2078 label: impl Into<super::types::TypeLabel>,
2079 confidence: f32,
2080 occurrence: usize,
2081 ) -> Option<SignalId> {
2082 let signal = Signal::from_text_nth(&self.text, surface, label, confidence, occurrence)?;
2083 Some(self.add_signal(signal))
2084 }
2085
2086 #[must_use]
2092 pub fn stats(&self) -> DocumentStats {
2093 let signal_count = self.signals.len();
2094 let track_count = self.tracks.len();
2095 let identity_count = self.identities.len();
2096
2097 let linked_track_count = self
2098 .tracks
2099 .values()
2100 .filter(|t| t.identity_id.is_some())
2101 .count();
2102 let untracked_count = self.untracked_signal_count();
2103
2104 let avg_track_size = if track_count > 0 {
2105 self.tracks.values().map(|t| t.len()).sum::<usize>() as f32 / track_count as f32
2106 } else {
2107 0.0
2108 };
2109
2110 let singleton_count = self.tracks.values().filter(|t| t.is_singleton()).count();
2111
2112 let avg_confidence = Confidence::new(if signal_count > 0 {
2113 self.signals
2114 .iter()
2115 .map(|s| s.confidence.value())
2116 .sum::<f64>()
2117 / signal_count as f64
2118 } else {
2119 0.0
2120 });
2121
2122 let negated_count = self.signals.iter().filter(|s| s.negated).count();
2123
2124 let symbolic_count = self
2126 .signals
2127 .iter()
2128 .filter(|s| s.modality == Modality::Symbolic)
2129 .count();
2130 let iconic_count = self
2131 .signals
2132 .iter()
2133 .filter(|s| s.modality == Modality::Iconic)
2134 .count();
2135 let hybrid_count = self
2136 .signals
2137 .iter()
2138 .filter(|s| s.modality == Modality::Hybrid)
2139 .count();
2140
2141 DocumentStats {
2142 signal_count,
2143 track_count,
2144 identity_count,
2145 linked_track_count,
2146 untracked_count,
2147 avg_track_size,
2148 singleton_count,
2149 avg_confidence,
2150 negated_count,
2151 symbolic_count,
2152 iconic_count,
2153 hybrid_count,
2154 }
2155 }
2156
2157 pub fn add_signals(
2165 &mut self,
2166 signals: impl IntoIterator<Item = Signal<Location>>,
2167 ) -> Vec<SignalId> {
2168 signals.into_iter().map(|s| self.add_signal(s)).collect()
2169 }
2170
2171 pub fn create_track_from_signals(
2175 &mut self,
2176 canonical: impl Into<String>,
2177 signal_ids: &[SignalId],
2178 ) -> Option<TrackId> {
2179 if signal_ids.is_empty() {
2180 return None;
2181 }
2182
2183 let mut track = Track::new(TrackId::ZERO, canonical);
2184 for (pos, &id) in signal_ids.iter().enumerate() {
2185 track.add_signal(id, pos as u32);
2186 }
2187 Some(self.add_track(track))
2188 }
2189
2190 pub fn merge_tracks(&mut self, track_ids: &[TrackId]) -> Option<TrackId> {
2195 if track_ids.is_empty() {
2196 return None;
2197 }
2198
2199 let mut all_signals: Vec<SignalRef> = Vec::new();
2201 let mut canonical = String::new();
2202 let mut entity_type = None;
2203
2204 for &track_id in track_ids {
2205 if let Some(track) = self.tracks.get(&track_id) {
2206 if canonical.is_empty() {
2207 canonical = track.canonical_surface.clone();
2208 entity_type = track.entity_type.clone();
2209 }
2210 all_signals.extend(track.signals.iter().cloned());
2211 }
2212 }
2213
2214 if all_signals.is_empty() {
2215 return None;
2216 }
2217
2218 all_signals.sort_by_key(|s| s.position);
2220
2221 for &track_id in track_ids {
2223 self.tracks.remove(&track_id);
2224 }
2225
2226 let mut new_track = Track::new(TrackId::ZERO, canonical);
2228 new_track.entity_type = entity_type;
2229 for (pos, signal_ref) in all_signals.iter().enumerate() {
2230 new_track.add_signal(signal_ref.signal_id, pos as u32);
2231 }
2232
2233 Some(self.add_track(new_track))
2234 }
2235
2236 #[must_use]
2238 pub fn find_overlapping_signal_pairs(&self) -> Vec<(SignalId, SignalId)> {
2239 let mut pairs = Vec::new();
2240 let signals: Vec<_> = self.signals.iter().collect();
2241
2242 for i in 0..signals.len() {
2243 for j in (i + 1)..signals.len() {
2244 if signals[i].location.overlaps(&signals[j].location) {
2245 pairs.push((signals[i].id, signals[j].id));
2246 }
2247 }
2248 }
2249
2250 pairs
2251 }
2252}
2253
2254#[derive(Debug, Clone, Copy, Default)]
2256pub struct DocumentStats {
2257 pub signal_count: usize,
2259 pub track_count: usize,
2261 pub identity_count: usize,
2263 pub linked_track_count: usize,
2265 pub untracked_count: usize,
2267 pub avg_track_size: f32,
2269 pub singleton_count: usize,
2271 pub avg_confidence: Confidence,
2273 pub negated_count: usize,
2275 pub symbolic_count: usize,
2277 pub iconic_count: usize,
2279 pub hybrid_count: usize,
2281}
2282
2283impl std::fmt::Display for DocumentStats {
2284 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2285 writeln!(f, "Document Statistics:")?;
2286 writeln!(
2287 f,
2288 " Signals: {} (avg confidence: {:.2})",
2289 self.signal_count,
2290 self.avg_confidence.value()
2291 )?;
2292 writeln!(
2293 f,
2294 " Tracks: {} (avg size: {:.1}, singletons: {})",
2295 self.track_count, self.avg_track_size, self.singleton_count
2296 )?;
2297 writeln!(
2298 f,
2299 " Identities: {} ({} tracks linked)",
2300 self.identity_count, self.linked_track_count
2301 )?;
2302 writeln!(f, " Untracked signals: {}", self.untracked_count)?;
2303 writeln!(
2304 f,
2305 " Modalities: {} symbolic, {} iconic, {} hybrid",
2306 self.symbolic_count, self.iconic_count, self.hybrid_count
2307 )?;
2308 if self.negated_count > 0 {
2309 writeln!(f, " Negated: {}", self.negated_count)?;
2310 }
2311 Ok(())
2312 }
2313}
2314
2315#[derive(Debug, Clone)]
2325struct IntervalNode {
2326 signal_id: SignalId,
2328 start: usize,
2330 end: usize,
2332 max_end: usize,
2334 left: Option<Box<IntervalNode>>,
2336 right: Option<Box<IntervalNode>>,
2338}
2339
2340impl IntervalNode {
2341 fn new(signal_id: SignalId, start: usize, end: usize) -> Self {
2342 Self {
2343 signal_id,
2344 start,
2345 end,
2346 max_end: end,
2347 left: None,
2348 right: None,
2349 }
2350 }
2351
2352 fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2353 self.max_end = self.max_end.max(end);
2354
2355 if start < self.start {
2356 if let Some(ref mut left) = self.left {
2357 left.insert(signal_id, start, end);
2358 } else {
2359 self.left = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2360 }
2361 } else if let Some(ref mut right) = self.right {
2362 right.insert(signal_id, start, end);
2363 } else {
2364 self.right = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2365 }
2366 }
2367
2368 fn query_overlap(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2369 if self.start < query_end && query_start < self.end {
2371 results.push(self.signal_id);
2372 }
2373
2374 if let Some(ref left) = self.left {
2376 if left.max_end > query_start {
2377 left.query_overlap(query_start, query_end, results);
2378 }
2379 }
2380
2381 if let Some(ref right) = self.right {
2383 if self.start < query_end {
2384 right.query_overlap(query_start, query_end, results);
2385 }
2386 }
2387 }
2388
2389 fn query_containing(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2390 if self.start <= query_start && self.end >= query_end {
2392 results.push(self.signal_id);
2393 }
2394
2395 if let Some(ref left) = self.left {
2397 if left.max_end >= query_end {
2398 left.query_containing(query_start, query_end, results);
2399 }
2400 }
2401
2402 if let Some(ref right) = self.right {
2404 if self.start <= query_start {
2405 right.query_containing(query_start, query_end, results);
2406 }
2407 }
2408 }
2409
2410 fn query_contained_in(
2411 &self,
2412 range_start: usize,
2413 range_end: usize,
2414 results: &mut Vec<SignalId>,
2415 ) {
2416 if self.start >= range_start && self.end <= range_end {
2418 results.push(self.signal_id);
2419 }
2420
2421 if let Some(ref left) = self.left {
2423 left.query_contained_in(range_start, range_end, results);
2424 }
2425
2426 if let Some(ref right) = self.right {
2428 if self.start < range_end {
2429 right.query_contained_in(range_start, range_end, results);
2430 }
2431 }
2432 }
2433}
2434
2435#[derive(Debug, Clone, Default)]
2451pub struct TextSpatialIndex {
2452 root: Option<IntervalNode>,
2453 size: usize,
2454}
2455
2456impl TextSpatialIndex {
2457 #[must_use]
2459 pub fn new() -> Self {
2460 Self::default()
2461 }
2462
2463 #[must_use]
2465 pub fn from_signals(signals: &[Signal<Location>]) -> Self {
2466 let mut index = Self::new();
2467 for signal in signals {
2468 if let Some((start, end)) = signal.location.text_offsets() {
2469 index.insert(signal.id, start, end);
2470 }
2471 }
2472 index
2473 }
2474
2475 pub fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2477 if let Some(ref mut root) = self.root {
2478 root.insert(signal_id, start, end);
2479 } else {
2480 self.root = Some(IntervalNode::new(signal_id, start, end));
2481 }
2482 self.size += 1;
2483 }
2484
2485 #[must_use]
2487 pub fn query_overlap(&self, start: usize, end: usize) -> Vec<SignalId> {
2488 let mut results = Vec::new();
2489 if let Some(ref root) = self.root {
2490 root.query_overlap(start, end, &mut results);
2491 }
2492 results
2493 }
2494
2495 #[must_use]
2497 pub fn query_containing(&self, start: usize, end: usize) -> Vec<SignalId> {
2498 let mut results = Vec::new();
2499 if let Some(ref root) = self.root {
2500 root.query_containing(start, end, &mut results);
2501 }
2502 results
2503 }
2504
2505 #[must_use]
2507 pub fn query_contained_in(&self, start: usize, end: usize) -> Vec<SignalId> {
2508 let mut results = Vec::new();
2509 if let Some(ref root) = self.root {
2510 root.query_contained_in(start, end, &mut results);
2511 }
2512 results
2513 }
2514
2515 #[must_use]
2517 pub fn len(&self) -> usize {
2518 self.size
2519 }
2520
2521 #[must_use]
2523 pub fn is_empty(&self) -> bool {
2524 self.size == 0
2525 }
2526}
2527
2528impl GroundedDocument {
2529 #[must_use]
2548 pub fn build_text_index(&self) -> TextSpatialIndex {
2549 TextSpatialIndex::from_signals(&self.signals)
2550 }
2551
2552 #[must_use]
2557 pub fn query_signals_in_range_indexed(
2558 &self,
2559 start: usize,
2560 end: usize,
2561 ) -> Vec<&Signal<Location>> {
2562 let index = self.build_text_index();
2563 let ids = index.query_contained_in(start, end);
2564 ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2565 }
2566
2567 #[must_use]
2569 pub fn query_overlapping_signals_indexed(
2570 &self,
2571 start: usize,
2572 end: usize,
2573 ) -> Vec<&Signal<Location>> {
2574 let index = self.build_text_index();
2575 let ids = index.query_overlap(start, end);
2576 ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2577 }
2578
2579 #[must_use]
2592 pub fn to_coref_document(&self) -> super::coref::CorefDocument {
2593 use super::coref::{CorefChain, CorefDocument, Mention};
2594 use std::collections::HashMap;
2595
2596 let signal_by_id: HashMap<SignalId, &Signal<Location>> =
2598 self.signals.iter().map(|s| (s.id, s)).collect();
2599
2600 let mut chains: Vec<CorefChain> = Vec::new();
2601
2602 for track in self.tracks.values() {
2603 let mut mentions: Vec<Mention> = Vec::new();
2604
2605 for sref in &track.signals {
2606 let Some(signal) = signal_by_id.get(&sref.signal_id) else {
2607 continue;
2608 };
2609
2610 let Some((start, end)) = signal.location.text_offsets() else {
2611 continue;
2612 };
2613
2614 let mut m = Mention::new(signal.surface.clone(), start, end);
2615 m.entity_type = Some(signal.label.to_string());
2616 mentions.push(m);
2617 }
2618
2619 if mentions.is_empty() {
2620 continue;
2621 }
2622
2623 let mut chain = CorefChain::new(mentions);
2624 chain.entity_type = track.entity_type.as_ref().map(|t| t.to_string());
2625 chains.push(chain);
2626 }
2627
2628 chains.sort_by_key(|c| c.mentions.first().map(|m| m.start).unwrap_or(usize::MAX));
2630
2631 CorefDocument::with_id(&self.text, &self.id, chains)
2632 }
2633}
2634
2635pub fn render_document_html(doc: &GroundedDocument) -> String {
2643 let mut html = String::new();
2644 let stats = doc.stats();
2645
2646 html.push_str(r#"<!DOCTYPE html>
2647<html>
2648<head>
2649<meta charset="UTF-8">
2650<meta name="color-scheme" content="dark light">
2651<title>grounded::GroundedDocument</title>
2652<style>
2653:root{
2654 /* Allow UA widgets (inputs/scrollbars) to match the theme */
2655 color-scheme: light dark;
2656 /* Dark (default) */
2657 --bg:#0a0a0a;
2658 --panel-bg:#0d0d0d;
2659 --text:#b0b0b0;
2660 --text-strong:#fff;
2661 --muted:#666;
2662 --border:#222;
2663 --border-strong:#333;
2664 --hover:#111;
2665 --input-bg:#080808;
2666 --active:#fff;
2667 --track-strong:rgba(255,255,255,0.35);
2668 --track-soft:rgba(255,255,255,0.18);
2669 /* Entity colors (dark) */
2670 --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2671 --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2672 --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2673 --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2674 --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2675 --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2676 --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2677}
2678@media (prefers-color-scheme: light){
2679 :root{
2680 --bg:#ffffff;
2681 --panel-bg:#f7f7f7;
2682 --text:#222;
2683 --text-strong:#000;
2684 --muted:#555;
2685 --border:#d6d6d6;
2686 --border-strong:#c6c6c6;
2687 --hover:#f0f0f0;
2688 --input-bg:#ffffff;
2689 --active:#000;
2690 --track-strong:rgba(0,0,0,0.25);
2691 --track-soft:rgba(0,0,0,0.12);
2692 /* Entity colors (light) */
2693 --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2694 --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2695 --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2696 --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2697 --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2698 --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2699 --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2700 }
2701}
2702html[data-theme='dark']{
2703 --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
2704 --muted:#666; --border:#222; --border-strong:#333; --hover:#111;
2705 --input-bg:#080808; --active:#fff;
2706 --track-strong:rgba(255,255,255,0.35); --track-soft:rgba(255,255,255,0.18);
2707 --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2708 --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2709 --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2710 --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2711 --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2712 --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2713 --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2714}
2715html[data-theme='light']{
2716 --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
2717 --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0;
2718 --input-bg:#ffffff; --active:#000;
2719 --track-strong:rgba(0,0,0,0.25); --track-soft:rgba(0,0,0,0.12);
2720 --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2721 --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2722 --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2723 --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2724 --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2725 --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2726 --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2727}
2728
2729*{box-sizing:border-box;margin:0;padding:0}
2730body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
2731h1,h2,h3{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
2732h1{font-size:14px}h2{font-size:12px}h3{font-size:11px;color:var(--muted)}
2733 a{color:inherit}
2734 a:hover{text-decoration:underline}
2735table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
2736th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
2737th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
2738tr:hover{background:var(--hover)}
2739.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:8px}
2740.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
2741.panel-h{display:flex;align-items:center;gap:8px}
2742.toggle{cursor:pointer;user-select:none;color:var(--muted);border:1px solid var(--border);background:var(--bg);padding:2px 6px;font-size:10px}
2743.panel-collapsed table,.panel-collapsed .panel-body{display:none}
2744.toolbar{display:flex;gap:8px;align-items:center;margin:8px 0 0}
2745.toolbar input{width:100%;max-width:520px;background:var(--input-bg);border:1px solid var(--border);color:var(--text);padding:6px 8px;font:12px monospace}
2746.muted{color:var(--muted)}
2747.panel-body{white-space:pre-wrap;word-break:break-word}
2748.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
2749.e{padding:1px 2px;border-bottom:1px solid}
2750.seg{cursor:pointer}
2751.e-per{background:var(--per-bg);border-color:var(--per-br);color:var(--per-tx)}
2752.e-org{background:var(--org-bg);border-color:var(--org-br);color:var(--org-tx)}
2753.e-loc{background:var(--loc-bg);border-color:var(--loc-br);color:var(--loc-tx)}
2754.e-misc{background:var(--mis-bg);border-color:var(--mis-br);color:var(--mis-tx)}
2755.e-date{background:var(--dat-bg);border-color:var(--dat-br);color:var(--dat-tx)}
2756.e-track{box-shadow:inset 0 0 0 1px var(--track-strong)}
2757.e-track-hover{box-shadow:inset 0 0 0 1px var(--track-soft)}
2758.e-active{outline:2px solid var(--active);outline-offset:1px}
2759.conf{color:var(--muted);font-size:10px}
2760.badge{display:inline-block;padding:1px 4px;font-size:9px;text-transform:uppercase}
2761.badge-y{background:var(--badge-y-bg);color:var(--badge-y-tx);border:1px solid var(--badge-y-br)}
2762.badge-n{background:var(--badge-n-bg);color:var(--badge-n-tx);border:1px solid var(--badge-n-br)}
2763.stats{display:flex;gap:16px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
2764.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
2765.id{color:var(--muted);font-size:9px}
2766.kb{color:var(--muted)}
2767.arrow{color:var(--muted)}
2768</style>
2769</head>
2770<body>
2771"#);
2772
2773 html.push_str(&format!(
2775 r#"<div class="panel-h" style="justify-content:space-between"><h1>doc_id="{}" len={}</h1><span class="toggle" id="theme-toggle" title="toggle theme (auto → dark → light)">theme: auto</span></div>"#,
2776 html_escape(&doc.id),
2777 doc.text.len()
2778 ));
2779
2780 html.push_str(r#"<div class="stats">"#);
2781 html.push_str(&format!(
2782 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">signals</div></div>"#,
2783 stats.signal_count
2784 ));
2785 html.push_str(&format!(
2786 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">tracks</div></div>"#,
2787 stats.track_count
2788 ));
2789 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">identities</div></div>"#, stats.identity_count));
2790 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{:.2}</div><div class="stat-l">avg_conf</div></div>"#, stats.avg_confidence));
2791 html.push_str(&format!(
2792 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">linked</div></div>"#,
2793 stats.linked_track_count
2794 ));
2795 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">untracked</div></div>"#, stats.untracked_count));
2796 if stats.iconic_count > 0 || stats.hybrid_count > 0 {
2797 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}/{}/{}</div><div class="stat-l">sym/ico/hyb</div></div>"#,
2798 stats.symbolic_count, stats.iconic_count, stats.hybrid_count));
2799 }
2800 html.push_str(r#"</div>"#);
2801
2802 html.push_str(r#"<h2>text</h2>"#);
2804 html.push_str(r#"<div class="text-box">"#);
2805 html.push_str(&annotate_text_html(
2806 &doc.text,
2807 doc.signals(),
2808 &doc.signal_to_track,
2809 ));
2810 html.push_str(r#"</div>"#);
2811
2812 html.push_str(
2814 r#"<h2>selection</h2><div class="panel" id="selection-panel" role="region" aria-label="selection"><div class="panel-h"><h3>selection</h3><span class="muted" id="selection-hint" role="status" aria-live="polite">click a mention / row to see coref track details</span></div><pre class="panel-body" id="selection-body" role="textbox" aria-readonly="true" aria-label="selection details">—</pre></div>"#,
2815 );
2816
2817 html.push_str(r#"<div class="grid">"#);
2819
2820 html.push_str(r#"<div class="panel" id="panel-signals"><div class="panel-h"><h3>signals (level 1)</h3><span class="toggle" data-toggle="panel-signals">toggle</span></div><div class="toolbar"><input id="signal-filter" type="text" placeholder="filter signals: id / label / surface (e.g. 'PER', 'S12', 'Paris')" /><span class="muted" id="signal-filter-count"></span></div><table id="signals-table">"#);
2822 html.push_str(r#"<tr><th>id</th><th>span</th><th>surface</th><th>label</th><th>conf</th><th>track</th></tr>"#);
2823 for signal in doc.signals() {
2824 let (span, start_opt, end_opt) = if let Some((s, e)) = signal.location.text_offsets() {
2825 (format!("[{},{})", s, e), Some(s), Some(e))
2826 } else {
2827 ("bbox".to_string(), None, None)
2828 };
2829 let track_id_num = doc.signal_to_track.get(&signal.id).copied();
2830 let track_id = track_id_num
2831 .map(|t| format!("T{}", t))
2832 .unwrap_or_else(|| "-".to_string());
2833 let track_attr = track_id_num
2834 .map(|t| format!(r#" data-track="{}""#, t))
2835 .unwrap_or_default();
2836 let offs_attr = match (start_opt, end_opt) {
2837 (Some(s), Some(e)) => format!(r#" data-start="{}" data-end="{}""#, s, e),
2838 _ => String::new(),
2839 };
2840 let neg = if signal.negated { " NEG" } else { "" };
2841 html.push_str(&format!(
2842 r#"<tr data-sid="S{sid}" data-label="{label}" data-surface="{surface}"{track_attr}{offs_attr} data-conf="{conf:.2}"><td class="id"><a href='#S{sid}'>S{sid}</a></td><td>{span}</td><td>{surface}</td><td>{label}{neg}</td><td class="conf">{conf:.2}</td><td class="id">{track}</td></tr>"#,
2843 sid = signal.id,
2844 span = span,
2845 surface = html_escape(&signal.surface),
2846 label = html_escape(signal.label.as_str()),
2847 neg = neg,
2848 conf = signal.confidence.value(),
2849 track = track_id,
2850 track_attr = track_attr,
2851 offs_attr = offs_attr
2852 ));
2853 }
2854 html.push_str(r#"</table></div>"#);
2855
2856 html.push_str(r#"<div class="panel" id="panel-tracks"><div class="panel-h"><h3>tracks (level 2)</h3><span class="toggle" data-toggle="panel-tracks">toggle</span></div><table id="tracks-table">"#);
2858 html.push_str(r#"<tr><th>id</th><th>canonical</th><th>type</th><th>|S|</th><th>signals</th><th>identity</th></tr>"#);
2859 for track in doc.tracks() {
2860 let entity_type = track
2861 .entity_type
2862 .as_ref()
2863 .map(|t| t.as_str())
2864 .unwrap_or("-");
2865 let signals: Vec<String> = track
2866 .signals
2867 .iter()
2868 .map(|s| format!("S{}", s.signal_id))
2869 .collect();
2870 let identity = doc
2871 .identity_for_track(track.id)
2872 .map(|i| format!("I{}", i.id))
2873 .unwrap_or_else(|| "-".to_string());
2874 let linked_badge = if track.identity_id.is_some() {
2875 r#"<span class="badge badge-y">y</span>"#
2876 } else {
2877 r#"<span class="badge badge-n">n</span>"#
2878 };
2879 html.push_str(&format!(
2880 r#"<tr data-tid="{tid}"><td class="id">T{tid}</td><td>{canonical_surface}</td><td>{etype}</td><td>{n}</td><td class="id">{sigs}</td><td class="id">{ident} {badge}</td></tr>"#,
2881 tid = track.id,
2882 canonical_surface = html_escape(&track.canonical_surface),
2883 etype = html_escape(entity_type),
2884 n = track.len(),
2885 sigs = html_escape(&signals.join(" ")),
2886 ident = identity,
2887 badge = linked_badge
2888 ));
2889 }
2890 html.push_str(r#"</table></div>"#);
2891
2892 html.push_str(r#"<div class="panel" id="panel-identities"><div class="panel-h"><h3>identities (level 3)</h3><span class="toggle" data-toggle="panel-identities">toggle</span></div><table>"#);
2894 html.push_str(r#"<tr><th>id</th><th>name</th><th>type</th><th>kb</th><th>kb_id</th><th>aliases</th></tr>"#);
2895 for identity in doc.identities() {
2896 let kb = identity.kb_name.as_deref().unwrap_or("-");
2897 let kb_id = identity.kb_id.as_deref().unwrap_or("-");
2898 let entity_type = identity
2899 .entity_type
2900 .as_ref()
2901 .map(|t| t.as_str())
2902 .unwrap_or("-");
2903 let aliases = if identity.aliases.is_empty() {
2904 "-".to_string()
2905 } else {
2906 identity.aliases.join(", ")
2907 };
2908 html.push_str(&format!(
2909 r#"<tr><td class="id">I{}</td><td>{}</td><td>{}</td><td class="kb">{}</td><td class="kb">{}</td><td>{}</td></tr>"#,
2910 identity.id, html_escape(&identity.canonical_name), entity_type, kb, kb_id, html_escape(&aliases)
2911 ));
2912 }
2913 html.push_str(r#"</table></div>"#);
2914
2915 html.push_str(r#"</div>"#); html.push_str(r#"<h2>hierarchy trace</h2><div class="panel"><table>"#);
2919 html.push_str(r#"<tr><th>signal</th><th></th><th>track</th><th></th><th>identity</th><th>kb_id</th></tr>"#);
2920 for signal in doc.signals() {
2921 let track = doc.track_for_signal(signal.id);
2922 let identity = doc.identity_for_signal(signal.id);
2923
2924 let track_str = track
2925 .map(|t| format!("T{} \"{}\"", t.id, html_escape(&t.canonical_surface)))
2926 .unwrap_or_else(|| "-".to_string());
2927 let identity_str = identity
2928 .map(|i| format!("I{} \"{}\"", i.id, html_escape(&i.canonical_name)))
2929 .unwrap_or_else(|| "-".to_string());
2930 let kb_str = identity
2931 .and_then(|i| i.kb_id.as_ref())
2932 .map(|s| s.as_str())
2933 .unwrap_or("-");
2934
2935 html.push_str(&format!(
2936 r#"<tr><td>S{} "{}"</td><td class="arrow">→</td><td>{}</td><td class="arrow">→</td><td>{}</td><td class="kb">{}</td></tr>"#,
2937 signal.id, html_escape(&signal.surface), track_str, identity_str, kb_str
2938 ));
2939 }
2940 html.push_str(r#"</table></div>"#);
2941
2942 html.push_str(r#"<script>
2945(() => {
2946 // Index signal metadata from the signals table, and map signal/track → text elements.
2947 const signalMeta = new Map();
2948 document.querySelectorAll('#signals-table tr[data-sid]').forEach((row) => {
2949 const sid = row.getAttribute('data-sid');
2950 if (!sid) return;
2951 signalMeta.set(sid, {
2952 sid,
2953 label: row.getAttribute('data-label') || '',
2954 surface: row.getAttribute('data-surface') || '',
2955 conf: row.getAttribute('data-conf') || '',
2956 start: row.getAttribute('data-start'),
2957 end: row.getAttribute('data-end'),
2958 track: row.getAttribute('data-track'),
2959 });
2960 });
2961
2962 const signalEls = new Map();
2963 const addSignalEl = (sid, el) => {
2964 if (!sid || !el) return;
2965 const arr = signalEls.get(sid) || [];
2966 arr.push(el);
2967 signalEls.set(sid, arr);
2968 };
2969 // Old-style inline spans (non-overlapping renderer).
2970 document.querySelectorAll('span.e[data-sid]').forEach((el) => {
2971 addSignalEl(el.getAttribute('data-sid'), el);
2972 });
2973 // Segmented spans (overlap/discontinuous-safe renderer).
2974 document.querySelectorAll('span.seg[data-sids]').forEach((el) => {
2975 const raw = (el.getAttribute('data-sids') || '').trim();
2976 if (!raw) return;
2977 raw.split(/\s+/).filter(Boolean).forEach((sid) => addSignalEl(sid, el));
2978 });
2979
2980 const trackEls = new Map();
2981 for (const [sid, els] of signalEls.entries()) {
2982 const meta = signalMeta.get(sid);
2983 const tid = meta ? meta.track : null;
2984 if (!tid) continue;
2985 const arr = trackEls.get(tid) || [];
2986 els.forEach((el) => arr.push(el));
2987 trackEls.set(tid, arr);
2988 }
2989
2990 const selectionBody = document.getElementById('selection-body');
2991 const selectionHint = document.getElementById('selection-hint');
2992 const defaultHint = selectionHint ? (selectionHint.textContent || '') : '';
2993 const setSelection = (text) => {
2994 if (!selectionBody) return;
2995 selectionBody.textContent = text;
2996 };
2997 const setHint = (text) => {
2998 if (!selectionHint) return;
2999 selectionHint.textContent = text || defaultHint;
3000 };
3001
3002 // Theme toggle: auto (prefers-color-scheme) → dark → light.
3003 const themeBtn = document.getElementById('theme-toggle');
3004 const themeKey = 'anno-theme';
3005 const applyTheme = (theme) => {
3006 const t = theme || 'auto';
3007 if (t === 'auto') {
3008 delete document.documentElement.dataset.theme;
3009 } else {
3010 document.documentElement.dataset.theme = t;
3011 }
3012 if (themeBtn) themeBtn.textContent = `theme: ${t}`;
3013 };
3014 const readTheme = () => {
3015 try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
3016 };
3017 const writeTheme = (t) => {
3018 try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
3019 };
3020 applyTheme(readTheme());
3021 if (themeBtn) {
3022 themeBtn.addEventListener('click', () => {
3023 const cur = readTheme();
3024 const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
3025 writeTheme(next);
3026 applyTheme(next);
3027 });
3028 }
3029
3030 let activeSignalEls = [];
3031 let activeSignalRow = null;
3032 const clearActive = () => {
3033 if (activeSignalEls && activeSignalEls.length) {
3034 activeSignalEls.forEach((el) => el.classList.remove('e-active'));
3035 }
3036 if (activeSignalRow) activeSignalRow.classList.remove('e-active');
3037 activeSignalEls = [];
3038 activeSignalRow = null;
3039 };
3040
3041 let activeTrack = null;
3042 let hoverTrack = null;
3043
3044 const removeTrackClass = (tid, cls) => {
3045 if (!tid) return;
3046 const els = trackEls.get(tid);
3047 if (!els) return;
3048 els.forEach((el) => el.classList.remove(cls));
3049 };
3050
3051 const addTrackClass = (tid, cls) => {
3052 if (!tid) return;
3053 const els = trackEls.get(tid);
3054 if (!els) return;
3055 els.forEach((el) => el.classList.add(cls));
3056 };
3057
3058 const trackSize = (tid) => {
3059 const els = tid ? trackEls.get(tid) : null;
3060 return els ? els.length : 0;
3061 };
3062
3063 const getTrackSelectionText = (tid) => {
3064 if (!tid) return 'track: - (untracked)';
3065 const row = document.querySelector(`#tracks-table tr[data-tid='${tid}']`);
3066 if (!row) return `track T${tid}`;
3067 const cells = row.querySelectorAll('td');
3068 const canonical = (cells[1]?.textContent || '').trim();
3069 const etype = (cells[2]?.textContent || '').trim();
3070 const count = (cells[3]?.textContent || '').trim();
3071 const sigs = (cells[4]?.textContent || '').trim();
3072 const lines = [];
3073 lines.push(`track T${tid} canonical="${canonical}" type="${etype}" mentions=${count}`);
3074 if (sigs) lines.push(`track signals: ${sigs}`);
3075 return lines.join('\n');
3076 };
3077
3078 const renderTrackSelection = (tid) => setSelection(getTrackSelectionText(tid));
3079
3080 const renderSignalSelectionBySid = (sid) => {
3081 const meta = signalMeta.get(sid);
3082 const label = meta ? (meta.label || '') : '';
3083 const conf = meta ? (meta.conf || '') : '';
3084 const start = meta ? meta.start : null;
3085 const end = meta ? meta.end : null;
3086 const tid = meta ? meta.track : null;
3087 const lines = [];
3088 if (start !== null && end !== null) {
3089 lines.push(`signal ${sid} label=${label} conf=${conf} span=[${start},${end})`);
3090 } else {
3091 lines.push(`signal ${sid} label=${label} conf=${conf}`);
3092 }
3093 if (meta && meta.surface) lines.push(`surface: ${meta.surface}`);
3094 lines.push('');
3095 lines.push(getTrackSelectionText(tid));
3096 setSelection(lines.join('\n'));
3097 };
3098
3099 const setActiveTrack = (tid) => {
3100 const next = tid || null;
3101 if (activeTrack === next) return;
3102 removeTrackClass(activeTrack, 'e-track');
3103 activeTrack = next;
3104 if (activeTrack) addTrackClass(activeTrack, 'e-track');
3105 if (hoverTrack && activeTrack && hoverTrack === activeTrack) {
3106 removeTrackClass(hoverTrack, 'e-track-hover');
3107 }
3108 };
3109
3110 const setHoverTrack = (tid) => {
3111 const next = tid || null;
3112 if (hoverTrack === next) return;
3113 removeTrackClass(hoverTrack, 'e-track-hover');
3114 hoverTrack = next;
3115 if (!hoverTrack) {
3116 setHint('');
3117 return;
3118 }
3119 if (activeTrack && hoverTrack === activeTrack) {
3120 setHint(`selected track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3121 return;
3122 }
3123 addTrackClass(hoverTrack, 'e-track-hover');
3124 setHint(`hover track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3125 };
3126
3127 const emitToParentSpan = (start, end) => {
3128 try {
3129 if (!window.parent || window.parent === window) return;
3130 if (start === null || end === null) return;
3131 window.parent.postMessage({ type: 'anno:activate-span', start: Number(start), end: Number(end) }, '*');
3132 } catch (_) {
3133 // ignore: best-effort bridge for iframe containers
3134 }
3135 };
3136
3137 const activateBySpan = (start, end, emit) => {
3138 if (start === null || end === null || start === undefined || end === undefined) return;
3139 // Prefer an exact signal span if present; otherwise fall back to the table row metadata.
3140 const el = document.querySelector(`span.e[data-sid][data-start='${start}'][data-end='${end}']`);
3141 if (el) {
3142 const sid = el.getAttribute('data-sid');
3143 if (sid) activateSignal(sid, emit);
3144 return;
3145 }
3146 const row = document.querySelector(`#signals-table tr[data-start='${start}'][data-end='${end}']`);
3147 if (!row) return;
3148 const sid = row.getAttribute('data-sid');
3149 if (!sid) return;
3150 activateSignal(sid, emit);
3151 };
3152
3153 const activateSignal = (sid, emit) => {
3154 clearActive();
3155 const els = signalEls.get(sid) || [];
3156 if (!els.length) return;
3157 els.forEach((el) => el.classList.add('e-active'));
3158 activeSignalEls = els;
3159 const row = document.querySelector(`#signals-table tr[data-sid='${sid}']`);
3160 if (row) {
3161 row.classList.add('e-active');
3162 activeSignalRow = row;
3163 }
3164 const primaryEl = els[0];
3165 primaryEl.scrollIntoView({ block: 'center', behavior: 'smooth' });
3166 const meta = signalMeta.get(sid);
3167 const tid = meta ? meta.track : primaryEl.getAttribute('data-track');
3168 setActiveTrack(tid);
3169 renderSignalSelectionBySid(sid);
3170 if (emit && meta && meta.start !== null && meta.end !== null) {
3171 emitToParentSpan(meta.start, meta.end);
3172 }
3173 };
3174
3175 // Table click
3176 const signalsTable = document.getElementById('signals-table');
3177 if (signalsTable) {
3178 signalsTable.addEventListener('click', (ev) => {
3179 const a = ev.target && ev.target.closest ? ev.target.closest("a[href^='#S']") : null;
3180 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3181 const sid = (a && a.getAttribute('href') ? a.getAttribute('href').slice(1) : null) || (row ? row.getAttribute('data-sid') : null);
3182 if (!sid) return;
3183 ev.preventDefault();
3184 activateSignal(sid, true);
3185 history.replaceState(null, '', '#' + sid);
3186 });
3187
3188 // Hover a signals row → preview track highlight
3189 signalsTable.addEventListener('mouseover', (ev) => {
3190 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3191 if (!row) return;
3192 const tid = row.getAttribute('data-track');
3193 setHoverTrack(tid);
3194 });
3195 signalsTable.addEventListener('mouseout', (ev) => {
3196 const to = ev.relatedTarget;
3197 if (to && signalsTable.contains(to)) return;
3198 setHoverTrack(null);
3199 });
3200 }
3201
3202 // Clicking an inline entity should also toggle active highlight.
3203 const pickPrimarySid = (el) => {
3204 if (!el) return null;
3205 const p = el.getAttribute('data-primary');
3206 if (p) return p;
3207 const raw = (el.getAttribute('data-sids') || '').trim();
3208 if (!raw) return null;
3209 const sids = raw.split(/\s+/).filter(Boolean);
3210 if (!sids.length) return null;
3211 // Prefer the shortest mention span from metadata.
3212 let best = sids[0];
3213 let bestLen = null;
3214 for (const sid of sids) {
3215 const meta = signalMeta.get(sid);
3216 const s = meta && meta.start !== null ? Number(meta.start) : null;
3217 const e = meta && meta.end !== null ? Number(meta.end) : null;
3218 const len = (s !== null && e !== null) ? (e - s) : null;
3219 if (len === null) continue;
3220 if (bestLen === null || len < bestLen) {
3221 best = sid;
3222 bestLen = len;
3223 }
3224 }
3225 return best;
3226 };
3227
3228 document.addEventListener('click', (ev) => {
3229 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3230 if (span) {
3231 activateSignal(span.getAttribute('data-sid'), true);
3232 return;
3233 }
3234 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3235 if (!seg) return;
3236 activateSignal(pickPrimarySid(seg), true);
3237 });
3238
3239 // Hover an inline entity → preview highlight its track
3240 document.addEventListener('mouseover', (ev) => {
3241 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3242 if (span) {
3243 setHoverTrack(span.getAttribute('data-track'));
3244 return;
3245 }
3246 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3247 if (!seg) return;
3248 const sid = pickPrimarySid(seg);
3249 const meta = sid ? signalMeta.get(sid) : null;
3250 setHoverTrack(meta ? meta.track : null);
3251 });
3252 document.addEventListener('mouseout', (ev) => {
3253 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3254 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3255 if (!span && !seg) return;
3256 const to = ev.relatedTarget;
3257 if (to && to.closest && (to.closest('span.e[data-sid]') || to.closest('span.seg[data-sids]'))) return;
3258 setHoverTrack(null);
3259 });
3260
3261 // Clicking a track row → select track (highlight + details)
3262 const tracksTable = document.getElementById('tracks-table');
3263 if (tracksTable) {
3264 tracksTable.addEventListener('click', (ev) => {
3265 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3266 if (!row) return;
3267 const tid = row.getAttribute('data-tid');
3268 setActiveTrack(tid);
3269 renderTrackSelection(tid);
3270 });
3271 tracksTable.addEventListener('mouseover', (ev) => {
3272 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3273 if (!row) return;
3274 setHoverTrack(row.getAttribute('data-tid'));
3275 });
3276 tracksTable.addEventListener('mouseout', (ev) => {
3277 const to = ev.relatedTarget;
3278 if (to && tracksTable.contains(to)) return;
3279 setHoverTrack(null);
3280 });
3281 }
3282
3283 // Filter
3284 const input = document.getElementById('signal-filter');
3285 const countEl = document.getElementById('signal-filter-count');
3286 if (input && signalsTable) {
3287 const update = () => {
3288 const q = (input.value || '').trim().toLowerCase();
3289 let shown = 0;
3290 const rows = signalsTable.querySelectorAll('tr[data-sid]');
3291 rows.forEach(row => {
3292 const sid = (row.getAttribute('data-sid') || '').toLowerCase();
3293 const label = (row.getAttribute('data-label') || '').toLowerCase();
3294 const surface = (row.getAttribute('data-surface') || '').toLowerCase();
3295 const ok = !q || sid.includes(q) || label.includes(q) || surface.includes(q);
3296 row.style.display = ok ? '' : 'none';
3297 if (ok) shown += 1;
3298 });
3299 if (countEl) countEl.textContent = shown + ' shown';
3300 };
3301 input.addEventListener('input', update);
3302 update();
3303 }
3304
3305 // Panel toggles
3306 document.querySelectorAll('[data-toggle]').forEach(btn => {
3307 btn.addEventListener('click', () => {
3308 const id = btn.getAttribute('data-toggle');
3309 const panel = id ? document.getElementById(id) : null;
3310 if (!panel) return;
3311 panel.classList.toggle('panel-collapsed');
3312 });
3313 });
3314
3315 // If URL hash is #S123, focus it.
3316 const hash = (location.hash || '').slice(1);
3317 if (hash && hash.startsWith('S')) activateSignal(hash, false);
3318
3319 // Optional: allow parent pages (e.g., dataset explorers) to sync selection across iframes.
3320 window.addEventListener('message', (ev) => {
3321 const data = ev && ev.data ? ev.data : null;
3322 if (!data || data.type !== 'anno:activate-span') return;
3323 if (typeof data.start !== 'number' || typeof data.end !== 'number') return;
3324 activateBySpan(data.start, data.end, false);
3325 });
3326})();
3327</script>"#);
3328
3329 html.push_str(r#"</body></html>"#);
3330 html
3331}
3332
/// Escape the characters that are significant in HTML text and in
/// double-quoted attribute values: `&`, `<`, `>` and `"`.
///
/// Every other character is copied through unchanged. The input is scanned
/// once, so an `&` that is already part of an entity in `s` is escaped again
/// (`"&lt;"` becomes `"&amp;lt;"`), and entities produced here are never
/// re-escaped within the same call.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3339
3340fn annotate_text_html(
3341 text: &str,
3342 signals: &[Signal<Location>],
3343 signal_to_track: &std::collections::HashMap<SignalId, TrackId>,
3344) -> String {
3345 let char_count = text.chars().count();
3346 if char_count == 0 {
3347 return String::new();
3348 }
3349
3350 #[derive(Debug, Clone)]
3351 struct SigMeta {
3352 sid: String,
3353 label: String,
3354 conf: f64,
3355 track_id: Option<TrackId>,
3356 covered_len: usize,
3357 }
3358
3359 #[derive(Debug, Clone)]
3360 struct Event {
3361 pos: usize,
3362 meta_idx: usize,
3363 delta: i32, }
3365
3366 let mut metas: Vec<SigMeta> = Vec::new();
3368 let mut events: Vec<Event> = Vec::new();
3369 let mut boundaries: Vec<usize> = vec![0, char_count];
3370
3371 for s in signals {
3372 let raw_segments: Vec<(usize, usize)> = match &s.location {
3373 Location::Text { start, end } => vec![(*start, *end)],
3374 Location::Discontinuous { segments } => segments.clone(),
3375 };
3376 if raw_segments.is_empty() {
3377 continue;
3378 }
3379
3380 let mut cleaned: Vec<(usize, usize)> = Vec::new();
3381 let mut covered_len = 0usize;
3382 for (start, end) in raw_segments {
3383 let start = start.min(char_count);
3384 let end = end.min(char_count);
3385 if start >= end {
3386 continue;
3387 }
3388 covered_len = covered_len.saturating_add(end - start);
3389 cleaned.push((start, end));
3390 }
3391 if cleaned.is_empty() {
3392 continue;
3393 }
3394
3395 let meta_idx = metas.len();
3396 let track_id = signal_to_track.get(&s.id).copied();
3397 metas.push(SigMeta {
3398 sid: format!("S{}", s.id),
3399 label: s.label.to_string(),
3400 conf: s.confidence.value(),
3401 track_id,
3402 covered_len,
3403 });
3404
3405 for (start, end) in cleaned {
3406 boundaries.push(start);
3407 boundaries.push(end);
3408 events.push(Event {
3409 pos: start,
3410 meta_idx,
3411 delta: 1,
3412 });
3413 events.push(Event {
3414 pos: end,
3415 meta_idx,
3416 delta: -1,
3417 });
3418 }
3419 }
3420
3421 if metas.is_empty() {
3422 return html_escape(text);
3423 }
3424
3425 boundaries.sort_unstable();
3426 boundaries.dedup();
3427 events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
3428
3429 let mut active_counts: Vec<u32> = vec![0; metas.len()];
3430 let mut active: Vec<usize> = Vec::new();
3431 let mut ev_idx = 0usize;
3432
3433 let mut result = String::new();
3434
3435 for bi in 0..boundaries.len().saturating_sub(1) {
3436 let pos = boundaries[bi];
3437 while ev_idx < events.len() && events[ev_idx].pos == pos {
3439 let e = &events[ev_idx];
3440 let idx = e.meta_idx;
3441 if e.delta < 0 {
3442 if active_counts[idx] > 0 {
3443 active_counts[idx] -= 1;
3444 if active_counts[idx] == 0 {
3445 active.retain(|&x| x != idx);
3446 }
3447 }
3448 } else {
3449 active_counts[idx] += 1;
3450 if active_counts[idx] == 1 {
3451 active.push(idx);
3452 }
3453 }
3454 ev_idx += 1;
3455 }
3456
3457 let next = boundaries[bi + 1];
3458 if next <= pos {
3459 continue;
3460 }
3461
3462 let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
3463 if active.is_empty() {
3464 result.push_str(&html_escape(&seg_text));
3465 continue;
3466 }
3467
3468 let primary_idx = active
3470 .iter()
3471 .copied()
3472 .min_by(|a, b| {
3473 metas[*a]
3474 .covered_len
3475 .cmp(&metas[*b].covered_len)
3476 .then_with(|| {
3477 metas[*b]
3478 .conf
3479 .partial_cmp(&metas[*a].conf)
3480 .unwrap_or(std::cmp::Ordering::Equal)
3481 })
3482 })
3483 .unwrap_or(active[0]);
3484 let primary = &metas[primary_idx];
3485
3486 let class = match primary.label.to_uppercase().as_str() {
3487 "PER" | "PERSON" => "e-per",
3488 "ORG" | "ORGANIZATION" | "COMPANY" => "e-org",
3489 "LOC" | "LOCATION" | "GPE" => "e-loc",
3490 "DATE" | "TIME" => "e-date",
3491 _ => "e-misc",
3492 };
3493
3494 let mut sids: Vec<&str> = active.iter().map(|i| metas[*i].sid.as_str()).collect();
3495 sids.sort_unstable();
3496 let data_sids = sids.join(" ");
3497
3498 let mut title = format!(
3499 "sids=[{}] primary={} [{}..{})",
3500 data_sids, primary.sid, pos, next
3501 );
3502 if let Some(t) = primary.track_id {
3503 title.push_str(&format!(" track=T{}", t));
3504 }
3505
3506 result.push_str(&format!(
3507 r#"<span class="e seg {class}" data-sids="{sids}" data-start="{start}" data-end="{end}" data-primary="{primary}" title="{title}">{text}</span>"#,
3508 class = class,
3509 sids = html_escape(&data_sids),
3510 start = pos,
3511 end = next,
3512 primary = html_escape(&primary.sid),
3513 title = html_escape(&title),
3514 text = html_escape(&seg_text),
3515 ));
3516 }
3517
3518 result
3519}
3520
/// Result of comparing predicted signals against gold signals over one text.
///
/// `EvalComparison::compare` builds a value of this type; the counting and
/// scoring methods read `matches`, `gold` and `predicted`.
#[derive(Debug, Clone)]
pub struct EvalComparison {
    /// The text the signals refer to (`compare` stores an owned copy of its
    /// `text` argument here).
    pub text: String,
    /// Gold (reference) signals.
    pub gold: Vec<Signal<Location>>,
    /// Predicted signals being evaluated.
    pub predicted: Vec<Signal<Location>>,
    /// One entry per matched pair, unmatched prediction, or unmatched gold
    /// signal, as produced by `compare`.
    pub matches: Vec<EvalMatch>,
}
3537
/// Outcome recorded by `EvalComparison::compare` for a prediction, a gold
/// signal, or a matched pair of the two.
#[derive(Debug, Clone)]
pub enum EvalMatch {
    /// Prediction and gold have identical text offsets and equal labels.
    Correct {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
    },
    /// Prediction and gold have identical text offsets but different labels.
    TypeMismatch {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
        /// Label of the gold signal, rendered as a string.
        gold_label: String,
        /// Label of the predicted signal, rendered as a string.
        pred_label: String,
    },
    /// Prediction and gold overlap but their text offsets are not identical.
    BoundaryError {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
        /// Value of `Location::iou` between the two locations; `compare`
        /// stores `0.0` when that returns `None`.
        iou: f64,
    },
    /// Prediction that was not paired with any gold signal (false positive).
    Spurious {
        /// Id of the predicted signal.
        pred_id: SignalId,
    },
    /// Gold signal that was not paired with any prediction (false negative).
    Missed {
        /// Id of the gold signal.
        gold_id: SignalId,
    },
}
3579
3580impl EvalComparison {
3581 #[must_use]
3601 pub fn compare(
3602 text: &str,
3603 gold: Vec<Signal<Location>>,
3604 predicted: Vec<Signal<Location>>,
3605 ) -> Self {
3606 let mut matches = Vec::new();
3607 let mut gold_matched = vec![false; gold.len()];
3608 let mut pred_matched = vec![false; predicted.len()];
3609
3610 for (pi, pred) in predicted.iter().enumerate() {
3612 let pred_offsets = match pred.location.text_offsets() {
3613 Some(o) => o,
3614 None => continue,
3615 };
3616
3617 for (gi, g) in gold.iter().enumerate() {
3618 if gold_matched[gi] {
3619 continue;
3620 }
3621 let gold_offsets = match g.location.text_offsets() {
3622 Some(o) => o,
3623 None => continue,
3624 };
3625
3626 if pred_offsets == gold_offsets {
3628 if pred.label == g.label {
3629 matches.push(EvalMatch::Correct {
3630 gold_id: g.id,
3631 pred_id: pred.id,
3632 });
3633 } else {
3634 matches.push(EvalMatch::TypeMismatch {
3635 gold_id: g.id,
3636 pred_id: pred.id,
3637 gold_label: g.label.to_string(),
3638 pred_label: pred.label.to_string(),
3639 });
3640 }
3641 gold_matched[gi] = true;
3642 pred_matched[pi] = true;
3643 break;
3644 }
3645 }
3646 }
3647
3648 for (pi, pred) in predicted.iter().enumerate() {
3650 if pred_matched[pi] {
3651 continue;
3652 }
3653 let pred_offsets = match pred.location.text_offsets() {
3654 Some(o) => o,
3655 None => continue,
3656 };
3657
3658 for (gi, g) in gold.iter().enumerate() {
3659 if gold_matched[gi] {
3660 continue;
3661 }
3662 let gold_offsets = match g.location.text_offsets() {
3663 Some(o) => o,
3664 None => continue,
3665 };
3666
3667 if pred_offsets.0 < gold_offsets.1 && pred_offsets.1 > gold_offsets.0 {
3669 let iou = pred.location.iou(&g.location).unwrap_or(0.0);
3670 matches.push(EvalMatch::BoundaryError {
3671 gold_id: g.id,
3672 pred_id: pred.id,
3673 iou,
3674 });
3675 gold_matched[gi] = true;
3676 pred_matched[pi] = true;
3677 break;
3678 }
3679 }
3680 }
3681
3682 for (pi, pred) in predicted.iter().enumerate() {
3684 if !pred_matched[pi] {
3685 matches.push(EvalMatch::Spurious { pred_id: pred.id });
3686 }
3687 }
3688
3689 for (gi, g) in gold.iter().enumerate() {
3691 if !gold_matched[gi] {
3692 matches.push(EvalMatch::Missed { gold_id: g.id });
3693 }
3694 }
3695
3696 Self {
3697 text: text.to_string(),
3698 gold,
3699 predicted,
3700 matches,
3701 }
3702 }
3703
3704 #[must_use]
3706 pub fn correct_count(&self) -> usize {
3707 self.matches
3708 .iter()
3709 .filter(|m| matches!(m, EvalMatch::Correct { .. }))
3710 .count()
3711 }
3712
3713 #[must_use]
3715 pub fn error_count(&self) -> usize {
3716 self.matches.len() - self.correct_count()
3717 }
3718
3719 #[must_use]
3721 pub fn precision(&self) -> f64 {
3722 if self.predicted.is_empty() {
3723 0.0
3724 } else {
3725 self.correct_count() as f64 / self.predicted.len() as f64
3726 }
3727 }
3728
3729 #[must_use]
3731 pub fn recall(&self) -> f64 {
3732 if self.gold.is_empty() {
3733 0.0
3734 } else {
3735 self.correct_count() as f64 / self.gold.len() as f64
3736 }
3737 }
3738
3739 #[must_use]
3741 pub fn f1(&self) -> f64 {
3742 let p = self.precision();
3743 let r = self.recall();
3744 if p + r > 0.0 {
3745 2.0 * p * r / (p + r)
3746 } else {
3747 0.0
3748 }
3749 }
3750}
3751
3752pub fn render_eval_html(cmp: &EvalComparison) -> String {
3756 render_eval_html_with_title(cmp, "eval comparison")
3757}
3758
3759#[must_use]
3763pub fn render_eval_html_with_title(cmp: &EvalComparison, title: &str) -> String {
3764 let mut html = String::new();
3765 let title = html_escape(title);
3766
3767 html.push_str(
3768 r#"<!DOCTYPE html>
3769<html>
3770<head>
3771<meta charset="UTF-8">
3772<meta name="color-scheme" content="dark light">
3773"#,
3774 );
3775 html.push_str(&format!("<title>{}</title>", title));
3776 html.push_str(r#"
3777:root{
3778 color-scheme: light dark;
3779 --bg:#0a0a0a;
3780 --panel-bg:#0d0d0d;
3781 --text:#b0b0b0;
3782 --text-strong:#fff;
3783 --muted:#666;
3784 --border:#222;
3785 --border-strong:#333;
3786 --hover:#111;
3787 --input-bg:#080808;
3788 --active:#ddd;
3789 /* Eval entity colors (dark) */
3790 --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3791 --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3792 /* Match row borders */
3793 --m-ok:#4a8a4a;
3794 --m-type:#8a8a4a;
3795 --m-bound:#4a8a8a;
3796 --m-fp:#8a4a4a;
3797 --m-fn:#8a4a8a;
3798}
3799@media (prefers-color-scheme: light){
3800 :root{
3801 --bg:#ffffff;
3802 --panel-bg:#f7f7f7;
3803 --text:#222;
3804 --text-strong:#000;
3805 --muted:#555;
3806 --border:#d6d6d6;
3807 --border-strong:#c6c6c6;
3808 --hover:#f0f0f0;
3809 --input-bg:#ffffff;
3810 --active:#000;
3811 --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
3812 --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
3813 --m-ok:#2f8a2f;
3814 --m-type:#8a7a2f;
3815 --m-bound:#2f7a8a;
3816 --m-fp:#8a2f2f;
3817 --m-fn:#6a2f8a;
3818 }
3819}
3820html[data-theme='dark']{
3821 --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
3822 --muted:#666; --border:#222; --border-strong:#333; --hover:#111; --input-bg:#080808; --active:#ddd;
3823 --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3824 --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3825 --m-ok:#4a8a4a; --m-type:#8a8a4a; --m-bound:#4a8a8a; --m-fp:#8a4a4a; --m-fn:#8a4a8a;
3826}
3827html[data-theme='light']{
3828 --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
3829 --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0; --input-bg:#ffffff; --active:#000;
3830 --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
3831 --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
3832 --m-ok:#2f8a2f; --m-type:#8a7a2f; --m-bound:#2f7a8a; --m-fp:#8a2f2f; --m-fn:#6a2f8a;
3833}
3834
3835<style>
3836*{box-sizing:border-box;margin:0;padding:0}
3837body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
3838h1,h2{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
3839h1{font-size:14px}h2{font-size:12px}
3840table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
3841th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
3842th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
3843tr:hover{background:var(--hover)}
3844.grid{display:grid;grid-template-columns:1fr 1fr;gap:8px}
3845.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
3846.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
3847.stats{display:flex;gap:24px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
3848.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
3849/* Entities */
3850.e{padding:1px 2px;border-bottom:2px solid}
3851.seg{cursor:pointer}
3852.e-gold{background:var(--gold-bg);border-color:var(--gold-br);color:var(--gold-tx)}
3853.e-pred{background:var(--pred-bg);border-color:var(--pred-br);color:var(--pred-tx)}
3854.e-active{outline:1px solid var(--active);outline-offset:1px}
3855/* Match types */
3856.correct{background:#1a2e1a;border-color:#4a8a4a}
3857.type-err{background:#2e2e1a;border-color:#8a8a4a}
3858.boundary{background:#1a2e2e;border-color:#4a8a8a}
3859.spurious{background:#2e1a1a;border-color:#8a4a4a}
3860.missed{background:#2e1a2e;border-color:#8a4a8a}
3861.match-row.correct{border-left:3px solid var(--m-ok)}
3862.match-row.type-err{border-left:3px solid var(--m-type)}
3863.match-row.boundary{border-left:3px solid var(--m-bound)}
3864.match-row.spurious{border-left:3px solid var(--m-fp)}
3865.match-row.missed{border-left:3px solid var(--m-fn)}
3866.match-row.active{outline:1px solid var(--muted)}
3867.sel{color:var(--muted);margin:6px 0 12px}
3868.metric{font-size:14px;color:var(--muted)}.metric b{color:var(--text-strong)}
3869</style>
3870</head>
3871<body>
3872"#);
3873
3874 html.push_str(&format!(
3876 "<div class=\"panel-h\" style=\"justify-content:space-between\"><h1>{}</h1><span class=\"toggle\" id=\"theme-toggle\" title=\"toggle theme (auto → dark → light)\">theme: auto</span></div>",
3877 title
3878 ));
3879
3880 html.push_str("<div class=\"stats\">");
3882 html.push_str(&format!(
3883 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">gold</div></div>",
3884 cmp.gold.len()
3885 ));
3886 html.push_str(&format!(
3887 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">predicted</div></div>",
3888 cmp.predicted.len()
3889 ));
3890 html.push_str(&format!(
3891 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">correct</div></div>",
3892 cmp.correct_count()
3893 ));
3894 html.push_str(&format!(
3895 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">errors</div></div>",
3896 cmp.error_count()
3897 ));
3898 html.push_str(&format!(
3899 "<div class=\"metric\">P=<b>{:.1}%</b> R=<b>{:.1}%</b> F1=<b>{:.1}%</b></div>",
3900 cmp.precision() * 100.0,
3901 cmp.recall() * 100.0,
3902 cmp.f1() * 100.0
3903 ));
3904 html.push_str("</div>");
3905
3906 html.push_str("<div id=\"selection\" class=\"sel\">click a match row to select spans</div>");
3908
3909 html.push_str("<div class=\"grid\">");
3911
3912 html.push_str("<div class=\"panel\"><h2>gold (ground truth)</h2><div class=\"text-box\">");
3914 let gold_spans: Vec<EvalHtmlSpan> = cmp
3915 .gold
3916 .iter()
3917 .map(|s| {
3918 let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
3919 EvalHtmlSpan {
3920 start,
3921 end,
3922 label: s.label.to_string(),
3923 class: "e-gold",
3924 id: format!("G{}", s.id),
3925 }
3926 })
3927 .collect();
3928 html.push_str(&annotate_text_spans(&cmp.text, &gold_spans));
3929 html.push_str("</div></div>");
3930
3931 html.push_str("<div class=\"panel\"><h2>predicted</h2><div class=\"text-box\">");
3933 let pred_spans: Vec<EvalHtmlSpan> = cmp
3934 .predicted
3935 .iter()
3936 .map(|s| {
3937 let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
3938 EvalHtmlSpan {
3939 start,
3940 end,
3941 label: s.label.to_string(),
3942 class: "e-pred",
3943 id: format!("P{}", s.id),
3944 }
3945 })
3946 .collect();
3947 html.push_str(&annotate_text_spans(&cmp.text, &pred_spans));
3948 html.push_str("</div></div>");
3949
3950 html.push_str("</div>");
3951
3952 html.push_str("<h2>matches</h2><table>");
3954 html.push_str("<tr><th>type</th><th>gold</th><th>predicted</th><th>notes</th></tr>");
3955
3956 for (mi, m) in cmp.matches.iter().enumerate() {
3957 let (class, mtype, gold_text, pred_text, notes, gid, pid) = match m {
3958 EvalMatch::Correct { gold_id, pred_id } => {
3959 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
3960 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
3961 (
3962 "correct",
3963 "✓",
3964 g.map(|s| format!("[{}] {}", s.label, s.surface()))
3965 .unwrap_or_default(),
3966 p.map(|s| format!("[{}] {}", s.label, s.surface()))
3967 .unwrap_or_default(),
3968 String::new(),
3969 Some(format!("G{}", gold_id)),
3970 Some(format!("P{}", pred_id)),
3971 )
3972 }
3973 EvalMatch::TypeMismatch {
3974 gold_id,
3975 pred_id,
3976 gold_label,
3977 pred_label,
3978 } => {
3979 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
3980 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
3981 (
3982 "type-err",
3983 "type",
3984 g.map(|s| format!("[{}] {}", s.label, s.surface()))
3985 .unwrap_or_default(),
3986 p.map(|s| format!("[{}] {}", s.label, s.surface()))
3987 .unwrap_or_default(),
3988 format!("{} → {}", gold_label, pred_label),
3989 Some(format!("G{}", gold_id)),
3990 Some(format!("P{}", pred_id)),
3991 )
3992 }
3993 EvalMatch::BoundaryError {
3994 gold_id,
3995 pred_id,
3996 iou,
3997 } => {
3998 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
3999 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4000 (
4001 "boundary",
4002 "bound",
4003 g.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4004 .unwrap_or_default(),
4005 p.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4006 .unwrap_or_default(),
4007 format!("IoU={:.2}", iou),
4008 Some(format!("G{}", gold_id)),
4009 Some(format!("P{}", pred_id)),
4010 )
4011 }
4012 EvalMatch::Spurious { pred_id } => {
4013 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4014 (
4015 "spurious",
4016 "FP",
4017 String::new(),
4018 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4019 .unwrap_or_default(),
4020 "false positive".to_string(),
4021 None,
4022 Some(format!("P{}", pred_id)),
4023 )
4024 }
4025 EvalMatch::Missed { gold_id } => {
4026 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4027 (
4028 "missed",
4029 "FN",
4030 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4031 .unwrap_or_default(),
4032 String::new(),
4033 "false negative".to_string(),
4034 Some(format!("G{}", gold_id)),
4035 None,
4036 )
4037 }
4038 };
4039
4040 let mut data_attrs = String::new();
4041 if let Some(gid) = gid.as_deref() {
4042 data_attrs.push_str(&format!(" data-gid=\"{}\"", html_escape(gid)));
4043 }
4044 if let Some(pid) = pid.as_deref() {
4045 data_attrs.push_str(&format!(" data-pid=\"{}\"", html_escape(pid)));
4046 }
4047
4048 html.push_str(&format!(
4049 "<tr id=\"M{mid}\" class=\"match-row {class}\"{attrs}><td><a class=\"match-link\" href=\"#M{mid}\">{mtype}</a></td><td>{gold}</td><td>{pred}</td><td>{notes}</td></tr>",
4050 mid = mi,
4051 class = class,
4052 attrs = data_attrs,
4053 mtype = html_escape(mtype),
4054 gold = html_escape(&gold_text),
4055 pred = html_escape(&pred_text),
4056 notes = html_escape(¬es)
4057 ));
4058 }
4059 html.push_str("</table>");
4060
4061 html.push_str(
4062 r#"<script>
4063(() => {
4064 // Theme toggle: auto (prefers-color-scheme) → dark → light.
4065 const themeBtn = document.getElementById('theme-toggle');
4066 const themeKey = 'anno-theme';
4067 const applyTheme = (theme) => {
4068 const t = theme || 'auto';
4069 if (t === 'auto') {
4070 delete document.documentElement.dataset.theme;
4071 } else {
4072 document.documentElement.dataset.theme = t;
4073 }
4074 if (themeBtn) themeBtn.textContent = `theme: ${t}`;
4075 };
4076 const readTheme = () => {
4077 try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
4078 };
4079 const writeTheme = (t) => {
4080 try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
4081 };
4082 applyTheme(readTheme());
4083 if (themeBtn) {
4084 themeBtn.addEventListener('click', () => {
4085 const cur = readTheme();
4086 const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
4087 writeTheme(next);
4088 applyTheme(next);
4089 });
4090 }
4091
4092 function clearActive() {
4093 document.querySelectorAll(".e-active").forEach((el) => el.classList.remove("e-active"));
4094 document.querySelectorAll("tr.match-row.active").forEach((el) => el.classList.remove("active"));
4095 }
4096
4097 function findSpanEls(eid) {
4098 if (!eid) return [];
4099 // New segmented renderer: one span can be split across multiple elements.
4100 const els = Array.from(document.querySelectorAll(`span.e[data-eids~='${eid}']`));
4101 if (els.length) return els;
4102 // Back-compat: older HTML used a single element id.
4103 const single = document.getElementById(eid);
4104 return single ? [single] : [];
4105 }
4106
4107 function activate(gid, pid, row) {
4108 clearActive();
4109 const gEls = findSpanEls(gid);
4110 const pEls = findSpanEls(pid);
4111 const sel = document.getElementById("selection");
4112 gEls.forEach((el) => el.classList.add("e-active"));
4113 pEls.forEach((el) => el.classList.add("e-active"));
4114 if (row) row.classList.add("active");
4115 if (sel) {
4116 const parts = [];
4117 if (gEls.length) {
4118 const lbl = gEls[0].dataset && gEls[0].dataset.label ? ` [${gEls[0].dataset.label}]` : "";
4119 parts.push(`gold ${gid}${lbl}`);
4120 }
4121 if (pEls.length) {
4122 const lbl = pEls[0].dataset && pEls[0].dataset.label ? ` [${pEls[0].dataset.label}]` : "";
4123 parts.push(`pred ${pid}${lbl}`);
4124 }
4125 sel.textContent = parts.length ? parts.join(" | ") : "no selection";
4126 }
4127 if (row && row.id) {
4128 // Keep deep links stable without triggering navigation jump.
4129 // NOTE: single quotes avoid the Rust raw-string delimiter issue with quote+hash.
4130 history.replaceState(null, "", '#' + row.id);
4131 }
4132 const target = gEls[0] || pEls[0];
4133 if (target) target.scrollIntoView({ behavior: "smooth", block: "center" });
4134 }
4135
4136 document.querySelectorAll("tr.match-row[data-gid], tr.match-row[data-pid]").forEach((tr) => {
4137 tr.addEventListener("click", () => activate(tr.dataset.gid, tr.dataset.pid, tr));
4138 });
4139
4140 document.querySelectorAll("a.match-link").forEach((a) => {
4141 a.addEventListener("click", (ev) => {
4142 ev.preventDefault();
4143 const tr = a.closest("tr.match-row");
4144 if (!tr) return;
4145 activate(tr.dataset.gid, tr.dataset.pid, tr);
4146 });
4147 });
4148
4149 // Auto-select a match row if the URL has a deep link (e.g. #M12).
4150 const hash = (location.hash || "").slice(1);
4151 if (hash && hash.startsWith("M")) {
4152 const tr = document.getElementById(hash);
4153 if (tr && tr.classList && tr.classList.contains("match-row")) {
4154 activate(tr.dataset.gid, tr.dataset.pid, tr);
4155 }
4156 }
4157})();
4158</script>"#,
4159 );
4160
4161 html.push_str("</body></html>");
4162 html
4163}
4164
/// One labelled span to be highlighted by `annotate_text_spans`.
#[derive(Debug, Clone)]
struct EvalHtmlSpan {
    /// Inclusive start offset, counted in characters (not bytes); the renderer
    /// clamps it to the character length of the text.
    start: usize,
    /// Exclusive end offset, counted in characters; clamped like `start`.
    end: usize,
    /// Label emitted (HTML-escaped) in the `data-label` attribute.
    label: String,
    /// CSS class appended after `e seg` on the generated `<span>` element.
    class: &'static str,
    /// Identifier listed in the `data-eids` attribute of every segment the span covers.
    id: String,
}
4174
4175fn annotate_text_spans(text: &str, spans: &[EvalHtmlSpan]) -> String {
4176 let char_count = text.chars().count();
4177 if char_count == 0 || spans.is_empty() {
4178 return html_escape(text);
4179 }
4180
4181 #[derive(Debug, Clone)]
4182 struct Meta {
4183 id: String,
4184 label: String,
4185 class: &'static str,
4186 len: usize,
4187 }
4188 #[derive(Debug, Clone)]
4189 struct Event {
4190 pos: usize,
4191 meta_idx: usize,
4192 delta: i32,
4193 }
4194
4195 let mut metas: Vec<Meta> = Vec::with_capacity(spans.len());
4196 let mut events: Vec<Event> = Vec::new();
4197 let mut boundaries: Vec<usize> = vec![0, char_count];
4198
4199 for s in spans {
4200 let start = s.start.min(char_count);
4201 let end = s.end.min(char_count);
4202 if start >= end {
4203 continue;
4204 }
4205 let meta_idx = metas.len();
4206 metas.push(Meta {
4207 id: s.id.clone(),
4208 label: s.label.to_string(),
4209 class: s.class,
4210 len: end - start,
4211 });
4212 boundaries.push(start);
4213 boundaries.push(end);
4214 events.push(Event {
4215 pos: start,
4216 meta_idx,
4217 delta: 1,
4218 });
4219 events.push(Event {
4220 pos: end,
4221 meta_idx,
4222 delta: -1,
4223 });
4224 }
4225
4226 if metas.is_empty() {
4227 return html_escape(text);
4228 }
4229
4230 boundaries.sort_unstable();
4231 boundaries.dedup();
4232 events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
4233
4234 let mut active_counts: Vec<u32> = vec![0; metas.len()];
4235 let mut active: Vec<usize> = Vec::new();
4236 let mut ev_idx = 0usize;
4237 let mut result = String::new();
4238
4239 for bi in 0..boundaries.len().saturating_sub(1) {
4240 let pos = boundaries[bi];
4241 while ev_idx < events.len() && events[ev_idx].pos == pos {
4242 let e = &events[ev_idx];
4243 let idx = e.meta_idx;
4244 if e.delta < 0 {
4245 if active_counts[idx] > 0 {
4246 active_counts[idx] -= 1;
4247 if active_counts[idx] == 0 {
4248 active.retain(|&x| x != idx);
4249 }
4250 }
4251 } else {
4252 active_counts[idx] += 1;
4253 if active_counts[idx] == 1 {
4254 active.push(idx);
4255 }
4256 }
4257 ev_idx += 1;
4258 }
4259
4260 let next = boundaries[bi + 1];
4261 if next <= pos {
4262 continue;
4263 }
4264
4265 let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
4266 if active.is_empty() {
4267 result.push_str(&html_escape(&seg_text));
4268 continue;
4269 }
4270
4271 let primary_idx = active
4272 .iter()
4273 .copied()
4274 .min_by_key(|i| metas[*i].len)
4275 .unwrap_or(active[0]);
4276 let primary = &metas[primary_idx];
4277 let mut eids: Vec<&str> = active.iter().map(|i| metas[*i].id.as_str()).collect();
4278 eids.sort_unstable();
4279 let data_eids = eids.join(" ");
4280
4281 let title = format!(
4282 "eids=[{}] primary={} [{}..{})",
4283 data_eids, primary.id, pos, next
4284 );
4285 result.push_str(&format!(
4286 "<span class=\"e seg {class}\" data-eids=\"{eids}\" data-label=\"{label}\" data-start=\"{start}\" data-end=\"{end}\" title=\"{title}\">{text}</span>",
4287 class = primary.class,
4288 eids = html_escape(&data_eids),
4289 label = html_escape(&primary.label),
4290 start = pos,
4291 end = next,
4292 title = html_escape(&title),
4293 text = html_escape(&seg_text)
4294 ));
4295 }
4296
4297 result
4298}
4299
/// Options for a processing run.
///
/// NOTE(review): the code that consumes these options is not visible in this part
/// of the file; the field notes below are inferred from names and types — confirm.
#[derive(Debug, Clone, Default)]
pub struct ProcessOptions {
    /// Labels for the run — presumably the entity labels to extract; verify against the caller.
    pub labels: Vec<String>,
    /// Confidence threshold — presumably the minimum confidence a signal must reach; verify.
    pub threshold: Confidence,
}
4312
/// Outcome of a processing run: a grounded document together with validation state.
#[derive(Debug)]
pub struct ProcessResult {
    /// The document carried by this result; `to_html` renders it.
    pub document: GroundedDocument,
    /// Validation flag — presumably `true` when `errors` is empty; TODO confirm
    /// against the code that builds this struct.
    pub valid: bool,
    /// Signal validation errors collected for the document.
    pub errors: Vec<SignalValidationError>,
}
4323
4324impl ProcessResult {
4325 #[must_use]
4327 pub fn to_html(&self) -> String {
4328 render_document_html(&self.document)
4329 }
4330}
4331
/// A collection of grounded documents plus the identities shared across them.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Documents keyed by their own `id` (see `add_document`).
    documents: std::collections::HashMap<String, GroundedDocument>,
    /// Corpus-level identities keyed by the id assigned when they were added.
    identities: std::collections::HashMap<IdentityId, Identity>,
    /// Id handed to the next identity created in this corpus; starts at
    /// `IdentityId::ZERO` and is incremented after every allocation.
    next_identity_id: IdentityId,
}
4346
4347impl Corpus {
4348 #[must_use]
4350 pub fn new() -> Self {
4351 Self {
4352 documents: std::collections::HashMap::new(),
4353 identities: std::collections::HashMap::new(),
4354 next_identity_id: IdentityId::ZERO,
4355 }
4356 }
4357
4358 #[must_use]
4360 pub fn identities(&self) -> &std::collections::HashMap<IdentityId, Identity> {
4361 &self.identities
4362 }
4363
4364 #[must_use]
4366 pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
4367 self.identities.get(&id)
4368 }
4369
4370 pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
4375 let id = self.next_identity_id;
4376 identity.id = id;
4377 self.identities.insert(id, identity);
4378 self.next_identity_id += 1;
4379 id
4380 }
4381
4382 #[must_use]
4386 pub fn next_identity_id(&self) -> IdentityId {
4387 self.next_identity_id
4388 }
4389
4390 pub fn documents(&self) -> impl Iterator<Item = &GroundedDocument> {
4394 self.documents.values()
4395 }
4396
4397 #[must_use]
4401 pub fn get_document(&self, doc_id: &str) -> Option<&GroundedDocument> {
4402 self.documents.get(doc_id)
4403 }
4404
4405 pub fn get_document_mut(&mut self, doc_id: &str) -> Option<&mut GroundedDocument> {
4409 self.documents.get_mut(doc_id)
4410 }
4411
4412 pub fn add_document(&mut self, document: GroundedDocument) -> String {
4417 let doc_id = document.id.clone();
4418 self.documents.insert(doc_id.clone(), document);
4419 doc_id
4420 }
4421
4422 pub fn link_track_to_kb(
4444 &mut self,
4445 track_ref: &TrackRef,
4446 kb_name: impl Into<String>,
4447 kb_id: impl Into<String>,
4448 canonical_name: impl Into<String>,
4449 ) -> super::Result<IdentityId> {
4450 use super::error::Error;
4451
4452 let doc = self.documents.get_mut(&track_ref.doc_id).ok_or_else(|| {
4453 Error::track_ref(format!(
4454 "Document '{}' not found in corpus",
4455 track_ref.doc_id
4456 ))
4457 })?;
4458 let track = doc.get_track(track_ref.track_id).ok_or_else(|| {
4459 Error::track_ref(format!(
4460 "Track {} not found in document '{}'",
4461 track_ref.track_id, track_ref.doc_id
4462 ))
4463 })?;
4464
4465 let kb_name_str = kb_name.into();
4466 let kb_id_str = kb_id.into();
4467 let canonical_name_str = canonical_name.into();
4468
4469 let identity_id = if let Some(existing_id) = track.identity_id {
4471 if let Some(identity) = self.identities.get_mut(&existing_id) {
4473 identity.kb_id = Some(kb_id_str.clone());
4474 identity.kb_name = Some(kb_name_str.clone());
4475 identity.canonical_name = canonical_name_str.clone();
4476
4477 identity.source = Some(match identity.source.take() {
4479 Some(IdentitySource::CrossDocCoref { track_refs }) => IdentitySource::Hybrid {
4480 track_refs,
4481 kb_name: kb_name_str.clone(),
4482 kb_id: kb_id_str.clone(),
4483 },
4484 _ => IdentitySource::KnowledgeBase {
4485 kb_name: kb_name_str.clone(),
4486 kb_id: kb_id_str.clone(),
4487 },
4488 });
4489
4490 existing_id
4491 } else {
4492 let new_id = self.next_identity_id;
4500 self.next_identity_id += 1;
4501
4502 let identity = Identity {
4503 id: new_id,
4504 canonical_name: canonical_name_str,
4505 entity_type: track.entity_type.clone(),
4506 kb_id: Some(kb_id_str.clone()),
4507 kb_name: Some(kb_name_str.clone()),
4508 description: None,
4509 embedding: track.embedding.clone(),
4510 aliases: Vec::new(),
4511 confidence: track.cluster_confidence,
4512 source: Some(IdentitySource::KnowledgeBase {
4513 kb_name: kb_name_str,
4514 kb_id: kb_id_str,
4515 }),
4516 };
4517
4518 self.identities.insert(new_id, identity);
4519 doc.link_track_to_identity(track_ref.track_id, new_id);
4522 new_id
4523 }
4524 } else {
4525 let new_id = self.next_identity_id;
4527 self.next_identity_id += 1;
4528
4529 let identity = Identity {
4530 id: new_id,
4531 canonical_name: canonical_name_str,
4532 entity_type: track.entity_type.clone(),
4533 kb_id: Some(kb_id_str.clone()),
4534 kb_name: Some(kb_name_str.clone()),
4535 description: None,
4536 embedding: track.embedding.clone(),
4537 aliases: Vec::new(),
4538 confidence: track.cluster_confidence,
4539 source: Some(IdentitySource::KnowledgeBase {
4540 kb_name: kb_name_str,
4541 kb_id: kb_id_str,
4542 }),
4543 };
4544
4545 self.identities.insert(new_id, identity);
4546 doc.link_track_to_identity(track_ref.track_id, new_id);
4547 new_id
4548 };
4549
4550 Ok(identity_id)
4551 }
4552}
4553
4554impl Default for Corpus {
4555 fn default() -> Self {
4556 Self::new()
4557 }
4558}
4559
4560#[cfg(test)]
4561mod tests {
4562 #![allow(clippy::unwrap_used)] use super::*;
4564 use crate::EntityCategory;
4565
4566 #[test]
4567 fn test_render_eval_html_has_interactive_hooks_and_is_unicode_safe() {
4568 let text = "習近平在北京會見了普京。";
4570
4571 let gold: Vec<Signal<Location>> = vec![
4572 Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 1.0),
4573 Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "LOC", 1.0),
4574 ];
4575
4576 let predicted: Vec<Signal<Location>> = vec![
4578 Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 0.9),
4579 Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "PER", 0.7),
4580 ];
4581
4582 let cmp = EvalComparison::compare(text, gold, predicted);
4583 let html = render_eval_html_with_title(&cmp, "test");
4584
4585 assert!(html.contains("id=\"selection\""));
4587
4588 assert!(html.contains("data-eids=\"G0\""));
4590 assert!(html.contains("data-eids=\"P0\""));
4591
4592 assert!(html.contains("class=\"match-link\""));
4594 assert!(html.contains("href=\"#M0\""));
4595 assert!(html.contains("data-gid=\"G0\""));
4596 assert!(html.contains("data-pid=\"P0\""));
4597
4598 assert!(html.contains("北京"));
4600 }
4601
4602 fn find_char_span(text: &str, needle: &str) -> Option<(usize, usize)> {
4603 let hay: Vec<char> = text.chars().collect();
4604 let pat: Vec<char> = needle.chars().collect();
4605 if pat.is_empty() || hay.len() < pat.len() {
4606 return None;
4607 }
4608 for i in 0..=(hay.len() - pat.len()) {
4609 if hay[i..(i + pat.len())] == pat[..] {
4610 return Some((i, i + pat.len()));
4611 }
4612 }
4613 None
4614 }
4615
4616 #[test]
4617 fn test_annotate_text_html_supports_overlaps_discontinuous_and_unicode() {
4618 let text = "Marie Curie met Cher in Paris. 習近平在北京會見了普京。 \
4620التقى محمد بن سلمان في الرياض. Путин встретился с Си Цзиньпином в Москве. \
4621प्रधान मंत्री शर्मा दिल्ली में मिले। severe pain ... in abdomen.";
4622
4623 let (m0s, m0e) = find_char_span(text, "Marie Curie").unwrap();
4625 let (m1s, m1e) = find_char_span(text, "Curie").unwrap();
4626
4627 let pain = find_char_span(text, "pain").unwrap();
4629 let abdomen = find_char_span(text, "abdomen").unwrap();
4630
4631 let signals: Vec<Signal<Location>> = vec![
4632 Signal::new(
4633 SignalId::new(0),
4634 Location::text(m0s, m0e),
4635 "Marie Curie",
4636 "PER",
4637 0.9,
4638 ),
4639 Signal::new(
4640 SignalId::new(1),
4641 Location::text(m1s, m1e),
4642 "Curie",
4643 "PER",
4644 0.8,
4645 ),
4646 Signal::new(
4647 SignalId::new(2),
4648 Location::Discontinuous {
4649 segments: vec![pain, abdomen],
4650 },
4651 "pain … abdomen",
4652 "SYMPTOM",
4653 0.7,
4654 ),
4655 ];
4656
4657 let html = annotate_text_html(text, &signals, &std::collections::HashMap::new());
4658
4659 assert!(html.contains("data-sids=\"S0 S1\"") || html.contains("data-sids=\"S1 S0\""));
4661
4662 assert!(html.contains("data-sids=\"S2\""));
4664
4665 assert!(html.contains("北京"));
4667 assert!(html.contains("Москве"));
4668 assert!(html.contains("शर्मा"));
4669 assert!(html.contains("محمد"));
4670 }
4671
4672 #[test]
4673 fn test_location_text_iou() {
4674 let l1 = Location::text(0, 10);
4675 let l2 = Location::text(5, 15);
4676 let iou = l1.iou(&l2).unwrap();
4677 assert!((iou - 0.333).abs() < 0.01);
4681 }
4682
4683 #[test]
4684 fn test_signal_creation() {
4685 let signal: Signal<Location> =
4686 Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95);
4687 assert_eq!(signal.surface, "Marie Curie");
4688 assert_eq!(signal.label, "Person".into());
4689 assert!((signal.confidence.value() - 0.95).abs() < 0.001);
4690 assert!(!signal.negated);
4691 }
4692
4693 #[test]
4694 fn test_signal_with_linguistic_features() {
4695 let signal: Signal<Location> =
4696 Signal::new(0, Location::text(0, 10), "not a doctor", "Occupation", 0.8)
4697 .negated()
4698 .with_quantifier(Quantifier::Existential)
4699 .with_modality(Modality::Symbolic);
4700
4701 assert!(signal.negated);
4702 assert_eq!(signal.quantifier, Some(Quantifier::Existential));
4703 assert_eq!(signal.modality, Modality::Symbolic);
4704 }
4705
4706 #[test]
4707 fn test_track_formation() {
4708 let mut track = Track::new(0, "Marie Curie");
4709 track.add_signal(0, 0);
4710 track.add_signal(1, 1);
4711 track.add_signal(2, 2);
4712
4713 assert_eq!(track.len(), 3);
4714 assert!(!track.is_singleton());
4715 assert!(!track.is_empty());
4716 }
4717
4718 #[test]
4719 fn test_identity_creation() {
4720 let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186")
4721 .with_type("Person")
4722 .with_embedding(vec![0.1, 0.2, 0.3]);
4723
4724 assert_eq!(identity.canonical_name, "Marie Curie");
4725 assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4726 assert_eq!(identity.kb_name, Some("wikidata".to_string()));
4727 assert!(identity.embedding.is_some());
4728 }
4729
4730 #[test]
4731 fn test_grounded_document_hierarchy() {
4732 let mut doc = GroundedDocument::new(
4733 "doc1",
4734 "Marie Curie won the Nobel Prize. She was a physicist.",
4735 );
4736
4737 let s1 = doc.add_signal(Signal::new(
4739 0,
4740 Location::text(0, 12),
4741 "Marie Curie",
4742 "Person",
4743 0.95,
4744 ));
4745 let s2 = doc.add_signal(Signal::new(
4746 1,
4747 Location::text(38, 41),
4748 "She",
4749 "Person",
4750 0.88,
4751 ));
4752 let s3 = doc.add_signal(Signal::new(
4753 2,
4754 Location::text(17, 29),
4755 "Nobel Prize",
4756 "Award",
4757 0.92,
4758 ));
4759
4760 let mut track1 = Track::new(0, "Marie Curie");
4762 track1.add_signal(s1, 0);
4763 track1.add_signal(s2, 1);
4764 let track1_id = doc.add_track(track1);
4765
4766 let mut track2 = Track::new(1, "Nobel Prize");
4767 track2.add_signal(s3, 0);
4768 doc.add_track(track2);
4769
4770 let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
4772 let identity_id = doc.add_identity(identity);
4773 doc.link_track_to_identity(track1_id, identity_id);
4774
4775 assert_eq!(doc.signals().len(), 3);
4777 assert_eq!(doc.tracks().count(), 2);
4778 assert_eq!(doc.identities().count(), 1);
4779
4780 let track = doc.track_for_signal(s1).unwrap();
4782 assert_eq!(track.canonical_surface, "Marie Curie");
4783 assert_eq!(track.len(), 2);
4784
4785 let identity = doc.identity_for_track(track1_id).unwrap();
4787 assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4788
4789 let identity = doc.identity_for_signal(s1).unwrap();
4791 assert_eq!(identity.canonical_name, "Marie Curie");
4792 }
4793
4794 #[test]
4795 fn test_modality_variants() {
4796 assert_eq!(Modality::default(), Modality::Symbolic);
4797 assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
4798 }
4799
4800 #[test]
4801 fn test_location_from_span() {
4802 let span = Span::Text { start: 0, end: 10 };
4803 let location = Location::from(&span);
4804 assert_eq!(location.text_offsets(), Some((0, 10)));
4805 }
4806
4807 #[test]
4808 fn test_entity_roundtrip() {
4809 use super::EntityType;
4810
4811 let entities = vec![
4812 Entity::new("Marie Curie", EntityType::Person, 0, 12, 0.95),
4813 Entity::new(
4814 "Nobel Prize",
4815 EntityType::custom("Award", EntityCategory::Creative),
4816 17,
4817 29,
4818 0.92,
4819 ),
4820 ];
4821
4822 let doc =
4823 GroundedDocument::from_entities("doc1", "Marie Curie won the Nobel Prize.", &entities);
4824 let converted = doc.to_entities();
4825
4826 assert_eq!(converted.len(), 2);
4827 assert_eq!(converted[0].text, "Marie Curie");
4828 assert_eq!(converted[1].text, "Nobel Prize");
4829 }
4830
4831 #[test]
4832 fn test_signal_confidence_threshold() {
4833 let signal: Signal<Location> = Signal::new(0, Location::text(0, 10), "test", "Type", 0.75);
4834 assert!(signal.is_confident(Confidence::new(0.5)));
4835 assert!(signal.is_confident(Confidence::new(0.75)));
4836 assert!(!signal.is_confident(Confidence::new(0.8)));
4837 }
4838
4839 #[test]
4840 fn test_document_filtering() {
4841 let mut doc = GroundedDocument::new("doc1", "Test text");
4842
4843 doc.add_signal(Signal::new(0, Location::text(0, 4), "high", "Person", 0.95));
4845 doc.add_signal(Signal::new(1, Location::text(5, 8), "low", "Person", 0.3));
4846 doc.add_signal(Signal::new(
4847 2,
4848 Location::text(9, 12),
4849 "org",
4850 "Organization",
4851 0.8,
4852 ));
4853
4854 let confident = doc.confident_signals(Confidence::new(0.5));
4856 assert_eq!(confident.len(), 2);
4857
4858 let persons = doc.signals_with_label("Person");
4860 assert_eq!(persons.len(), 2);
4861
4862 let orgs = doc.signals_with_label("Organization");
4863 assert_eq!(orgs.len(), 1);
4864 }
4865
4866 #[test]
4867 fn test_untracked_signals() {
4868 let mut doc = GroundedDocument::new("doc1", "Test");
4869
4870 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
4871 let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
4872 let _s3 = doc.add_signal(Signal::new(2, Location::text(9, 12), "c", "T", 0.9));
4873
4874 let mut track = Track::new(0, "a");
4876 track.add_signal(s1, 0);
4877 track.add_signal(s2, 1);
4878 doc.add_track(track);
4879
4880 assert_eq!(doc.untracked_signal_count(), 1);
4882 let untracked = doc.untracked_signals();
4883 assert_eq!(untracked.len(), 1);
4884 assert_eq!(untracked[0].surface, "c");
4885 }
4886
4887 #[test]
4888 fn test_linked_unlinked_tracks() {
4889 let mut doc = GroundedDocument::new("doc1", "Test");
4890
4891 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
4892 let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
4893
4894 let mut track1 = Track::new(0, "a");
4895 track1.add_signal(s1, 0);
4896 let track1_id = doc.add_track(track1);
4897
4898 let mut track2 = Track::new(1, "b");
4899 track2.add_signal(s2, 0);
4900 doc.add_track(track2);
4901
4902 let identity = Identity::new(0, "Entity A");
4904 let identity_id = doc.add_identity(identity);
4905 doc.link_track_to_identity(track1_id, identity_id);
4906
4907 assert_eq!(doc.linked_tracks().count(), 1);
4908 assert_eq!(doc.unlinked_tracks().count(), 1);
4909 }
4910
4911 #[test]
4912 fn test_iou_edge_cases() {
4913 let l1 = Location::text(0, 5);
4915 let l2 = Location::text(10, 15);
4916 assert_eq!(l1.iou(&l2), Some(0.0));
4917
4918 let l3 = Location::text(0, 10);
4920 let l4 = Location::text(0, 10);
4921 assert_eq!(l3.iou(&l4), Some(1.0));
4922
4923 let l5 = Location::text(0, 20);
4925 let l6 = Location::text(5, 15);
4926 let iou = l5.iou(&l6).unwrap();
4927 assert!((iou - 0.5).abs() < 0.001);
4929 }
4930
4931 #[test]
4935 fn test_document_stats() {
4936 let mut doc = GroundedDocument::new("doc1", "Test document with entities.");
4937
4938 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9));
4940 let mut negated = Signal::new(0, Location::text(5, 13), "document", "Type", 0.8);
4941 negated.negated = true;
4942 let s2 = doc.add_signal(negated);
4943 let _s3 = doc.add_signal(Signal::new(
4944 0,
4945 Location::text(19, 27),
4946 "entities",
4947 "Type",
4948 0.7,
4949 ));
4950
4951 let mut track = Track::new(0, "Test");
4953 track.add_signal(s1, 0);
4954 track.add_signal(s2, 1);
4955 doc.add_track(track);
4956
4957 let identity = Identity::new(0, "Test Entity");
4959 let identity_id = doc.add_identity(identity);
4960 doc.link_track_to_identity(0, identity_id);
4961
4962 let stats = doc.stats();
4963
4964 assert_eq!(stats.signal_count, 3);
4965 assert_eq!(stats.track_count, 1);
4966 assert_eq!(stats.identity_count, 1);
4967 assert_eq!(stats.linked_track_count, 1);
4968 assert_eq!(stats.untracked_count, 1); assert_eq!(stats.negated_count, 1);
4970 assert!((stats.avg_confidence - 0.8).abs() < 0.01); assert!((stats.avg_track_size - 2.0).abs() < 0.01);
4972 }
4973
4974 #[test]
4975 fn test_batch_operations() {
4976 let mut doc = GroundedDocument::new("doc1", "Test document.");
4977
4978 let signals = vec![
4980 Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9),
4981 Signal::new(0, Location::text(5, 13), "document", "Type", 0.8),
4982 ];
4983 let ids = doc.add_signals(signals);
4984
4985 assert_eq!(ids.len(), 2);
4986 assert_eq!(doc.signals().len(), 2);
4987
4988 let track_id = doc.create_track_from_signals("Test", &ids);
4990 assert!(track_id.is_some());
4991
4992 let track = doc.get_track(track_id.unwrap()).unwrap();
4993 assert_eq!(track.len(), 2);
4994 assert_eq!(track.canonical_surface, "Test");
4995 }
4996
4997 #[test]
4998 fn test_merge_tracks() {
4999 let mut doc = GroundedDocument::new("doc1", "John Smith works at Acme. He is great.");
5000
5001 let s1 = doc.add_signal(Signal::new(
5003 0,
5004 Location::text(0, 10),
5005 "John Smith",
5006 "Person",
5007 0.9,
5008 ));
5009 let s2 = doc.add_signal(Signal::new(0, Location::text(26, 28), "He", "Person", 0.8));
5010
5011 let mut track1 = Track::new(0, "John Smith");
5013 track1.add_signal(s1, 0);
5014 let track1_id = doc.add_track(track1);
5015
5016 let mut track2 = Track::new(0, "He");
5017 track2.add_signal(s2, 0);
5018 let track2_id = doc.add_track(track2);
5019
5020 assert_eq!(doc.tracks().count(), 2);
5021
5022 let merged_id = doc.merge_tracks(&[track1_id, track2_id]);
5024 assert!(merged_id.is_some());
5025
5026 assert_eq!(doc.tracks().count(), 1);
5028 let merged = doc.get_track(merged_id.unwrap()).unwrap();
5029 assert_eq!(merged.len(), 2);
5030 assert_eq!(merged.canonical_surface, "John Smith"); }
5032
5033 #[test]
5034 fn test_find_overlapping_pairs() {
5035 let mut doc = GroundedDocument::new("doc1", "New York City is great.");
5036
5037 doc.add_signal(Signal::new(
5039 0,
5040 Location::text(0, 13),
5041 "New York City",
5042 "Location",
5043 0.9,
5044 ));
5045 doc.add_signal(Signal::new(
5046 0,
5047 Location::text(0, 8),
5048 "New York",
5049 "Location",
5050 0.85,
5051 ));
5052 doc.add_signal(Signal::new(0, Location::text(17, 22), "great", "Adj", 0.7)); let pairs = doc.find_overlapping_signal_pairs();
5055
5056 assert_eq!(pairs.len(), 1);
5058 }
5059
5060 #[test]
5061 fn test_signals_in_range() {
5062 let mut doc = GroundedDocument::new("doc1", "John went to Paris and Berlin last year.");
5063
5064 doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.9));
5065 doc.add_signal(Signal::new(
5066 0,
5067 Location::text(13, 18),
5068 "Paris",
5069 "Location",
5070 0.9,
5071 ));
5072 doc.add_signal(Signal::new(
5073 0,
5074 Location::text(23, 29),
5075 "Berlin",
5076 "Location",
5077 0.9,
5078 ));
5079 doc.add_signal(Signal::new(
5080 0,
5081 Location::text(30, 39),
5082 "last year",
5083 "Date",
5084 0.8,
5085 ));
5086
5087 let in_range = doc.signals_in_range(10, 30);
5089 assert_eq!(in_range.len(), 2); let surfaces: Vec<_> = in_range.iter().map(|s| &s.surface).collect();
5092 assert!(surfaces.contains(&&"Paris".to_string()));
5093 assert!(surfaces.contains(&&"Berlin".to_string()));
5094 }
5095
5096 #[test]
5097 fn test_quantifier_variants() {
5098 let quantifiers = [
5100 Quantifier::Universal,
5101 Quantifier::Existential,
5102 Quantifier::None,
5103 Quantifier::Definite,
5104 Quantifier::Bare,
5105 Quantifier::Approximate,
5106 Quantifier::MinBound,
5107 Quantifier::MaxBound,
5108 ];
5109
5110 for q in quantifiers {
5111 let signal: Signal<Location> =
5112 Signal::new(0, Location::text(0, 5), "test", "Type", 0.9).with_quantifier(q);
5113
5114 assert_eq!(signal.quantifier, Some(q));
5115 }
5116 }
5117
5118 #[test]
5119 fn test_location_modality_derivation() {
5120 assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
5121 assert_eq!(
5122 Location::Discontinuous {
5123 segments: vec![(0, 5), (10, 15)]
5124 }
5125 .modality(),
5126 Modality::Symbolic
5127 );
5128 }
5129
5130 }
5133
5134#[cfg(test)]
5142mod proptests {
5143 #![allow(clippy::unwrap_used)] use super::*;
5145 use proptest::prelude::*;
5146
5147 fn confidence_strategy() -> impl Strategy<Value = f32> {
5153 0.0f32..=1.0
5154 }
5155
5156 fn label_strategy() -> impl Strategy<Value = String> {
5158 prop_oneof![
5159 Just("Person".to_string()),
5160 Just("Organization".to_string()),
5161 Just("Location".to_string()),
5162 Just("Date".to_string()),
5163 "[A-Z][a-z]{2,10}".prop_map(|s| s),
5164 ]
5165 }
5166
5167 fn surface_strategy() -> impl Strategy<Value = String> {
5169 "[A-Za-z ]{1,50}".prop_map(|s| s.trim().to_string())
5170 }
5171
5172 proptest! {
5177 #[test]
5179 fn iou_symmetric(
5180 start1 in 0usize..1000,
5181 len1 in 1usize..500,
5182 start2 in 0usize..1000,
5183 len2 in 1usize..500,
5184 ) {
5185 let a = Location::text(start1, start1 + len1);
5186 let b = Location::text(start2, start2 + len2);
5187
5188 let iou_ab = a.iou(&b);
5189 let iou_ba = b.iou(&a);
5190
5191 prop_assert_eq!(iou_ab, iou_ba, "IoU must be symmetric");
5192 }
5193
5194 #[test]
5196 fn iou_bounded(
5197 start1 in 0usize..1000,
5198 len1 in 1usize..500,
5199 start2 in 0usize..1000,
5200 len2 in 1usize..500,
5201 ) {
5202 let a = Location::text(start1, start1 + len1);
5203 let b = Location::text(start2, start2 + len2);
5204
5205 if let Some(iou) = a.iou(&b) {
5206 prop_assert!(iou >= 0.0, "IoU must be non-negative: got {}", iou);
5207 prop_assert!(iou <= 1.0, "IoU must be at most 1: got {}", iou);
5208 }
5209 }
5210
5211 #[test]
5213 fn iou_self_identity(start in 0usize..1000, len in 1usize..500) {
5214 let loc = Location::text(start, start + len);
5215 let iou = loc.iou(&loc).unwrap();
5216 prop_assert!(
5217 (iou - 1.0).abs() < 1e-6,
5218 "Self-IoU must be 1.0, got {}",
5219 iou
5220 );
5221 }
5222
5223 #[test]
5225 fn iou_non_overlapping_zero(
5226 start1 in 0usize..500,
5227 len1 in 1usize..100,
5228 ) {
5229 let end1 = start1 + len1;
5230 let start2 = end1 + 100; let len2 = 50;
5232
5233 let a = Location::text(start1, end1);
5234 let b = Location::text(start2, start2 + len2);
5235
5236 let iou = a.iou(&b).expect("bbox iou should be defined");
5237 prop_assert!(
5238 iou.abs() < 1e-6,
5239 "Non-overlapping IoU must be 0, got {}",
5240 iou
5241 );
5242 }
5243
5244
5245 }
5246
5247 proptest! {
5252 #[test]
5254 fn signal_confidence_clamped(raw_conf in -10.0f32..10.0) {
5255 let signal: Signal<Location> = Signal::new(
5256 0,
5257 Location::text(0, 10),
5258 "test",
5259 "Type",
5260 raw_conf,
5261 );
5262
5263 prop_assert!(signal.confidence.value() >= 0.0, "Confidence below 0: {}", signal.confidence);
5264 prop_assert!(signal.confidence.value() <= 1.0, "Confidence above 1: {}", signal.confidence);
5265 }
5266
5267 #[test]
5269 fn signal_preserves_data(
5270 surface in surface_strategy(),
5271 label in label_strategy(),
5272 conf in confidence_strategy(),
5273 start in 0usize..1000,
5274 len in 1usize..100,
5275 ) {
5276 let signal: Signal<Location> = Signal::new(
5277 0,
5278 Location::text(start, start + len),
5279 &surface,
5280 label.as_str(),
5281 conf,
5282 );
5283
5284 prop_assert_eq!(&signal.surface, &surface);
5285 let want = crate::TypeLabel::from(label.as_str());
5286 prop_assert_eq!(signal.label, want);
5287 }
5288
5289 #[test]
5293 fn signal_negation_stable(conf in confidence_strategy()) {
5294 let signal: Signal<Location> = Signal::new(
5295 0,
5296 Location::text(0, 10),
5297 "test",
5298 "Type",
5299 conf,
5300 )
5301 .negated();
5302
5303 prop_assert!(signal.negated, "Signal should be negated after .negated()");
5304 }
5305
5306 #[test]
5308 fn text_location_is_symbolic(
5309 start in 0usize..1000,
5310 len in 1usize..100,
5311 ) {
5312 let loc = Location::text(start, start + len);
5313 prop_assert_eq!(
5314 loc.modality(),
5315 Modality::Symbolic,
5316 "Text locations must be Symbolic"
5317 );
5318 }
5319 }
5320
5321 proptest! {
5326 #[test]
5328 fn track_length_monotonic(signal_count in 1usize..20) {
5329 let mut track = Track::new(0, "test");
5330
5331 for i in 0..signal_count {
5332 track.add_signal(i, i as u32);
5333 prop_assert_eq!(
5334 track.len(),
5335 i + 1,
5336 "Track length should be {} after adding {} signals",
5337 i + 1,
5338 i + 1
5339 );
5340 }
5341 }
5342
5343 #[test]
5345 fn track_not_empty_after_add(canonical in surface_strategy()) {
5346 let mut track = Track::new(0, &canonical);
5347 prop_assert!(track.is_empty(), "New track should be empty");
5348
5349 track.add_signal(0, 0);
5350 prop_assert!(!track.is_empty(), "Track should not be empty after add");
5351 }
5352
5353 #[test]
5355 fn track_positions_stored(signal_count in 1usize..10) {
5356 let mut track = Track::new(0, "test");
5357
5358 for i in 0..signal_count {
5359 track.add_signal(i, i as u32);
5360 }
5361
5362 for (idx, signal_ref) in track.signals.iter().enumerate() {
5363 prop_assert_eq!(
5364 signal_ref.position as usize,
5365 idx,
5366 "Signal position mismatch at index {}",
5367 idx
5368 );
5369 }
5370 }
5371 }
5372
5373 proptest! {
5378 #[test]
5380 fn document_signal_ids_monotonic(signal_count in 1usize..20) {
5381 let mut doc = GroundedDocument::new("test", "test text");
5382
5383 let mut prev_id: Option<SignalId> = None;
5384 for i in 0..signal_count {
5385 let id = doc.add_signal(Signal::new(
5386 999, Location::text(i * 10, i * 10 + 5),
5388 format!("entity_{}", i),
5389 "Type",
5390 0.9,
5391 ));
5392
5393 if let Some(prev) = prev_id {
5394 prop_assert!(id > prev, "Signal IDs should be monotonically increasing");
5395 }
5396 prev_id = Some(id);
5397 }
5398 }
5399
5400 #[test]
5402 fn document_track_membership_consistent(signal_count in 1usize..5) {
5403 let mut doc = GroundedDocument::new("test", "test text");
5404
5405 let mut signal_ids = Vec::new();
5407 for i in 0..signal_count {
5408 let id = doc.add_signal(Signal::new(
5409 0,
5410 Location::text(i * 10, i * 10 + 5),
5411 format!("entity_{}", i),
5412 "Type",
5413 0.9,
5414 ));
5415 signal_ids.push(id);
5416 }
5417
5418 let mut track = Track::new(0, "canonical");
5420 for (pos, &id) in signal_ids.iter().enumerate() {
5421 track.add_signal(id, pos as u32);
5422 }
5423 let track_id = doc.add_track(track);
5424
5425 for &signal_id in &signal_ids {
5427 let found_track = doc.track_for_signal(signal_id);
5428 prop_assert!(found_track.is_some(), "Signal should be in a track");
5429 prop_assert_eq!(
5430 found_track.unwrap().id,
5431 track_id,
5432 "Signal should be in the correct track"
5433 );
5434 }
5435 }
5436
5437 #[test]
5439 fn document_identity_transitivity(signal_count in 1usize..3) {
5440 let mut doc = GroundedDocument::new("test", "test text");
5441
5442 let mut signal_ids = Vec::new();
5444 for i in 0..signal_count {
5445 let id = doc.add_signal(Signal::new(
5446 0,
5447 Location::text(i * 10, i * 10 + 5),
5448 format!("entity_{}", i),
5449 "Type",
5450 0.9,
5451 ));
5452 signal_ids.push(id);
5453 }
5454
5455 let mut track = Track::new(0, "canonical");
5457 for (pos, &id) in signal_ids.iter().enumerate() {
5458 track.add_signal(id, pos as u32);
5459 }
5460 let track_id = doc.add_track(track);
5461
5462 let identity = Identity::from_kb(0, "Entity", "wikidata", "Q123");
5463 let identity_id = doc.add_identity(identity);
5464 doc.link_track_to_identity(track_id, identity_id);
5465
5466 for &signal_id in &signal_ids {
5468 let identity = doc.identity_for_signal(signal_id);
5469 prop_assert!(identity.is_some(), "Should find identity through signal");
5470 prop_assert_eq!(
5471 identity.unwrap().id,
5472 identity_id,
5473 "Should find correct identity"
5474 );
5475 }
5476 }
5477
5478 #[test]
5480 fn document_untracked_signals(total in 2usize..10, tracked in 0usize..10) {
5481 let tracked = tracked.min(total - 1); let mut doc = GroundedDocument::new("test", "test text");
5483
5484 let mut signal_ids = Vec::new();
5486 for i in 0..total {
5487 let id = doc.add_signal(Signal::new(
5488 0,
5489 Location::text(i * 10, i * 10 + 5),
5490 format!("entity_{}", i),
5491 "Type",
5492 0.9,
5493 ));
5494 signal_ids.push(id);
5495 }
5496
5497 let mut track = Track::new(0, "canonical");
5499 for (pos, &id) in signal_ids.iter().take(tracked).enumerate() {
5500 track.add_signal(id, pos as u32);
5501 }
5502 if tracked > 0 {
5503 doc.add_track(track);
5504 }
5505
5506 prop_assert_eq!(
5508 doc.untracked_signal_count(),
5509 total - tracked,
5510 "Wrong untracked count"
5511 );
5512 }
5513 }
5514
5515 proptest! {
5520 #[test]
5522 fn entity_roundtrip_preserves_text(
5523 text in surface_strategy(),
5524 start in 0usize..1000,
5525 len in 1usize..100,
5526 conf in 0.0f64..=1.0,
5527 ) {
5528 use super::EntityType;
5529
5530 let end = start + len;
5531 let entity = super::Entity::new(&text, EntityType::Person, start, end, conf);
5532
5533 let doc = GroundedDocument::from_entities("test", "x".repeat(end + 10), &[entity]);
5534 let converted = doc.to_entities();
5535
5536 prop_assert_eq!(converted.len(), 1, "Should have exactly one entity");
5537 prop_assert_eq!(&converted[0].text, &text, "Text should be preserved");
5538 prop_assert_eq!(converted[0].start(), start, "Start should be preserved");
5539 prop_assert_eq!(converted[0].end(), end, "End should be preserved");
5540 }
5541
5542 }
5545
5546 proptest! {
5555 #[test]
5557 fn overlap_symmetric(
5558 start1 in 0usize..1000,
5559 len1 in 1usize..100,
5560 start2 in 0usize..1000,
5561 len2 in 1usize..100,
5562 ) {
5563 let a = Location::text(start1, start1 + len1);
5564 let b = Location::text(start2, start2 + len2);
5565
5566 prop_assert_eq!(
5567 a.overlaps(&b),
5568 b.overlaps(&a),
5569 "Overlap must be symmetric"
5570 );
5571 }
5572
5573 #[test]
5575 fn overlap_reflexive(start in 0usize..1000, len in 1usize..100) {
5576 let loc = Location::text(start, start + len);
5577 prop_assert!(loc.overlaps(&loc), "Location must overlap with itself");
5578 }
5579
5580 #[test]
5582 fn iou_implies_overlap(
5583 start1 in 0usize..500,
5584 len1 in 1usize..100,
5585 start2 in 0usize..500,
5586 len2 in 1usize..100,
5587 ) {
5588 let a = Location::text(start1, start1 + len1);
5589 let b = Location::text(start2, start2 + len2);
5590
5591 if let Some(iou) = a.iou(&b) {
5592 if iou > 0.0 {
5593 prop_assert!(
5594 a.overlaps(&b),
5595 "IoU > 0 should imply overlap"
5596 );
5597 }
5598 }
5599 }
5600 }
5601
5602 proptest! {
5607 #[test]
5609 fn stats_signal_count_accurate(signal_count in 0usize..20) {
5610 let mut doc = GroundedDocument::new("test", "test");
5611 for i in 0..signal_count {
5612 doc.add_signal(Signal::new(
5613 0,
5614 Location::text(i * 10, i * 10 + 5),
5615 "entity",
5616 "Type",
5617 0.9,
5618 ));
5619 }
5620
5621 let stats = doc.stats();
5622 prop_assert_eq!(stats.signal_count, signal_count);
5623 }
5624
5625 #[test]
5627 fn stats_track_count_accurate(track_count in 0usize..10) {
5628 let mut doc = GroundedDocument::new("test", "test");
5629 for i in 0..track_count {
5630 let id = doc.add_signal(Signal::new(
5631 0,
5632 Location::text(i * 10, i * 10 + 5),
5633 "entity",
5634 "Type",
5635 0.9,
5636 ));
5637 let mut track = Track::new(0, format!("track_{}", i));
5638 track.add_signal(id, 0);
5639 doc.add_track(track);
5640 }
5641
5642 let stats = doc.stats();
5643 prop_assert_eq!(stats.track_count, track_count);
5644 }
5645
5646 #[test]
5648 fn stats_avg_confidence_bounded(
5649 confidences in proptest::collection::vec(0.0f32..=1.0, 1..10)
5650 ) {
5651 let mut doc = GroundedDocument::new("test", "test");
5652 for (i, conf) in confidences.iter().enumerate() {
5653 doc.add_signal(Signal::new(
5654 0,
5655 Location::text(i * 10, i * 10 + 5),
5656 "entity",
5657 "Type",
5658 *conf,
5659 ));
5660 }
5661
5662 let stats = doc.stats();
5663 prop_assert!(stats.avg_confidence.value() >= 0.0);
5664 prop_assert!(stats.avg_confidence.value() <= 1.0);
5665 }
5666 }
5667
5668 proptest! {
5673 #[test]
5675 fn batch_add_returns_all_ids(count in 1usize..10) {
5676 let mut doc = GroundedDocument::new("test", "test");
5677 let signals: Vec<Signal<Location>> = (0..count)
5678 .map(|i| Signal::new(0, Location::text(i * 10, i * 10 + 5), "e", "T", 0.9))
5679 .collect();
5680
5681 let ids = doc.add_signals(signals);
5682 prop_assert_eq!(ids.len(), count);
5683 prop_assert_eq!(doc.signals().len(), count);
5684 }
5685
5686 #[test]
5688 fn create_track_valid(signal_count in 1usize..5) {
5689 let mut doc = GroundedDocument::new("test", "test");
5690 let mut signal_ids = Vec::new();
5691 for i in 0..signal_count {
5692 let id = doc.add_signal(Signal::new(
5693 0,
5694 Location::text(i * 10, i * 10 + 5),
5695 "entity",
5696 "Type",
5697 0.9,
5698 ));
5699 signal_ids.push(id);
5700 }
5701
5702 let track_id = doc.create_track_from_signals("canonical", &signal_ids);
5703 prop_assert!(track_id.is_some());
5704
5705 let track = doc.get_track(track_id.unwrap());
5706 prop_assert!(track.is_some());
5707 prop_assert_eq!(track.unwrap().len(), signal_count);
5708 }
5709
5710 #[test]
5712 fn create_track_empty_returns_none(_dummy in 0..1) {
5713 let mut doc = GroundedDocument::new("test", "test");
5714 let track_id = doc.create_track_from_signals("canonical", &[]);
5715 prop_assert!(track_id.is_none());
5716 }
5717 }
5718
5719 proptest! {
5724 #[test]
5726 fn signals_in_range_within_bounds(
5727 range_start in 0usize..100,
5728 range_len in 10usize..50,
5729 ) {
5730 let range_end = range_start + range_len;
5731 let mut doc = GroundedDocument::new("test", "x".repeat(200));
5732
5733 doc.add_signal(Signal::new(0, Location::text(range_start + 2, range_start + 5), "inside", "T", 0.9));
5735 doc.add_signal(Signal::new(0, Location::text(0, 5), "before", "T", 0.9));
5736 doc.add_signal(Signal::new(0, Location::text(190, 195), "after", "T", 0.9));
5737
5738 let in_range = doc.signals_in_range(range_start, range_end);
5739
5740 for signal in &in_range {
5741 if let Some((start, end)) = signal.location.text_offsets() {
5742 prop_assert!(start >= range_start, "Signal start {} < range start {}", start, range_start);
5743 prop_assert!(end <= range_end, "Signal end {} > range end {}", end, range_end);
5744 }
5745 }
5746 }
5747
5748 #[test]
5750 fn overlapping_signals_symmetric(
5751 start1 in 10usize..50,
5752 len1 in 5usize..20,
5753 start2 in 10usize..50,
5754 len2 in 5usize..20,
5755 ) {
5756 let mut doc = GroundedDocument::new("test", "x".repeat(100));
5757
5758 let loc1 = Location::text(start1, start1 + len1);
5759 let loc2 = Location::text(start2, start2 + len2);
5760
5761 doc.add_signal(Signal::new(0, loc1.clone(), "A", "T", 0.9));
5762 doc.add_signal(Signal::new(0, loc2.clone(), "B", "T", 0.9));
5763
5764 let overlaps_loc1 = doc.overlapping_signals(&loc1);
5765 let overlaps_loc2 = doc.overlapping_signals(&loc2);
5766
5767 if loc1.overlaps(&loc2) {
5769 prop_assert!(overlaps_loc1.len() >= 2, "Should find both when overlapping");
5770 prop_assert!(overlaps_loc2.len() >= 2, "Should find both when overlapping");
5771 }
5772 }
5773 }
5774
5775 proptest! {
5780 #[test]
5782 fn modality_counts_sum_to_total(
5783 symbolic_count in 0usize..5,
5784 iconic_count in 0usize..5,
5785 ) {
5786 let mut doc = GroundedDocument::new("test", "test");
5787
5788 for i in 0..symbolic_count {
5790 let mut signal = Signal::new(
5791 0,
5792 Location::text(i * 10, i * 10 + 5),
5793 "entity",
5794 "Type",
5795 0.9,
5796 );
5797 signal.modality = Modality::Symbolic;
5798 doc.add_signal(signal);
5799 }
5800
5801 for i in 0..iconic_count {
5803 let mut signal = Signal::new(
5804 0,
5805 Location::text(1000 + i * 10, 1000 + i * 10 + 5),
5806 "entity",
5807 "Type",
5808 0.9,
5809 );
5810 signal.modality = Modality::Iconic;
5811 doc.add_signal(signal);
5812 }
5813
5814 let stats = doc.stats();
5815 prop_assert_eq!(
5816 stats.symbolic_count + stats.iconic_count + stats.hybrid_count,
5817 stats.signal_count,
5818 "Modality counts should sum to total"
5819 );
5820 }
5821 }
5822
5823 proptest! {
5828 #[test]
5830 fn from_text_always_valid(
5831 text in "[a-zA-Z ]{20,100}",
5832 surface_start in 0usize..15,
5833 surface_len in 1usize..8,
5834 ) {
5835 let text_char_len = text.chars().count();
5836 let surface_end = (surface_start + surface_len).min(text_char_len);
5837 let surface_start = surface_start.min(surface_end.saturating_sub(1));
5838
5839 if surface_start < surface_end && surface_end <= text_char_len {
5840 let surface: String = text.chars()
5841 .skip(surface_start)
5842 .take(surface_end - surface_start)
5843 .collect();
5844
5845 if !surface.is_empty() {
5846 if let Some(signal) = Signal::<Location>::from_text(&text, &surface, "Test", 0.9) {
5848 prop_assert!(
5850 signal.validate_against(&text).is_none(),
5851 "Signal created via from_text must be valid"
5852 );
5853 }
5854 }
5855 }
5856 }
5857
5858 #[test]
5860 fn validated_add_rejects_invalid(
5861 text in "[a-z]{10,50}",
5862 wrong_surface in "[A-Z]{3,10}",
5863 ) {
5864 let mut doc = GroundedDocument::new("test", &text);
5865
5866 let signal = Signal::new(
5868 0,
5869 Location::text(0, wrong_surface.chars().count().min(text.chars().count())),
5870 wrong_surface.clone(),
5871 "Test",
5872 0.9,
5873 );
5874
5875 let expected: String = text.chars().take(wrong_surface.chars().count()).collect();
5878 if expected != wrong_surface {
5879 let result = doc.add_signal_validated(signal);
5880 prop_assert!(result.is_err(), "Should reject signal with mismatched surface");
5881 }
5882 }
5883
5884 #[test]
5886 fn round_trip_signal_from_text(
5887 prefix in "[a-z]{5,20}",
5888 entity in "[A-Z][a-z]{3,10}",
5889 suffix in "[a-z]{5,20}",
5890 ) {
5891 let text = format!("{} {} {}", prefix, entity, suffix);
5892 let mut doc = GroundedDocument::new("test", &text);
5893
5894 let id = doc.add_signal_from_text(&entity, "Entity", 0.9);
5895 prop_assert!(id.is_some(), "Should find entity in text");
5896
5897 let signal = doc.signals().iter().find(|s| s.id == id.unwrap());
5898 prop_assert!(signal.is_some(), "Should retrieve added signal");
5899
5900 let signal = signal.unwrap();
5901 prop_assert_eq!(signal.surface(), entity.as_str(), "Surface should match");
5902
5903 prop_assert!(
5905 doc.is_valid(),
5906 "Document should be valid after from_text add"
5907 );
5908 }
5909
5910 #[test]
5912 fn nth_occurrence_finds_correct(
5913 entity in "[A-Z][a-z]{2,5}",
5914 sep in " [a-z]+ ",
5915 ) {
5916 let text = format!("{}{}{}{}{}", entity, sep, entity, sep, entity);
5918 let mut doc = GroundedDocument::new("test", &text);
5919
5920 for n in 0..3 {
5922 let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, n);
5923 prop_assert!(id.is_some(), "Should find occurrence {}", n);
5924 }
5925
5926 let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, 3);
5928 prop_assert!(id.is_none(), "Should NOT find 4th occurrence");
5929
5930 prop_assert!(doc.is_valid(), "All signals should be valid");
5932
5933 let offsets: Vec<_> = doc.signals()
5935 .iter()
5936 .filter_map(|s| s.text_offsets())
5937 .collect();
5938 let unique: std::collections::HashSet<_> = offsets.iter().collect();
5939 prop_assert_eq!(offsets.len(), unique.len(), "Each occurrence should have distinct offset");
5940 }
5941 }
5942
/// Checks `compute_stats` for a track holding two "John" mentions with different
/// confidences.
#[test]
fn test_track_stats_basic() {
    let text = "John met Mary. He said hello. John left.";
    let mut doc = GroundedDocument::new("test", text);

    // Both mentions share the surface "John"; only offsets and confidence differ.
    let first_mention = Signal::new(0, Location::text(0, 4), "John", "Person", 0.95);
    let second_mention = Signal::new(0, Location::text(30, 34), "John", "Person", 0.90);
    let s1 = doc.add_signal(first_mention);
    let s2 = doc.add_signal(second_mention);

    let track_id = doc.add_track(Track::new(0, String::from("John")));
    doc.add_signal_to_track(s1, track_id, 0);
    doc.add_signal_to_track(s2, track_id, 1);

    // The text length is passed in characters, not bytes.
    let text_len = text.chars().count();
    let stats = doc.get_track(track_id).unwrap().compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 2, "Two mentions");
    assert_eq!(stats.variation_count, 1, "One unique surface form");
    assert!(stats.spread > 0, "Spread should be positive");
    assert!(stats.relative_spread > 0.0 && stats.relative_spread < 1.0);
    assert!((stats.min_confidence.value() - 0.90).abs() < 0.01);
    assert!((stats.max_confidence.value() - 0.95).abs() < 0.01);
    assert!((stats.mean_confidence.value() - 0.925).abs() < 0.01);
}
5980
/// Checks `compute_stats` for a track that contains a single mention.
#[test]
fn test_track_stats_singleton() {
    let text = "Paris is beautiful.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    let mention = Signal::new(0, Location::text(0, 5), "Paris", "Location", 0.88);
    let s1 = doc.add_signal(mention);
    let track_id = doc.add_track(Track::new(0, String::from("Paris")));
    doc.add_signal_to_track(s1, track_id, 0);

    let stats = doc.get_track(track_id).unwrap().compute_stats(&doc, text_len);

    // With one mention, first/last positions and min/max confidence must coincide.
    assert_eq!(stats.chain_length, 1);
    assert_eq!(stats.spread, 0, "Singleton has zero spread");
    assert_eq!(stats.first_position, stats.last_position);
    assert!((stats.min_confidence.value() - stats.max_confidence.value()).abs() < 0.001);
}
6005}