1use super::confidence::Confidence;
90use super::entity::{
91 DiscontinuousSpan, Entity, EntityType, HierarchicalConfidence, Provenance, Span,
92};
93use serde::{Deserialize, Serialize};
94use std::collections::HashMap;
95
/// Kind of grounding a signal has, as reported by `Location::modality` and
/// queried through the feature-support predicates on this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum Modality {
    /// Reported by `Location::modality` for bounding boxes, cuboids and
    /// temporal ranges; supports geometric features only.
    Iconic,
    /// The default. Reported for text, genomic and discontinuous locations;
    /// supports linguistic features only.
    #[default]
    Symbolic,
    /// Reported for text that also carries a bounding box; supports both
    /// linguistic and geometric features.
    Hybrid,
}
134
135impl Modality {
136 #[must_use]
138 pub const fn supports_linguistic_features(&self) -> bool {
139 matches!(self, Self::Symbolic | Self::Hybrid)
140 }
141
142 #[must_use]
144 pub const fn supports_geometric_features(&self) -> bool {
145 matches!(self, Self::Iconic | Self::Hybrid)
146 }
147}
148
/// Where a signal is grounded in its source.
///
/// Only some variants carry text offsets (see `text_offsets`), and only
/// `Text`, `BoundingBox` and `TextWithBbox` convert to a `Span` (see `to_span`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Location {
    /// A contiguous text range. `Signal::validate_against` interprets the
    /// offsets as character (not byte) positions with an exclusive `end`.
    Text {
        /// Start offset (inclusive).
        start: usize,
        /// End offset (exclusive).
        end: usize,
    },
    /// An axis-aligned rectangle spanning `x..x + width` by `y..y + height`.
    /// NOTE(review): units/normalization are not visible here — confirm.
    BoundingBox {
        /// Horizontal origin of the box.
        x: f32,
        /// Vertical origin of the box.
        y: f32,
        /// Horizontal extent.
        width: f32,
        /// Vertical extent.
        height: f32,
        /// Optional page number; boxes on different pages never overlap.
        page: Option<u32>,
    },
    /// A time range.
    Temporal {
        /// Range start; presumably seconds, per the field name — confirm.
        start_sec: f64,
        /// Range end; presumably seconds, per the field name — confirm.
        end_sec: f64,
        /// Optional frame number.
        frame: Option<u64>,
    },
    /// A 3-D box.
    Cuboid {
        /// Center point (three components).
        center: [f32; 3],
        /// Extent along each of the three axes.
        dimensions: [f32; 3],
        /// Rotation as four components; presumably a quaternion — confirm.
        rotation: [f32; 4],
    },
    /// A range on a named sequence.
    Genomic {
        /// Name of the contig the range lies on.
        contig: String,
        /// Range start.
        start: u64,
        /// Range end.
        end: u64,
        /// Optional strand marker.
        strand: Option<char>,
    },
    /// Several text ranges that together form one mention.
    Discontinuous {
        /// `(start, end)` pairs; `text_offsets` reports the smallest start
        /// and the largest end.
        segments: Vec<(usize, usize)>,
    },
    /// A text range paired with a nested location for its visual position.
    TextWithBbox {
        /// Start offset of the text range.
        start: usize,
        /// End offset of the text range.
        end: usize,
        /// Nested location; `From<&Span>` builds it from the span's `bbox`.
        bbox: Box<Location>,
    },
}
249
250impl Location {
251 #[must_use]
253 pub const fn text(start: usize, end: usize) -> Self {
254 Self::Text { start, end }
255 }
256
257 #[must_use]
259 pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
260 Self::BoundingBox {
261 x,
262 y,
263 width,
264 height,
265 page: None,
266 }
267 }
268
269 #[must_use]
271 pub const fn modality(&self) -> Modality {
272 match self {
273 Self::Text { .. } | Self::Genomic { .. } | Self::Discontinuous { .. } => {
274 Modality::Symbolic
275 }
276 Self::BoundingBox { .. } | Self::Cuboid { .. } => Modality::Iconic,
277 Self::Temporal { .. } => Modality::Iconic, Self::TextWithBbox { .. } => Modality::Hybrid,
279 }
280 }
281
282 #[must_use]
284 pub fn text_offsets(&self) -> Option<(usize, usize)> {
285 match self {
286 Self::Text { start, end } => Some((*start, *end)),
287 Self::TextWithBbox { start, end, .. } => Some((*start, *end)),
288 Self::Discontinuous { segments } => {
289 let start = segments.iter().map(|(s, _)| *s).min()?;
290 let end = segments.iter().map(|(_, e)| *e).max()?;
291 Some((start, end))
292 }
293 _ => None,
294 }
295 }
296
297 #[must_use]
299 pub fn overlaps(&self, other: &Self) -> bool {
300 match (self, other) {
301 (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
302 s1 < e2 && s2 < e1
303 }
304 (
305 Self::BoundingBox {
306 x: x1,
307 y: y1,
308 width: w1,
309 height: h1,
310 page: p1,
311 },
312 Self::BoundingBox {
313 x: x2,
314 y: y2,
315 width: w2,
316 height: h2,
317 page: p2,
318 },
319 ) => {
320 if p1 != p2 {
322 return false;
323 }
324 x1 < &(x2 + w2) && &(x1 + w1) > x2 && y1 < &(y2 + h2) && &(y1 + h1) > y2
326 }
327 _ => false, }
329 }
330
331 #[must_use]
335 pub fn iou(&self, other: &Self) -> Option<f64> {
336 match (self, other) {
337 (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
338 let intersection_start = (*s1).max(*s2);
339 let intersection_end = (*e1).min(*e2);
340 if intersection_start >= intersection_end {
341 return Some(0.0);
342 }
343 let intersection = (intersection_end - intersection_start) as f64;
344 let union = ((*e1).max(*e2) - (*s1).min(*s2)) as f64;
345 if union == 0.0 {
346 Some(0.0)
347 } else {
348 Some(intersection / union)
349 }
350 }
351 (
352 Self::BoundingBox {
353 x: x1,
354 y: y1,
355 width: w1,
356 height: h1,
357 page: p1,
358 },
359 Self::BoundingBox {
360 x: x2,
361 y: y2,
362 width: w2,
363 height: h2,
364 page: p2,
365 },
366 ) => {
367 if p1 != p2 {
368 return Some(0.0);
369 }
370 let x_overlap = (x1 + w1).min(x2 + w2) - x1.max(*x2);
371 let y_overlap = (y1 + h1).min(y2 + h2) - y1.max(*y2);
372 if x_overlap <= 0.0 || y_overlap <= 0.0 {
373 return Some(0.0);
374 }
375 let intersection = (x_overlap * y_overlap) as f64;
376 let area1 = (*w1 * *h1) as f64;
377 let area2 = (*w2 * *h2) as f64;
378 let union = area1 + area2 - intersection;
379 if union == 0.0 {
380 Some(0.0)
381 } else {
382 Some(intersection / union)
383 }
384 }
385 _ => None,
386 }
387 }
388}
389
390impl Default for Location {
391 fn default() -> Self {
392 Self::Text { start: 0, end: 0 }
393 }
394}
395
396impl From<&Span> for Location {
397 fn from(span: &Span) -> Self {
398 match span {
399 Span::Text { start, end } => Self::Text {
400 start: *start,
401 end: *end,
402 },
403 Span::BoundingBox {
404 x,
405 y,
406 width,
407 height,
408 page,
409 } => Self::BoundingBox {
410 x: *x,
411 y: *y,
412 width: *width,
413 height: *height,
414 page: *page,
415 },
416 Span::Hybrid { start, end, bbox } => Self::TextWithBbox {
417 start: *start,
418 end: *end,
419 bbox: Box::new(Location::from(bbox.as_ref())),
420 },
421 }
422 }
423}
424
425impl From<Span> for Location {
426 fn from(span: Span) -> Self {
427 Self::from(&span)
428 }
429}
430
431impl Location {
440 #[must_use]
445 pub fn to_span(&self) -> Option<Span> {
446 match self {
447 Self::Text { start, end } => Some(Span::Text {
448 start: *start,
449 end: *end,
450 }),
451 Self::BoundingBox {
452 x,
453 y,
454 width,
455 height,
456 page,
457 } => Some(Span::BoundingBox {
458 x: *x,
459 y: *y,
460 width: *width,
461 height: *height,
462 page: *page,
463 }),
464 Self::TextWithBbox { start, end, bbox } => {
465 let inner_span = bbox.to_span()?;
466 Some(Span::Hybrid {
467 start: *start,
468 end: *end,
469 bbox: Box::new(inner_span),
470 })
471 }
472 Self::Temporal { .. }
474 | Self::Cuboid { .. }
475 | Self::Genomic { .. }
476 | Self::Discontinuous { .. } => None,
477 }
478 }
479}
480
481pub use super::types::SignalId;
487
/// A single grounded mention: a labelled surface string at a location `L`
/// (by default a [`Location`]).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Signal<L = Location> {
    /// Identifier. `GroundedDocument::add_signal` overwrites it with the
    /// document's next free id.
    pub id: SignalId,
    /// Where the signal is grounded.
    pub location: L,
    /// Surface text; `validate_against` compares it with the source text at
    /// the location's offsets.
    pub surface: String,
    /// Type label of the signal.
    pub label: super::types::TypeLabel,
    /// Confidence score; `Signal::new` clamps it to `0.0..=1.0`.
    pub confidence: f32,
    /// Optional hierarchical confidence breakdown.
    pub hierarchical: Option<HierarchicalConfidence>,
    /// Optional provenance record.
    pub provenance: Option<Provenance>,
    /// Modality of the signal; `Signal::new` uses `Modality::default()`.
    pub modality: Modality,
    /// Optional normalized form of the surface text.
    pub normalized: Option<String>,
    /// Negation flag; `false` after `Signal::new`, set by `negated()`.
    pub negated: bool,
    /// Optional quantifier; set by `with_quantifier()`.
    pub quantifier: Option<Quantifier>,
}
544
/// Quantifier that can be attached to a signal via `Signal::with_quantifier`.
///
/// NOTE(review): the per-variant meanings below are inferred from the variant
/// names only; nothing in this file consumes them — confirm against callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Quantifier {
    /// Presumably universal quantification ("all", "every") — confirm.
    Universal,
    /// Presumably existential quantification ("some", "a") — confirm.
    Existential,
    /// Presumably negative quantification ("no", "none") — confirm.
    /// Not to be confused with `Option::None`.
    None,
    /// Presumably a definite reference ("the") — confirm.
    Definite,
    /// Presumably an approximate quantity — confirm.
    Approximate,
    /// Presumably a lower bound ("at least") — confirm.
    MinBound,
    /// Presumably an upper bound ("at most") — confirm.
    MaxBound,
    /// Presumably a bare, unquantified mention — confirm.
    Bare,
}
569
570impl<L> Signal<L> {
571 #[must_use]
581 pub fn new(
582 id: impl Into<SignalId>,
583 location: L,
584 surface: impl Into<String>,
585 label: impl Into<super::types::TypeLabel>,
586 confidence: f32,
587 ) -> Self {
588 Self {
589 id: id.into(),
590 location,
591 surface: surface.into(),
592 label: label.into(),
593 confidence: confidence.clamp(0.0, 1.0),
594 hierarchical: None,
595 provenance: None,
596 modality: Modality::default(),
597 normalized: None,
598 negated: false,
599 quantifier: None,
600 }
601 }
602
603 #[must_use]
605 pub fn label(&self) -> &str {
606 self.label.as_str()
607 }
608
609 #[must_use]
611 pub fn type_label(&self) -> super::types::TypeLabel {
612 self.label.clone()
613 }
614
615 #[must_use]
617 pub fn surface(&self) -> &str {
618 &self.surface
619 }
620
621 #[must_use]
623 pub fn is_confident(&self, threshold: f32) -> bool {
624 self.confidence >= threshold
625 }
626
627 #[must_use]
629 pub fn with_modality(mut self, modality: Modality) -> Self {
630 self.modality = modality;
631 self
632 }
633
634 #[must_use]
636 pub fn negated(mut self) -> Self {
637 self.negated = true;
638 self
639 }
640
641 #[must_use]
643 pub fn with_quantifier(mut self, q: Quantifier) -> Self {
644 self.quantifier = Some(q);
645 self
646 }
647
648 #[must_use]
650 pub fn with_provenance(mut self, p: Provenance) -> Self {
651 self.provenance = Some(p);
652 self
653 }
654}
655
656impl Signal<Location> {
657 #[must_use]
659 pub fn text_offsets(&self) -> Option<(usize, usize)> {
660 self.location.text_offsets()
661 }
662
663 #[must_use]
680 pub fn validate_against(&self, source_text: &str) -> Option<SignalValidationError> {
681 let (start, end) = self.location.text_offsets()?;
682
683 let char_count = source_text.chars().count();
684
685 if end > char_count {
687 return Some(SignalValidationError::OutOfBounds {
688 signal_id: self.id,
689 end,
690 text_len: char_count,
691 });
692 }
693
694 if start >= end {
695 return Some(SignalValidationError::InvalidSpan {
696 signal_id: self.id,
697 start,
698 end,
699 });
700 }
701
702 let actual: String = source_text.chars().skip(start).take(end - start).collect();
704
705 if actual != self.surface {
706 return Some(SignalValidationError::TextMismatch {
707 signal_id: self.id,
708 expected: self.surface.clone(),
709 actual,
710 start,
711 end,
712 });
713 }
714
715 None
716 }
717
718 #[must_use]
720 pub fn is_valid(&self, source_text: &str) -> bool {
721 self.validate_against(source_text).is_none()
722 }
723
724 #[must_use]
739 pub fn from_text(
740 source: &str,
741 surface: &str,
742 label: impl Into<super::types::TypeLabel>,
743 confidence: f32,
744 ) -> Option<Self> {
745 Self::from_text_nth(source, surface, label, confidence, 0)
746 }
747
748 #[must_use]
750 pub fn from_text_nth(
751 source: &str,
752 surface: &str,
753 label: impl Into<super::types::TypeLabel>,
754 confidence: f32,
755 occurrence: usize,
756 ) -> Option<Self> {
757 for (count, (byte_idx, _)) in source.match_indices(surface).enumerate() {
759 if count == occurrence {
760 let start = source[..byte_idx].chars().count();
762 let end = start + surface.chars().count();
763
764 return Some(Self::new(
765 SignalId::ZERO,
766 Location::text(start, end),
767 surface,
768 label,
769 confidence,
770 ));
771 }
772 }
773
774 None
775 }
776}
777
/// Problems reported by `Signal::validate_against`.
#[derive(Debug, Clone, PartialEq)]
pub enum SignalValidationError {
    /// The signal's end offset lies beyond the end of the text.
    OutOfBounds {
        /// Id of the offending signal.
        signal_id: SignalId,
        /// The signal's end offset.
        end: usize,
        /// Length of the text in characters.
        text_len: usize,
    },
    /// The span is empty or reversed (`start >= end`).
    InvalidSpan {
        /// Id of the offending signal.
        signal_id: SignalId,
        /// The signal's start offset.
        start: usize,
        /// The signal's end offset.
        end: usize,
    },
    /// The text at the signal's offsets differs from its surface string.
    TextMismatch {
        /// Id of the offending signal.
        signal_id: SignalId,
        /// The signal's surface string.
        expected: String,
        /// The text actually found at `start..end`.
        actual: String,
        /// The signal's start offset.
        start: usize,
        /// The signal's end offset.
        end: usize,
    },
}
813
814impl std::fmt::Display for SignalValidationError {
815 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
816 match self {
817 Self::OutOfBounds {
818 signal_id,
819 end,
820 text_len,
821 } => {
822 write!(
823 f,
824 "S{}: end offset {} exceeds text length {}",
825 signal_id, end, text_len
826 )
827 }
828 Self::InvalidSpan {
829 signal_id,
830 start,
831 end,
832 } => {
833 write!(f, "S{}: invalid span [{}, {})", signal_id, start, end)
834 }
835 Self::TextMismatch {
836 signal_id,
837 expected,
838 actual,
839 start,
840 end,
841 } => {
842 write!(
843 f,
844 "S{}: text mismatch at [{}, {}): expected '{}', found '{}'",
845 signal_id, start, end, expected, actual
846 )
847 }
848 }
849 }
850}
851
// Marker impl: the type already derives `Debug` and implements `Display`,
// so the default `Error` methods are used unchanged.
impl std::error::Error for SignalValidationError {}
853
854impl From<&Entity> for Signal<Location> {
861 fn from(e: &Entity) -> Self {
862 let mut signal = Signal::new(
863 SignalId::ZERO,
864 Location::text(e.start, e.end),
865 &e.text,
866 e.entity_type.as_label(),
867 f32::from(e.confidence),
868 );
869 signal.normalized = e.normalized.clone();
870 signal.provenance = e.provenance.clone();
871 signal.hierarchical = e.hierarchical_confidence;
872 signal
873 }
874}
875
876pub use super::types::TrackId;
882
/// A track's reference to one of its signals.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SignalRef {
    /// Id of the referenced signal.
    pub signal_id: SignalId,
    /// Caller-supplied position value stored with the reference
    /// (`from_entities` uses the signal's index within its track).
    pub position: u32,
}
891
/// A reference to a track qualified by the id of the document that owns it
/// (see `GroundedDocument::track_ref`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TrackRef {
    /// Id of the owning document.
    pub doc_id: String,
    /// Id of the track within that document.
    pub track_id: TrackId,
}
904
/// A group of signal references within one document, optionally linked to an
/// identity.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Track {
    /// Identifier. `GroundedDocument::add_track` overwrites it with the
    /// document's next free id.
    pub id: TrackId,
    /// References to the member signals, in insertion order.
    pub signals: Vec<SignalRef>,
    /// Optional type label shared by the track.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Representative surface string for the track.
    pub canonical_surface: String,
    /// Id of the linked identity, if any.
    pub identity_id: Option<IdentityId>,
    /// Clustering confidence; `1.0` after `Track::new`, clamped to
    /// `0.0..=1.0` by `set_cluster_confidence`.
    pub cluster_confidence: f32,
    /// Optional embedding vector.
    pub embedding: Option<Vec<f32>>,
}
944
945impl Track {
946 #[must_use]
948 pub fn new(id: impl Into<TrackId>, canonical_surface: impl Into<String>) -> Self {
949 Self {
950 id: id.into(),
951 signals: Vec::new(),
952 entity_type: None,
953 canonical_surface: canonical_surface.into(),
954 identity_id: None,
955 cluster_confidence: 1.0,
956 embedding: None,
957 }
958 }
959
960 pub fn add_signal(&mut self, signal_id: impl Into<SignalId>, position: u32) {
962 let signal_id = signal_id.into();
963 self.signals.push(SignalRef {
964 signal_id,
965 position,
966 });
967 }
968
969 #[must_use]
971 pub fn len(&self) -> usize {
972 self.signals.len()
973 }
974
975 #[must_use]
977 pub fn is_empty(&self) -> bool {
978 self.signals.is_empty()
979 }
980
981 #[must_use]
983 pub fn is_singleton(&self) -> bool {
984 self.signals.len() == 1
985 }
986
987 #[must_use]
989 pub const fn id(&self) -> TrackId {
990 self.id
991 }
992
993 #[must_use]
995 pub fn signals(&self) -> &[SignalRef] {
996 &self.signals
997 }
998
999 #[must_use]
1001 pub fn canonical_surface(&self) -> &str {
1002 &self.canonical_surface
1003 }
1004
1005 #[must_use]
1007 pub const fn identity_id(&self) -> Option<IdentityId> {
1008 self.identity_id
1009 }
1010
1011 #[must_use]
1013 pub const fn cluster_confidence(&self) -> f32 {
1014 self.cluster_confidence
1015 }
1016
1017 pub fn set_cluster_confidence(&mut self, confidence: f32) {
1019 self.cluster_confidence = confidence.clamp(0.0, 1.0);
1020 }
1021
1022 pub fn set_identity_id(&mut self, identity_id: IdentityId) {
1024 self.identity_id = Some(identity_id);
1025 }
1026
1027 pub fn clear_identity_id(&mut self) {
1029 self.identity_id = None;
1030 }
1031
1032 #[must_use]
1034 pub fn with_identity(mut self, identity_id: IdentityId) -> Self {
1035 self.identity_id = Some(identity_id);
1036 self
1037 }
1038
1039 #[must_use]
1043 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1044 let s = entity_type.into();
1045 self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1046 self
1047 }
1048
1049 #[must_use]
1063 pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1064 self.entity_type = Some(label);
1065 self
1066 }
1067
1068 #[must_use]
1073 pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1074 self.entity_type.clone()
1075 }
1076
1077 #[must_use]
1079 pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1080 self.embedding = Some(embedding);
1081 self
1082 }
1083
1084 pub fn compute_spread(&self, doc: &GroundedDocument) -> Option<usize> {
1088 if self.signals.is_empty() {
1089 return Some(0);
1090 }
1091
1092 let positions: Vec<usize> = self
1093 .signals
1094 .iter()
1095 .filter_map(|sr| {
1096 doc.signals
1097 .iter()
1098 .find(|s| s.id == sr.signal_id)
1099 .and_then(|s| s.location.text_offsets())
1100 .map(|(start, _)| start)
1101 })
1102 .collect();
1103
1104 if positions.is_empty() {
1105 return None;
1106 }
1107
1108 let min_pos = *positions.iter().min().expect("positions non-empty");
1109 let max_pos = *positions.iter().max().expect("positions non-empty");
1110 Some(max_pos.saturating_sub(min_pos))
1111 }
1112
1113 pub fn collect_variations(&self, doc: &GroundedDocument) -> Vec<String> {
1117 let mut variations: std::collections::HashSet<String> = std::collections::HashSet::new();
1118
1119 for sr in &self.signals {
1120 if let Some(signal) = doc.signals.iter().find(|s| s.id == sr.signal_id) {
1121 variations.insert(signal.surface.clone());
1122 }
1123 }
1124
1125 variations.into_iter().collect()
1126 }
1127
1128 pub fn confidence_stats(&self, doc: &GroundedDocument) -> Option<(f32, f32, f32)> {
1132 let confidences: Vec<f32> = self
1133 .signals
1134 .iter()
1135 .filter_map(|sr| {
1136 doc.signals
1137 .iter()
1138 .find(|s| s.id == sr.signal_id)
1139 .map(|s| s.confidence)
1140 })
1141 .collect();
1142
1143 if confidences.is_empty() {
1144 return None;
1145 }
1146
1147 let min = confidences.iter().cloned().fold(f32::INFINITY, f32::min);
1148 let max = confidences
1149 .iter()
1150 .cloned()
1151 .fold(f32::NEG_INFINITY, f32::max);
1152 let mean = confidences.iter().sum::<f32>() / confidences.len() as f32;
1153
1154 Some((min, max, mean))
1155 }
1156
1157 pub fn compute_stats(&self, doc: &GroundedDocument, text_len: usize) -> TrackStats {
1161 let chain_length = self.signals.len();
1162 let spread = self.compute_spread(doc).unwrap_or(0);
1163 let variations = self.collect_variations(doc);
1164 let (min_conf, max_conf, mean_conf) = self.confidence_stats(doc).unwrap_or((0.0, 0.0, 0.0));
1165
1166 let positions: Vec<usize> = self
1168 .signals
1169 .iter()
1170 .filter_map(|sr| {
1171 doc.signals
1172 .iter()
1173 .find(|s| s.id == sr.signal_id)
1174 .and_then(|s| s.location.text_offsets())
1175 .map(|(start, _)| start)
1176 })
1177 .collect();
1178
1179 let first_position = positions.iter().min().copied().unwrap_or(0);
1180 let last_position = positions.iter().max().copied().unwrap_or(0);
1181 let relative_spread = if text_len > 0 {
1182 spread as f64 / text_len as f64
1183 } else {
1184 0.0
1185 };
1186
1187 TrackStats {
1188 chain_length,
1189 variation_count: variations.len(),
1190 variations,
1191 spread,
1192 relative_spread,
1193 first_position,
1194 last_position,
1195 min_confidence: min_conf,
1196 max_confidence: max_conf,
1197 mean_confidence: mean_conf,
1198 has_embedding: self.embedding.is_some(),
1199 }
1200 }
1201}
1202
/// Summary statistics for a track, produced by `Track::compute_stats`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TrackStats {
    /// Number of signal references in the track.
    pub chain_length: usize,
    /// Number of distinct surface strings (`variations.len()`).
    pub variation_count: usize,
    /// The distinct surface strings.
    pub variations: Vec<String>,
    /// Distance between the earliest and latest signal start offsets.
    pub spread: usize,
    /// `spread` divided by the text length passed to `compute_stats`
    /// (`0.0` when that length is zero).
    pub relative_spread: f64,
    /// Smallest signal start offset (`0` if none resolved).
    pub first_position: usize,
    /// Largest signal start offset (`0` if none resolved).
    pub last_position: usize,
    /// Lowest signal confidence (`0.0` if none resolved).
    pub min_confidence: f32,
    /// Highest signal confidence (`0.0` if none resolved).
    pub max_confidence: f32,
    /// Mean signal confidence (`0.0` if none resolved).
    pub mean_confidence: f32,
    /// Whether the track carries an embedding.
    pub has_embedding: bool,
}
1229
1230pub use super::types::IdentityId;
1236
/// Records how an identity was established.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum IdentitySource {
    /// Derived from tracks; presumably via cross-document coreference, per
    /// the variant name — confirm.
    CrossDocCoref {
        /// The tracks that were merged into this identity.
        track_refs: Vec<TrackRef>,
    },
    /// Taken from a knowledge base entry (set by `Identity::from_kb`).
    KnowledgeBase {
        /// Name of the knowledge base.
        kb_name: String,
        /// Entry id within that knowledge base.
        kb_id: String,
    },
    /// Carries both track references and a knowledge base entry.
    Hybrid {
        /// The contributing tracks.
        track_refs: Vec<TrackRef>,
        /// Name of the knowledge base.
        kb_name: String,
        /// Entry id within that knowledge base.
        kb_id: String,
    },
}
1268
/// A resolved identity that tracks can be linked to.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Identity {
    /// Identifier. `GroundedDocument::add_identity` overwrites it with the
    /// document's next free id.
    pub id: IdentityId,
    /// Canonical name of the identity.
    pub canonical_name: String,
    /// Optional type label.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Optional knowledge base entry id.
    pub kb_id: Option<String>,
    /// Optional knowledge base name.
    pub kb_name: Option<String>,
    /// Optional free-text description.
    pub description: Option<String>,
    /// Optional embedding vector.
    pub embedding: Option<Vec<f32>>,
    /// Alternative names, in insertion order (duplicates are not filtered).
    pub aliases: Vec<String>,
    /// Confidence; `1.0` after construction, clamped to `0.0..=1.0` by
    /// `set_confidence`.
    pub confidence: f32,
    /// How the identity was established. Defaults to `None` when missing on
    /// deserialization and is omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<IdentitySource>,
}
1317
1318impl Identity {
1319 #[must_use]
1321 pub fn new(id: impl Into<IdentityId>, canonical_name: impl Into<String>) -> Self {
1322 Self {
1323 id: id.into(),
1324 canonical_name: canonical_name.into(),
1325 entity_type: None,
1326 kb_id: None,
1327 kb_name: None,
1328 description: None,
1329 embedding: None,
1330 aliases: Vec::new(),
1331 confidence: 1.0,
1332 source: None,
1333 }
1334 }
1335
1336 #[must_use]
1338 pub fn from_kb(
1339 id: impl Into<IdentityId>,
1340 canonical_name: impl Into<String>,
1341 kb_name: impl Into<String>,
1342 kb_id: impl Into<String>,
1343 ) -> Self {
1344 let kb_name_str = kb_name.into();
1345 let kb_id_str = kb_id.into();
1346 Self {
1347 id: id.into(),
1348 canonical_name: canonical_name.into(),
1349 entity_type: None,
1350 kb_id: Some(kb_id_str.clone()),
1351 kb_name: Some(kb_name_str.clone()),
1352 description: None,
1353 embedding: None,
1354 aliases: Vec::new(),
1355 confidence: 1.0,
1356 source: Some(IdentitySource::KnowledgeBase {
1357 kb_name: kb_name_str,
1358 kb_id: kb_id_str,
1359 }),
1360 }
1361 }
1362
1363 pub fn add_alias(&mut self, alias: impl Into<String>) {
1365 self.aliases.push(alias.into());
1366 }
1367
1368 #[must_use]
1370 pub const fn id(&self) -> IdentityId {
1371 self.id
1372 }
1373
1374 #[must_use]
1376 pub fn canonical_name(&self) -> &str {
1377 &self.canonical_name
1378 }
1379
1380 #[must_use]
1382 pub fn kb_id(&self) -> Option<&str> {
1383 self.kb_id.as_deref()
1384 }
1385
1386 #[must_use]
1388 pub fn kb_name(&self) -> Option<&str> {
1389 self.kb_name.as_deref()
1390 }
1391
1392 #[must_use]
1394 pub fn aliases(&self) -> &[String] {
1395 &self.aliases
1396 }
1397
1398 #[must_use]
1400 pub const fn confidence(&self) -> f32 {
1401 self.confidence
1402 }
1403
1404 pub fn set_confidence(&mut self, confidence: f32) {
1406 self.confidence = confidence.clamp(0.0, 1.0);
1407 }
1408
1409 #[must_use]
1411 pub fn source(&self) -> Option<&IdentitySource> {
1412 self.source.as_ref()
1413 }
1414
1415 #[must_use]
1417 pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1418 self.embedding = Some(embedding);
1419 self
1420 }
1421
1422 #[must_use]
1426 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1427 let s = entity_type.into();
1428 self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1429 self
1430 }
1431
1432 #[must_use]
1437 pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1438 self.entity_type = Some(label);
1439 self
1440 }
1441
1442 #[must_use]
1447 pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1448 self.entity_type.clone()
1449 }
1450
1451 #[must_use]
1453 pub fn with_description(mut self, description: impl Into<String>) -> Self {
1454 self.description = Some(description.into());
1455 self
1456 }
1457
1458 }
1460
/// Deserialization shape of `GroundedDocument`: only the persisted fields.
/// The `From` impl below converts it into a full document and rebuilds the
/// non-serialized indexes and id counters.
#[derive(Deserialize)]
struct GroundedDocumentWire {
    // Document id.
    id: String,
    // Source text.
    text: String,
    // All signals of the document.
    signals: Vec<Signal<Location>>,
    // Tracks keyed by track id.
    tracks: HashMap<TrackId, Track>,
    // Identities keyed by identity id.
    identities: HashMap<IdentityId, Identity>,
}
1476
1477impl From<GroundedDocumentWire> for GroundedDocument {
1478 fn from(wire: GroundedDocumentWire) -> Self {
1479 let mut doc = Self {
1480 id: wire.id,
1481 text: wire.text,
1482 signals: wire.signals,
1483 tracks: wire.tracks,
1484 identities: wire.identities,
1485 signal_to_track: HashMap::new(),
1486 track_to_identity: HashMap::new(),
1487 next_signal_id: SignalId::ZERO,
1488 next_track_id: TrackId::ZERO,
1489 next_identity_id: IdentityId::ZERO,
1490 };
1491 doc.rebuild_indexes();
1492 doc
1493 }
1494}
1495
/// A document's text together with its signals, tracks and identities.
///
/// Deserialization goes through `GroundedDocumentWire`, so the private
/// indexes and id counters are rebuilt rather than read from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(from = "GroundedDocumentWire")]
pub struct GroundedDocument {
    /// Document id.
    pub id: String,
    /// Source text.
    pub text: String,
    /// All signals, in insertion order.
    pub signals: Vec<Signal<Location>>,
    /// Tracks keyed by track id.
    pub tracks: HashMap<TrackId, Track>,
    /// Identities keyed by identity id.
    pub identities: HashMap<IdentityId, Identity>,
    // Index: signal id -> id of the track that references it. Not serialized.
    #[serde(skip)]
    signal_to_track: HashMap<SignalId, TrackId>,
    // Index: track id -> id of its linked identity. Not serialized.
    #[serde(skip)]
    track_to_identity: HashMap<TrackId, IdentityId>,
    // Id handed to the next added signal. Not serialized.
    #[serde(skip)]
    next_signal_id: SignalId,
    // Id handed to the next added track. Not serialized.
    #[serde(skip)]
    next_track_id: TrackId,
    // Id handed to the next added identity. Not serialized.
    #[serde(skip)]
    next_identity_id: IdentityId,
}
1593
1594impl GroundedDocument {
1595 #[must_use]
1597 pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
1598 Self {
1599 id: id.into(),
1600 text: text.into(),
1601 signals: Vec::new(),
1602 tracks: HashMap::new(),
1603 identities: HashMap::new(),
1604 signal_to_track: HashMap::new(),
1605 track_to_identity: HashMap::new(),
1606 next_signal_id: SignalId::ZERO,
1607 next_track_id: TrackId::ZERO,
1608 next_identity_id: IdentityId::ZERO,
1609 }
1610 }
1611
1612 pub fn rebuild_indexes(&mut self) {
1620 self.signal_to_track.clear();
1621 self.track_to_identity.clear();
1622
1623 for (&track_id, track) in &self.tracks {
1624 for sig_ref in &track.signals {
1625 self.signal_to_track.insert(sig_ref.signal_id, track_id);
1626 }
1627 if let Some(identity_id) = track.identity_id {
1628 self.track_to_identity.insert(track_id, identity_id);
1629 }
1630 }
1631
1632 self.next_signal_id = self
1633 .signals
1634 .iter()
1635 .map(|s| s.id)
1636 .max()
1637 .map_or(SignalId::ZERO, |id| id + 1);
1638 self.next_track_id = self
1639 .tracks
1640 .keys()
1641 .copied()
1642 .max()
1643 .map_or(TrackId::ZERO, |id| id + 1);
1644 self.next_identity_id = self
1645 .identities
1646 .keys()
1647 .copied()
1648 .max()
1649 .map_or(IdentityId::ZERO, |id| id + 1);
1650 }
1651
1652 pub fn add_signal(&mut self, mut signal: Signal<Location>) -> SignalId {
1658 let id = self.next_signal_id;
1659 signal.id = id;
1660 self.signals.push(signal);
1661 self.next_signal_id += 1;
1662 id
1663 }
1664
1665 #[must_use]
1667 pub fn get_signal(&self, id: impl Into<SignalId>) -> Option<&Signal<Location>> {
1668 let id = id.into();
1669 self.signals.iter().find(|s| s.id == id)
1670 }
1671
1672 pub fn signals(&self) -> &[Signal<Location>] {
1674 &self.signals
1675 }
1676
1677 pub fn add_track(&mut self, mut track: Track) -> TrackId {
1683 let id = self.next_track_id;
1684 track.id = id;
1685
1686 for signal_ref in &track.signals {
1688 self.signal_to_track.insert(signal_ref.signal_id, id);
1689 }
1690
1691 self.tracks.insert(id, track);
1692 self.next_track_id += 1;
1693 id
1694 }
1695
1696 #[must_use]
1698 pub fn get_track(&self, id: impl Into<TrackId>) -> Option<&Track> {
1699 self.tracks.get(&id.into())
1700 }
1701
1702 #[must_use]
1704 pub fn get_track_mut(&mut self, id: impl Into<TrackId>) -> Option<&mut Track> {
1705 self.tracks.get_mut(&id.into())
1706 }
1707
1708 pub fn add_signal_to_track(
1713 &mut self,
1714 signal_id: impl Into<SignalId>,
1715 track_id: impl Into<TrackId>,
1716 position: u32,
1717 ) -> bool {
1718 let signal_id = signal_id.into();
1719 let track_id = track_id.into();
1720 if let Some(track) = self.tracks.get_mut(&track_id) {
1721 track.add_signal(signal_id, position);
1722 self.signal_to_track.insert(signal_id, track_id);
1723 true
1724 } else {
1725 false
1726 }
1727 }
1728
1729 #[must_use]
1731 pub fn track_for_signal(&self, signal_id: SignalId) -> Option<&Track> {
1732 let track_id = self.signal_to_track.get(&signal_id)?;
1733 self.tracks.get(track_id)
1734 }
1735
1736 pub fn tracks(&self) -> impl Iterator<Item = &Track> {
1738 self.tracks.values()
1739 }
1740
1741 pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
1747 let id = self.next_identity_id;
1748 identity.id = id;
1749 self.identities.insert(id, identity);
1750 self.next_identity_id += 1;
1751 id
1752 }
1753
1754 pub fn link_track_to_identity(
1756 &mut self,
1757 track_id: impl Into<TrackId>,
1758 identity_id: impl Into<IdentityId>,
1759 ) {
1760 let track_id = track_id.into();
1761 let identity_id = identity_id.into();
1762 if let Some(track) = self.tracks.get_mut(&track_id) {
1763 track.identity_id = Some(identity_id);
1764 self.track_to_identity.insert(track_id, identity_id);
1765 }
1766 }
1767
1768 #[must_use]
1770 pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
1771 self.identities.get(&id)
1772 }
1773
1774 #[must_use]
1776 pub fn identity_for_track(&self, track_id: TrackId) -> Option<&Identity> {
1777 let identity_id = self.track_to_identity.get(&track_id)?;
1778 self.identities.get(identity_id)
1779 }
1780
1781 #[must_use]
1783 pub fn identity_for_signal(&self, signal_id: SignalId) -> Option<&Identity> {
1784 let track_id = self.signal_to_track.get(&signal_id)?;
1785 self.identity_for_track(*track_id)
1786 }
1787
1788 pub fn identities(&self) -> impl Iterator<Item = &Identity> {
1790 self.identities.values()
1791 }
1792
1793 #[must_use]
1798 pub fn track_ref(&self, track_id: TrackId) -> Option<TrackRef> {
1799 if self.tracks.contains_key(&track_id) {
1801 Some(TrackRef {
1802 doc_id: self.id.clone(),
1803 track_id,
1804 })
1805 } else {
1806 None
1807 }
1808 }
1809
1810 #[must_use]
1816 pub fn to_entities(&self) -> Vec<Entity> {
1817 self.signals
1818 .iter()
1819 .map(|signal| {
1820 let (start, end) = signal.location.text_offsets().unwrap_or((0, 0));
1821 let track = self.track_for_signal(signal.id);
1822 let identity = track.and_then(|t| self.identity_for_track(t.id));
1823
1824 Entity {
1825 text: signal.surface.clone(),
1826 entity_type: EntityType::from_label(signal.label.as_str()),
1827 start,
1828 end,
1829 confidence: Confidence::from(signal.confidence),
1830 normalized: signal.normalized.clone(),
1831 provenance: signal.provenance.clone(),
1832 kb_id: identity.and_then(|i| i.kb_id.clone()),
1833 canonical_id: track.map(|t| super::types::CanonicalId::new(t.id.get())),
1834 hierarchical_confidence: signal.hierarchical,
1835 visual_span: match &signal.location {
1836 Location::BoundingBox {
1837 x,
1838 y,
1839 width,
1840 height,
1841 page,
1842 } => Some(Span::BoundingBox {
1843 x: *x,
1844 y: *y,
1845 width: *width,
1846 height: *height,
1847 page: *page,
1848 }),
1849 Location::TextWithBbox { bbox, .. } => {
1850 if let Location::BoundingBox {
1851 x,
1852 y,
1853 width,
1854 height,
1855 page,
1856 } = bbox.as_ref()
1857 {
1858 Some(Span::BoundingBox {
1859 x: *x,
1860 y: *y,
1861 width: *width,
1862 height: *height,
1863 page: *page,
1864 })
1865 } else {
1866 None
1867 }
1868 }
1869 _ => None,
1870 },
1871 discontinuous_span: match &signal.location {
1872 Location::Discontinuous { segments } => Some(DiscontinuousSpan::new(
1873 segments.iter().map(|(s, e)| (*s)..(*e)).collect(),
1874 )),
1875 _ => None,
1876 },
1877 valid_from: None,
1878 valid_until: None,
1879 viewport: None,
1880 phi_features: None,
1881 mention_type: None,
1882 }
1883 })
1884 .collect()
1885 }
1886
1887 #[must_use]
1889 pub fn from_entities(
1890 id: impl Into<String>,
1891 text: impl Into<String>,
1892 entities: &[Entity],
1893 ) -> Self {
1894 let mut doc = Self::new(id, text);
1895
1896 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1902 enum TrackKey {
1903 Canonical(super::types::CanonicalId),
1904 Singleton(usize),
1905 }
1906
1907 let mut tracks_map: HashMap<TrackKey, Vec<SignalId>> = HashMap::new();
1908 let mut signal_to_entity_idx: HashMap<SignalId, usize> = HashMap::new();
1909
1910 for (idx, entity) in entities.iter().enumerate() {
1911 let location = if let Some(disc) = &entity.discontinuous_span {
1912 Location::Discontinuous {
1913 segments: disc.segments().iter().map(|r| (r.start, r.end)).collect(),
1914 }
1915 } else if let Some(visual) = &entity.visual_span {
1916 Location::from(visual)
1917 } else {
1918 Location::text(entity.start, entity.end)
1919 };
1920
1921 let mut signal = Signal::new(
1922 SignalId::new(idx as u64),
1923 location,
1924 &entity.text,
1925 entity.entity_type.as_label(),
1926 f32::from(entity.confidence),
1927 );
1928 signal.normalized = entity.normalized.clone();
1929 signal.provenance = entity.provenance.clone();
1930 signal.hierarchical = entity.hierarchical_confidence;
1931
1932 let signal_id = doc.add_signal(signal);
1933 signal_to_entity_idx.insert(signal_id, idx);
1934
1935 let key = match entity.canonical_id {
1936 Some(cid) => TrackKey::Canonical(cid),
1937 None => TrackKey::Singleton(idx),
1938 };
1939 tracks_map.entry(key).or_default().push(signal_id);
1940 }
1941
1942 for (_key, signal_ids) in tracks_map {
1944 if let Some(first_signal) = signal_ids.first().and_then(|id| doc.get_signal(*id)) {
1945 let mut track = Track::new(doc.next_track_id, &first_signal.surface);
1946 track.entity_type =
1947 Some(super::types::TypeLabel::from(first_signal.label.as_str()));
1948
1949 for (pos, &signal_id) in signal_ids.iter().enumerate() {
1950 track.add_signal(signal_id, pos as u32);
1951 }
1952
1953 let kb_id = signal_ids.iter().find_map(|sid| {
1956 let ent_idx = signal_to_entity_idx.get(sid).copied()?;
1957 entities.get(ent_idx)?.kb_id.clone()
1958 });
1959 if let Some(kb_id) = kb_id {
1960 let identity = Identity::from_kb(
1961 doc.next_identity_id,
1962 &track.canonical_surface,
1963 "unknown",
1964 kb_id,
1965 );
1966 let identity_id = doc.add_identity(identity);
1967 track = track.with_identity(identity_id);
1968 }
1969
1970 doc.add_track(track);
1971 }
1972 }
1973
1974 doc
1975 }
1976
1977 #[must_use]
1979 pub fn signals_with_label(&self, label: &str) -> Vec<&Signal<Location>> {
1980 let want = super::types::TypeLabel::from(label);
1981 self.signals.iter().filter(|s| s.label == want).collect()
1982 }
1983
1984 #[must_use]
1986 pub fn confident_signals(&self, threshold: f32) -> Vec<&Signal<Location>> {
1987 self.signals
1988 .iter()
1989 .filter(|s| s.confidence >= threshold)
1990 .collect()
1991 }
1992
1993 pub fn linked_tracks(&self) -> impl Iterator<Item = &Track> {
1995 self.tracks.values().filter(|t| t.identity_id.is_some())
1996 }
1997
1998 pub fn unlinked_tracks(&self) -> impl Iterator<Item = &Track> {
2000 self.tracks.values().filter(|t| t.identity_id.is_none())
2001 }
2002
2003 #[must_use]
2005 pub fn untracked_signal_count(&self) -> usize {
2006 self.signals
2007 .iter()
2008 .filter(|s| !self.signal_to_track.contains_key(&s.id))
2009 .count()
2010 }
2011
2012 #[must_use]
2014 pub fn untracked_signals(&self) -> Vec<&Signal<Location>> {
2015 self.signals
2016 .iter()
2017 .filter(|s| !self.signal_to_track.contains_key(&s.id))
2018 .collect()
2019 }
2020
2021 #[must_use]
2027 pub fn signals_by_modality(&self, modality: Modality) -> Vec<&Signal<Location>> {
2028 self.signals
2029 .iter()
2030 .filter(|s| s.modality == modality)
2031 .collect()
2032 }
2033
2034 #[must_use]
2036 pub fn text_signals(&self) -> Vec<&Signal<Location>> {
2037 self.signals_by_modality(Modality::Symbolic)
2038 }
2039
2040 #[must_use]
2042 pub fn visual_signals(&self) -> Vec<&Signal<Location>> {
2043 self.signals_by_modality(Modality::Iconic)
2044 }
2045
2046 #[must_use]
2048 pub fn overlapping_signals(&self, location: &Location) -> Vec<&Signal<Location>> {
2049 self.signals
2050 .iter()
2051 .filter(|s| s.location.overlaps(location))
2052 .collect()
2053 }
2054
2055 #[must_use]
2057 pub fn signals_in_range(&self, start: usize, end: usize) -> Vec<&Signal<Location>> {
2058 self.signals
2059 .iter()
2060 .filter(|s| {
2061 if let Some((s_start, s_end)) = s.location.text_offsets() {
2062 s_start >= start && s_end <= end
2063 } else {
2064 false
2065 }
2066 })
2067 .collect()
2068 }
2069
2070 #[must_use]
2072 pub fn negated_signals(&self) -> Vec<&Signal<Location>> {
2073 self.signals.iter().filter(|s| s.negated).collect()
2074 }
2075
2076 #[must_use]
2078 pub fn quantified_signals(&self, quantifier: Quantifier) -> Vec<&Signal<Location>> {
2079 self.signals
2080 .iter()
2081 .filter(|s| s.quantifier == Some(quantifier))
2082 .collect()
2083 }
2084
2085 #[must_use]
2107 pub fn validate(&self) -> Vec<SignalValidationError> {
2108 self.signals
2109 .iter()
2110 .filter_map(|s| s.validate_against(&self.text))
2111 .collect()
2112 }
2113
2114 #[must_use]
2138 pub fn validate_invariants(&self) -> Vec<String> {
2139 let mut errors = Vec::new();
2140
2141 let mut seen_ids = std::collections::HashSet::new();
2143 for signal in &self.signals {
2144 if !seen_ids.insert(signal.id) {
2145 errors.push(format!("Duplicate signal ID: {}", signal.id));
2146 }
2147 }
2148
2149 let signal_ids: std::collections::HashSet<_> = self.signals.iter().map(|s| s.id).collect();
2151
2152 for (track_id, track) in &self.tracks {
2154 for signal_ref in &track.signals {
2155 if !signal_ids.contains(&signal_ref.signal_id) {
2156 errors.push(format!(
2157 "Track {} references non-existent signal {}",
2158 track_id, signal_ref.signal_id
2159 ));
2160 }
2161 }
2162 }
2163
2164 for (signal_id, track_id) in &self.signal_to_track {
2166 if let Some(track) = self.tracks.get(track_id) {
2168 if !track.signals.iter().any(|r| r.signal_id == *signal_id) {
2170 errors.push(format!(
2171 "signal_to_track[{}] = {} but track doesn't contain signal",
2172 signal_id, track_id
2173 ));
2174 }
2175 } else {
2176 errors.push(format!(
2177 "signal_to_track[{}] = {} but track doesn't exist",
2178 signal_id, track_id
2179 ));
2180 }
2181 }
2182
2183 for (track_id, identity_id) in &self.track_to_identity {
2185 if let Some(track) = self.tracks.get(track_id) {
2187 if track.identity_id != Some(*identity_id) {
2188 errors.push(format!(
2189 "track_to_identity[{}] = {} but track.identity_id = {:?}",
2190 track_id, identity_id, track.identity_id
2191 ));
2192 }
2193 } else {
2194 errors.push(format!(
2195 "track_to_identity[{}] = {} but track doesn't exist",
2196 track_id, identity_id
2197 ));
2198 }
2199
2200 if !self.identities.contains_key(identity_id) {
2202 errors.push(format!(
2203 "track_to_identity[{}] = {} but identity doesn't exist",
2204 track_id, identity_id
2205 ));
2206 }
2207 }
2208
2209 for (track_id, track) in &self.tracks {
2211 if let Some(identity_id) = track.identity_id {
2212 if !self.identities.contains_key(&identity_id) {
2213 errors.push(format!(
2214 "Track {} references non-existent identity {}",
2215 track_id, identity_id
2216 ));
2217 }
2218 }
2219 }
2220
2221 errors
2222 }
2223
2224 #[must_use]
2226 pub fn invariants_hold(&self) -> bool {
2227 self.validate_invariants().is_empty()
2228 }
2229
2230 #[must_use]
2232 pub fn is_valid(&self) -> bool {
2233 self.signals.iter().all(|s| s.is_valid(&self.text))
2234 }
2235
2236 pub fn add_signal_validated(
2240 &mut self,
2241 signal: Signal<Location>,
2242 ) -> Result<SignalId, SignalValidationError> {
2243 if let Some(err) = signal.validate_against(&self.text) {
2244 return Err(err);
2245 }
2246 Ok(self.add_signal(signal))
2247 }
2248
2249 pub fn add_signal_from_text(
2263 &mut self,
2264 surface: &str,
2265 label: impl Into<super::types::TypeLabel>,
2266 confidence: f32,
2267 ) -> Option<SignalId> {
2268 let signal = Signal::from_text(&self.text, surface, label, confidence)?;
2269 Some(self.add_signal(signal))
2270 }
2271
2272 pub fn add_signal_from_text_nth(
2274 &mut self,
2275 surface: &str,
2276 label: impl Into<super::types::TypeLabel>,
2277 confidence: f32,
2278 occurrence: usize,
2279 ) -> Option<SignalId> {
2280 let signal = Signal::from_text_nth(&self.text, surface, label, confidence, occurrence)?;
2281 Some(self.add_signal(signal))
2282 }
2283
2284 #[must_use]
2290 pub fn stats(&self) -> DocumentStats {
2291 let signal_count = self.signals.len();
2292 let track_count = self.tracks.len();
2293 let identity_count = self.identities.len();
2294
2295 let linked_track_count = self
2296 .tracks
2297 .values()
2298 .filter(|t| t.identity_id.is_some())
2299 .count();
2300 let untracked_count = self.untracked_signal_count();
2301
2302 let avg_track_size = if track_count > 0 {
2303 self.tracks.values().map(|t| t.len()).sum::<usize>() as f32 / track_count as f32
2304 } else {
2305 0.0
2306 };
2307
2308 let singleton_count = self.tracks.values().filter(|t| t.is_singleton()).count();
2309
2310 let avg_confidence = if signal_count > 0 {
2311 self.signals.iter().map(|s| s.confidence).sum::<f32>() / signal_count as f32
2312 } else {
2313 0.0
2314 };
2315
2316 let negated_count = self.signals.iter().filter(|s| s.negated).count();
2317
2318 let symbolic_count = self
2320 .signals
2321 .iter()
2322 .filter(|s| s.modality == Modality::Symbolic)
2323 .count();
2324 let iconic_count = self
2325 .signals
2326 .iter()
2327 .filter(|s| s.modality == Modality::Iconic)
2328 .count();
2329 let hybrid_count = self
2330 .signals
2331 .iter()
2332 .filter(|s| s.modality == Modality::Hybrid)
2333 .count();
2334
2335 DocumentStats {
2336 signal_count,
2337 track_count,
2338 identity_count,
2339 linked_track_count,
2340 untracked_count,
2341 avg_track_size,
2342 singleton_count,
2343 avg_confidence,
2344 negated_count,
2345 symbolic_count,
2346 iconic_count,
2347 hybrid_count,
2348 }
2349 }
2350
2351 pub fn add_signals(
2359 &mut self,
2360 signals: impl IntoIterator<Item = Signal<Location>>,
2361 ) -> Vec<SignalId> {
2362 signals.into_iter().map(|s| self.add_signal(s)).collect()
2363 }
2364
2365 pub fn create_track_from_signals(
2369 &mut self,
2370 canonical: impl Into<String>,
2371 signal_ids: &[SignalId],
2372 ) -> Option<TrackId> {
2373 if signal_ids.is_empty() {
2374 return None;
2375 }
2376
2377 let mut track = Track::new(TrackId::ZERO, canonical);
2378 for (pos, &id) in signal_ids.iter().enumerate() {
2379 track.add_signal(id, pos as u32);
2380 }
2381 Some(self.add_track(track))
2382 }
2383
2384 pub fn merge_tracks(&mut self, track_ids: &[TrackId]) -> Option<TrackId> {
2389 if track_ids.is_empty() {
2390 return None;
2391 }
2392
2393 let mut all_signals: Vec<SignalRef> = Vec::new();
2395 let mut canonical = String::new();
2396 let mut entity_type = None;
2397
2398 for &track_id in track_ids {
2399 if let Some(track) = self.tracks.get(&track_id) {
2400 if canonical.is_empty() {
2401 canonical = track.canonical_surface.clone();
2402 entity_type = track.entity_type.clone();
2403 }
2404 all_signals.extend(track.signals.iter().cloned());
2405 }
2406 }
2407
2408 if all_signals.is_empty() {
2409 return None;
2410 }
2411
2412 all_signals.sort_by_key(|s| s.position);
2414
2415 for &track_id in track_ids {
2417 self.tracks.remove(&track_id);
2418 }
2419
2420 let mut new_track = Track::new(TrackId::ZERO, canonical);
2422 new_track.entity_type = entity_type;
2423 for (pos, signal_ref) in all_signals.iter().enumerate() {
2424 new_track.add_signal(signal_ref.signal_id, pos as u32);
2425 }
2426
2427 Some(self.add_track(new_track))
2428 }
2429
2430 #[must_use]
2432 pub fn find_overlapping_signal_pairs(&self) -> Vec<(SignalId, SignalId)> {
2433 let mut pairs = Vec::new();
2434 let signals: Vec<_> = self.signals.iter().collect();
2435
2436 for i in 0..signals.len() {
2437 for j in (i + 1)..signals.len() {
2438 if signals[i].location.overlaps(&signals[j].location) {
2439 pairs.push((signals[i].id, signals[j].id));
2440 }
2441 }
2442 }
2443
2444 pairs
2445 }
2446}
2447
/// Summary statistics for a grounded document, as produced by
/// `GroundedDocument::stats`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocumentStats {
    /// Total number of signals.
    pub signal_count: usize,
    /// Total number of tracks.
    pub track_count: usize,
    /// Total number of identities.
    pub identity_count: usize,
    /// Number of tracks that are linked to an identity.
    pub linked_track_count: usize,
    /// Number of signals that belong to no track.
    pub untracked_count: usize,
    /// Mean number of signals per track (`0.0` when there are no tracks).
    pub avg_track_size: f32,
    /// Number of tracks reported as singletons.
    pub singleton_count: usize,
    /// Mean signal confidence (`0.0` when there are no signals).
    pub avg_confidence: f32,
    /// Number of negated signals.
    pub negated_count: usize,
    /// Number of signals with symbolic modality.
    pub symbolic_count: usize,
    /// Number of signals with iconic modality.
    pub iconic_count: usize,
    /// Number of signals with hybrid modality.
    pub hybrid_count: usize,
}
2476
2477impl std::fmt::Display for DocumentStats {
2478 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2479 writeln!(f, "Document Statistics:")?;
2480 writeln!(
2481 f,
2482 " Signals: {} (avg confidence: {:.2})",
2483 self.signal_count, self.avg_confidence
2484 )?;
2485 writeln!(
2486 f,
2487 " Tracks: {} (avg size: {:.1}, singletons: {})",
2488 self.track_count, self.avg_track_size, self.singleton_count
2489 )?;
2490 writeln!(
2491 f,
2492 " Identities: {} ({} tracks linked)",
2493 self.identity_count, self.linked_track_count
2494 )?;
2495 writeln!(f, " Untracked signals: {}", self.untracked_count)?;
2496 writeln!(
2497 f,
2498 " Modalities: {} symbolic, {} iconic, {} hybrid",
2499 self.symbolic_count, self.iconic_count, self.hybrid_count
2500 )?;
2501 if self.negated_count > 0 {
2502 writeln!(f, " Negated: {}", self.negated_count)?;
2503 }
2504 Ok(())
2505 }
2506}
2507
2508#[derive(Debug, Clone)]
2518struct IntervalNode {
2519 signal_id: SignalId,
2521 start: usize,
2523 end: usize,
2525 max_end: usize,
2527 left: Option<Box<IntervalNode>>,
2529 right: Option<Box<IntervalNode>>,
2531}
2532
2533impl IntervalNode {
2534 fn new(signal_id: SignalId, start: usize, end: usize) -> Self {
2535 Self {
2536 signal_id,
2537 start,
2538 end,
2539 max_end: end,
2540 left: None,
2541 right: None,
2542 }
2543 }
2544
2545 fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2546 self.max_end = self.max_end.max(end);
2547
2548 if start < self.start {
2549 if let Some(ref mut left) = self.left {
2550 left.insert(signal_id, start, end);
2551 } else {
2552 self.left = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2553 }
2554 } else if let Some(ref mut right) = self.right {
2555 right.insert(signal_id, start, end);
2556 } else {
2557 self.right = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2558 }
2559 }
2560
2561 fn query_overlap(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2562 if self.start < query_end && query_start < self.end {
2564 results.push(self.signal_id);
2565 }
2566
2567 if let Some(ref left) = self.left {
2569 if left.max_end > query_start {
2570 left.query_overlap(query_start, query_end, results);
2571 }
2572 }
2573
2574 if let Some(ref right) = self.right {
2576 if self.start < query_end {
2577 right.query_overlap(query_start, query_end, results);
2578 }
2579 }
2580 }
2581
2582 fn query_containing(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2583 if self.start <= query_start && self.end >= query_end {
2585 results.push(self.signal_id);
2586 }
2587
2588 if let Some(ref left) = self.left {
2590 if left.max_end >= query_end {
2591 left.query_containing(query_start, query_end, results);
2592 }
2593 }
2594
2595 if let Some(ref right) = self.right {
2597 if self.start <= query_start {
2598 right.query_containing(query_start, query_end, results);
2599 }
2600 }
2601 }
2602
2603 fn query_contained_in(
2604 &self,
2605 range_start: usize,
2606 range_end: usize,
2607 results: &mut Vec<SignalId>,
2608 ) {
2609 if self.start >= range_start && self.end <= range_end {
2611 results.push(self.signal_id);
2612 }
2613
2614 if let Some(ref left) = self.left {
2616 left.query_contained_in(range_start, range_end, results);
2617 }
2618
2619 if let Some(ref right) = self.right {
2621 if self.start < range_end {
2622 right.query_contained_in(range_start, range_end, results);
2623 }
2624 }
2625 }
2626}
2627
2628#[derive(Debug, Clone, Default)]
2644pub struct TextSpatialIndex {
2645 root: Option<IntervalNode>,
2646 size: usize,
2647}
2648
2649impl TextSpatialIndex {
2650 #[must_use]
2652 pub fn new() -> Self {
2653 Self::default()
2654 }
2655
2656 #[must_use]
2658 pub fn from_signals(signals: &[Signal<Location>]) -> Self {
2659 let mut index = Self::new();
2660 for signal in signals {
2661 if let Some((start, end)) = signal.location.text_offsets() {
2662 index.insert(signal.id, start, end);
2663 }
2664 }
2665 index
2666 }
2667
2668 pub fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2670 if let Some(ref mut root) = self.root {
2671 root.insert(signal_id, start, end);
2672 } else {
2673 self.root = Some(IntervalNode::new(signal_id, start, end));
2674 }
2675 self.size += 1;
2676 }
2677
2678 #[must_use]
2680 pub fn query_overlap(&self, start: usize, end: usize) -> Vec<SignalId> {
2681 let mut results = Vec::new();
2682 if let Some(ref root) = self.root {
2683 root.query_overlap(start, end, &mut results);
2684 }
2685 results
2686 }
2687
2688 #[must_use]
2690 pub fn query_containing(&self, start: usize, end: usize) -> Vec<SignalId> {
2691 let mut results = Vec::new();
2692 if let Some(ref root) = self.root {
2693 root.query_containing(start, end, &mut results);
2694 }
2695 results
2696 }
2697
2698 #[must_use]
2700 pub fn query_contained_in(&self, start: usize, end: usize) -> Vec<SignalId> {
2701 let mut results = Vec::new();
2702 if let Some(ref root) = self.root {
2703 root.query_contained_in(start, end, &mut results);
2704 }
2705 results
2706 }
2707
2708 #[must_use]
2710 pub fn len(&self) -> usize {
2711 self.size
2712 }
2713
2714 #[must_use]
2716 pub fn is_empty(&self) -> bool {
2717 self.size == 0
2718 }
2719}
2720
2721impl GroundedDocument {
2722 #[must_use]
2741 pub fn build_text_index(&self) -> TextSpatialIndex {
2742 TextSpatialIndex::from_signals(&self.signals)
2743 }
2744
2745 #[must_use]
2750 pub fn query_signals_in_range_indexed(
2751 &self,
2752 start: usize,
2753 end: usize,
2754 ) -> Vec<&Signal<Location>> {
2755 let index = self.build_text_index();
2756 let ids = index.query_contained_in(start, end);
2757 ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2758 }
2759
2760 #[must_use]
2762 pub fn query_overlapping_signals_indexed(
2763 &self,
2764 start: usize,
2765 end: usize,
2766 ) -> Vec<&Signal<Location>> {
2767 let index = self.build_text_index();
2768 let ids = index.query_overlap(start, end);
2769 ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2770 }
2771
2772 #[must_use]
2785 pub fn to_coref_document(&self) -> super::coref::CorefDocument {
2786 use super::coref::{CorefChain, CorefDocument, Mention};
2787 use std::collections::HashMap;
2788
2789 let signal_by_id: HashMap<SignalId, &Signal<Location>> =
2791 self.signals.iter().map(|s| (s.id, s)).collect();
2792
2793 let mut chains: Vec<CorefChain> = Vec::new();
2794
2795 for track in self.tracks.values() {
2796 let mut mentions: Vec<Mention> = Vec::new();
2797
2798 for sref in &track.signals {
2799 let Some(signal) = signal_by_id.get(&sref.signal_id) else {
2800 continue;
2801 };
2802
2803 let Some((start, end)) = signal.location.text_offsets() else {
2804 continue;
2805 };
2806
2807 let mut m = Mention::new(signal.surface.clone(), start, end);
2808 m.entity_type = Some(signal.label.to_string());
2809 mentions.push(m);
2810 }
2811
2812 if mentions.is_empty() {
2813 continue;
2814 }
2815
2816 let mut chain = CorefChain::new(mentions);
2817 chain.entity_type = track.entity_type.as_ref().map(|t| t.to_string());
2818 chains.push(chain);
2819 }
2820
2821 chains.sort_by_key(|c| c.mentions.first().map(|m| m.start).unwrap_or(usize::MAX));
2823
2824 CorefDocument::with_id(&self.text, &self.id, chains)
2825 }
2826}
2827
2828pub fn render_document_html(doc: &GroundedDocument) -> String {
2836 let mut html = String::new();
2837 let stats = doc.stats();
2838
2839 html.push_str(r#"<!DOCTYPE html>
2840<html>
2841<head>
2842<meta charset="UTF-8">
2843<meta name="color-scheme" content="dark light">
2844<title>grounded::GroundedDocument</title>
2845<style>
2846:root{
2847 /* Allow UA widgets (inputs/scrollbars) to match the theme */
2848 color-scheme: light dark;
2849 /* Dark (default) */
2850 --bg:#0a0a0a;
2851 --panel-bg:#0d0d0d;
2852 --text:#b0b0b0;
2853 --text-strong:#fff;
2854 --muted:#666;
2855 --border:#222;
2856 --border-strong:#333;
2857 --hover:#111;
2858 --input-bg:#080808;
2859 --active:#fff;
2860 --track-strong:rgba(255,255,255,0.35);
2861 --track-soft:rgba(255,255,255,0.18);
2862 /* Entity colors (dark) */
2863 --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2864 --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2865 --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2866 --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2867 --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2868 --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2869 --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2870}
2871@media (prefers-color-scheme: light){
2872 :root{
2873 --bg:#ffffff;
2874 --panel-bg:#f7f7f7;
2875 --text:#222;
2876 --text-strong:#000;
2877 --muted:#555;
2878 --border:#d6d6d6;
2879 --border-strong:#c6c6c6;
2880 --hover:#f0f0f0;
2881 --input-bg:#ffffff;
2882 --active:#000;
2883 --track-strong:rgba(0,0,0,0.25);
2884 --track-soft:rgba(0,0,0,0.12);
2885 /* Entity colors (light) */
2886 --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2887 --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2888 --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2889 --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2890 --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2891 --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2892 --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2893 }
2894}
2895html[data-theme='dark']{
2896 --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
2897 --muted:#666; --border:#222; --border-strong:#333; --hover:#111;
2898 --input-bg:#080808; --active:#fff;
2899 --track-strong:rgba(255,255,255,0.35); --track-soft:rgba(255,255,255,0.18);
2900 --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2901 --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2902 --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2903 --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2904 --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2905 --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2906 --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2907}
2908html[data-theme='light']{
2909 --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
2910 --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0;
2911 --input-bg:#ffffff; --active:#000;
2912 --track-strong:rgba(0,0,0,0.25); --track-soft:rgba(0,0,0,0.12);
2913 --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2914 --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2915 --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2916 --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2917 --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2918 --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2919 --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2920}
2921
2922*{box-sizing:border-box;margin:0;padding:0}
2923body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
2924h1,h2,h3{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
2925h1{font-size:14px}h2{font-size:12px}h3{font-size:11px;color:var(--muted)}
2926 a{color:inherit}
2927 a:hover{text-decoration:underline}
2928table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
2929th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
2930th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
2931tr:hover{background:var(--hover)}
2932.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:8px}
2933.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
2934.panel-h{display:flex;align-items:center;gap:8px}
2935.toggle{cursor:pointer;user-select:none;color:var(--muted);border:1px solid var(--border);background:var(--bg);padding:2px 6px;font-size:10px}
2936.panel-collapsed table,.panel-collapsed .panel-body{display:none}
2937.toolbar{display:flex;gap:8px;align-items:center;margin:8px 0 0}
2938.toolbar input{width:100%;max-width:520px;background:var(--input-bg);border:1px solid var(--border);color:var(--text);padding:6px 8px;font:12px monospace}
2939.muted{color:var(--muted)}
2940.panel-body{white-space:pre-wrap;word-break:break-word}
2941.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
2942.e{padding:1px 2px;border-bottom:1px solid}
2943.seg{cursor:pointer}
2944.e-per{background:var(--per-bg);border-color:var(--per-br);color:var(--per-tx)}
2945.e-org{background:var(--org-bg);border-color:var(--org-br);color:var(--org-tx)}
2946.e-loc{background:var(--loc-bg);border-color:var(--loc-br);color:var(--loc-tx)}
2947.e-misc{background:var(--mis-bg);border-color:var(--mis-br);color:var(--mis-tx)}
2948.e-date{background:var(--dat-bg);border-color:var(--dat-br);color:var(--dat-tx)}
2949.e-track{box-shadow:inset 0 0 0 1px var(--track-strong)}
2950.e-track-hover{box-shadow:inset 0 0 0 1px var(--track-soft)}
2951.e-active{outline:2px solid var(--active);outline-offset:1px}
2952.conf{color:var(--muted);font-size:10px}
2953.badge{display:inline-block;padding:1px 4px;font-size:9px;text-transform:uppercase}
2954.badge-y{background:var(--badge-y-bg);color:var(--badge-y-tx);border:1px solid var(--badge-y-br)}
2955.badge-n{background:var(--badge-n-bg);color:var(--badge-n-tx);border:1px solid var(--badge-n-br)}
2956.stats{display:flex;gap:16px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
2957.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
2958.id{color:var(--muted);font-size:9px}
2959.kb{color:var(--muted)}
2960.arrow{color:var(--muted)}
2961</style>
2962</head>
2963<body>
2964"#);
2965
2966 html.push_str(&format!(
2968 r#"<div class="panel-h" style="justify-content:space-between"><h1>doc_id="{}" len={}</h1><span class="toggle" id="theme-toggle" title="toggle theme (auto → dark → light)">theme: auto</span></div>"#,
2969 html_escape(&doc.id),
2970 doc.text.len()
2971 ));
2972
2973 html.push_str(r#"<div class="stats">"#);
2974 html.push_str(&format!(
2975 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">signals</div></div>"#,
2976 stats.signal_count
2977 ));
2978 html.push_str(&format!(
2979 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">tracks</div></div>"#,
2980 stats.track_count
2981 ));
2982 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">identities</div></div>"#, stats.identity_count));
2983 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{:.2}</div><div class="stat-l">avg_conf</div></div>"#, stats.avg_confidence));
2984 html.push_str(&format!(
2985 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">linked</div></div>"#,
2986 stats.linked_track_count
2987 ));
2988 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">untracked</div></div>"#, stats.untracked_count));
2989 if stats.iconic_count > 0 || stats.hybrid_count > 0 {
2990 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}/{}/{}</div><div class="stat-l">sym/ico/hyb</div></div>"#,
2991 stats.symbolic_count, stats.iconic_count, stats.hybrid_count));
2992 }
2993 html.push_str(r#"</div>"#);
2994
2995 html.push_str(r#"<h2>text</h2>"#);
2997 html.push_str(r#"<div class="text-box">"#);
2998 html.push_str(&annotate_text_html(
2999 &doc.text,
3000 doc.signals(),
3001 &doc.signal_to_track,
3002 ));
3003 html.push_str(r#"</div>"#);
3004
3005 html.push_str(
3007 r#"<h2>selection</h2><div class="panel" id="selection-panel" role="region" aria-label="selection"><div class="panel-h"><h3>selection</h3><span class="muted" id="selection-hint" role="status" aria-live="polite">click a mention / row to see coref track details</span></div><pre class="panel-body" id="selection-body" role="textbox" aria-readonly="true" aria-label="selection details">—</pre></div>"#,
3008 );
3009
3010 html.push_str(r#"<div class="grid">"#);
3012
3013 html.push_str(r#"<div class="panel" id="panel-signals"><div class="panel-h"><h3>signals (level 1)</h3><span class="toggle" data-toggle="panel-signals">toggle</span></div><div class="toolbar"><input id="signal-filter" type="text" placeholder="filter signals: id / label / surface (e.g. 'PER', 'S12', 'Paris')" /><span class="muted" id="signal-filter-count"></span></div><table id="signals-table">"#);
3015 html.push_str(r#"<tr><th>id</th><th>span</th><th>surface</th><th>label</th><th>conf</th><th>track</th></tr>"#);
3016 for signal in doc.signals() {
3017 let (span, start_opt, end_opt) = if let Some((s, e)) = signal.location.text_offsets() {
3018 (format!("[{},{})", s, e), Some(s), Some(e))
3019 } else {
3020 ("bbox".to_string(), None, None)
3021 };
3022 let track_id_num = doc.signal_to_track.get(&signal.id).copied();
3023 let track_id = track_id_num
3024 .map(|t| format!("T{}", t))
3025 .unwrap_or_else(|| "-".to_string());
3026 let track_attr = track_id_num
3027 .map(|t| format!(r#" data-track="{}""#, t))
3028 .unwrap_or_default();
3029 let offs_attr = match (start_opt, end_opt) {
3030 (Some(s), Some(e)) => format!(r#" data-start="{}" data-end="{}""#, s, e),
3031 _ => String::new(),
3032 };
3033 let neg = if signal.negated { " NEG" } else { "" };
3034 html.push_str(&format!(
3035 r#"<tr data-sid="S{sid}" data-label="{label}" data-surface="{surface}"{track_attr}{offs_attr} data-conf="{conf:.2}"><td class="id"><a href='#S{sid}'>S{sid}</a></td><td>{span}</td><td>{surface}</td><td>{label}{neg}</td><td class="conf">{conf:.2}</td><td class="id">{track}</td></tr>"#,
3036 sid = signal.id,
3037 span = span,
3038 surface = html_escape(&signal.surface),
3039 label = html_escape(signal.label.as_str()),
3040 neg = neg,
3041 conf = signal.confidence,
3042 track = track_id,
3043 track_attr = track_attr,
3044 offs_attr = offs_attr
3045 ));
3046 }
3047 html.push_str(r#"</table></div>"#);
3048
3049 html.push_str(r#"<div class="panel" id="panel-tracks"><div class="panel-h"><h3>tracks (level 2)</h3><span class="toggle" data-toggle="panel-tracks">toggle</span></div><table id="tracks-table">"#);
3051 html.push_str(r#"<tr><th>id</th><th>canonical</th><th>type</th><th>|S|</th><th>signals</th><th>identity</th></tr>"#);
3052 for track in doc.tracks() {
3053 let entity_type = track
3054 .entity_type
3055 .as_ref()
3056 .map(|t| t.as_str())
3057 .unwrap_or("-");
3058 let signals: Vec<String> = track
3059 .signals
3060 .iter()
3061 .map(|s| format!("S{}", s.signal_id))
3062 .collect();
3063 let identity = doc
3064 .identity_for_track(track.id)
3065 .map(|i| format!("I{}", i.id))
3066 .unwrap_or_else(|| "-".to_string());
3067 let linked_badge = if track.identity_id.is_some() {
3068 r#"<span class="badge badge-y">y</span>"#
3069 } else {
3070 r#"<span class="badge badge-n">n</span>"#
3071 };
3072 html.push_str(&format!(
3073 r#"<tr data-tid="{tid}"><td class="id">T{tid}</td><td>{canonical_surface}</td><td>{etype}</td><td>{n}</td><td class="id">{sigs}</td><td class="id">{ident} {badge}</td></tr>"#,
3074 tid = track.id,
3075 canonical_surface = html_escape(&track.canonical_surface),
3076 etype = html_escape(entity_type),
3077 n = track.len(),
3078 sigs = html_escape(&signals.join(" ")),
3079 ident = identity,
3080 badge = linked_badge
3081 ));
3082 }
3083 html.push_str(r#"</table></div>"#);
3084
3085 html.push_str(r#"<div class="panel" id="panel-identities"><div class="panel-h"><h3>identities (level 3)</h3><span class="toggle" data-toggle="panel-identities">toggle</span></div><table>"#);
3087 html.push_str(r#"<tr><th>id</th><th>name</th><th>type</th><th>kb</th><th>kb_id</th><th>aliases</th></tr>"#);
3088 for identity in doc.identities() {
3089 let kb = identity.kb_name.as_deref().unwrap_or("-");
3090 let kb_id = identity.kb_id.as_deref().unwrap_or("-");
3091 let entity_type = identity
3092 .entity_type
3093 .as_ref()
3094 .map(|t| t.as_str())
3095 .unwrap_or("-");
3096 let aliases = if identity.aliases.is_empty() {
3097 "-".to_string()
3098 } else {
3099 identity.aliases.join(", ")
3100 };
3101 html.push_str(&format!(
3102 r#"<tr><td class="id">I{}</td><td>{}</td><td>{}</td><td class="kb">{}</td><td class="kb">{}</td><td>{}</td></tr>"#,
3103 identity.id, html_escape(&identity.canonical_name), entity_type, kb, kb_id, html_escape(&aliases)
3104 ));
3105 }
3106 html.push_str(r#"</table></div>"#);
3107
3108 html.push_str(r#"</div>"#); html.push_str(r#"<h2>hierarchy trace</h2><div class="panel"><table>"#);
3112 html.push_str(r#"<tr><th>signal</th><th></th><th>track</th><th></th><th>identity</th><th>kb_id</th></tr>"#);
3113 for signal in doc.signals() {
3114 let track = doc.track_for_signal(signal.id);
3115 let identity = doc.identity_for_signal(signal.id);
3116
3117 let track_str = track
3118 .map(|t| format!("T{} \"{}\"", t.id, html_escape(&t.canonical_surface)))
3119 .unwrap_or_else(|| "-".to_string());
3120 let identity_str = identity
3121 .map(|i| format!("I{} \"{}\"", i.id, html_escape(&i.canonical_name)))
3122 .unwrap_or_else(|| "-".to_string());
3123 let kb_str = identity
3124 .and_then(|i| i.kb_id.as_ref())
3125 .map(|s| s.as_str())
3126 .unwrap_or("-");
3127
3128 html.push_str(&format!(
3129 r#"<tr><td>S{} "{}"</td><td class="arrow">→</td><td>{}</td><td class="arrow">→</td><td>{}</td><td class="kb">{}</td></tr>"#,
3130 signal.id, html_escape(&signal.surface), track_str, identity_str, kb_str
3131 ));
3132 }
3133 html.push_str(r#"</table></div>"#);
3134
3135 html.push_str(r#"<script>
3138(() => {
3139 // Index signal metadata from the signals table, and map signal/track → text elements.
3140 const signalMeta = new Map();
3141 document.querySelectorAll('#signals-table tr[data-sid]').forEach((row) => {
3142 const sid = row.getAttribute('data-sid');
3143 if (!sid) return;
3144 signalMeta.set(sid, {
3145 sid,
3146 label: row.getAttribute('data-label') || '',
3147 surface: row.getAttribute('data-surface') || '',
3148 conf: row.getAttribute('data-conf') || '',
3149 start: row.getAttribute('data-start'),
3150 end: row.getAttribute('data-end'),
3151 track: row.getAttribute('data-track'),
3152 });
3153 });
3154
3155 const signalEls = new Map();
3156 const addSignalEl = (sid, el) => {
3157 if (!sid || !el) return;
3158 const arr = signalEls.get(sid) || [];
3159 arr.push(el);
3160 signalEls.set(sid, arr);
3161 };
3162 // Old-style inline spans (non-overlapping renderer).
3163 document.querySelectorAll('span.e[data-sid]').forEach((el) => {
3164 addSignalEl(el.getAttribute('data-sid'), el);
3165 });
3166 // Segmented spans (overlap/discontinuous-safe renderer).
3167 document.querySelectorAll('span.seg[data-sids]').forEach((el) => {
3168 const raw = (el.getAttribute('data-sids') || '').trim();
3169 if (!raw) return;
3170 raw.split(/\s+/).filter(Boolean).forEach((sid) => addSignalEl(sid, el));
3171 });
3172
3173 const trackEls = new Map();
3174 for (const [sid, els] of signalEls.entries()) {
3175 const meta = signalMeta.get(sid);
3176 const tid = meta ? meta.track : null;
3177 if (!tid) continue;
3178 const arr = trackEls.get(tid) || [];
3179 els.forEach((el) => arr.push(el));
3180 trackEls.set(tid, arr);
3181 }
3182
3183 const selectionBody = document.getElementById('selection-body');
3184 const selectionHint = document.getElementById('selection-hint');
3185 const defaultHint = selectionHint ? (selectionHint.textContent || '') : '';
3186 const setSelection = (text) => {
3187 if (!selectionBody) return;
3188 selectionBody.textContent = text;
3189 };
3190 const setHint = (text) => {
3191 if (!selectionHint) return;
3192 selectionHint.textContent = text || defaultHint;
3193 };
3194
3195 // Theme toggle: auto (prefers-color-scheme) → dark → light.
3196 const themeBtn = document.getElementById('theme-toggle');
3197 const themeKey = 'anno-theme';
3198 const applyTheme = (theme) => {
3199 const t = theme || 'auto';
3200 if (t === 'auto') {
3201 delete document.documentElement.dataset.theme;
3202 } else {
3203 document.documentElement.dataset.theme = t;
3204 }
3205 if (themeBtn) themeBtn.textContent = `theme: ${t}`;
3206 };
3207 const readTheme = () => {
3208 try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
3209 };
3210 const writeTheme = (t) => {
3211 try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
3212 };
3213 applyTheme(readTheme());
3214 if (themeBtn) {
3215 themeBtn.addEventListener('click', () => {
3216 const cur = readTheme();
3217 const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
3218 writeTheme(next);
3219 applyTheme(next);
3220 });
3221 }
3222
3223 let activeSignalEls = [];
3224 let activeSignalRow = null;
3225 const clearActive = () => {
3226 if (activeSignalEls && activeSignalEls.length) {
3227 activeSignalEls.forEach((el) => el.classList.remove('e-active'));
3228 }
3229 if (activeSignalRow) activeSignalRow.classList.remove('e-active');
3230 activeSignalEls = [];
3231 activeSignalRow = null;
3232 };
3233
3234 let activeTrack = null;
3235 let hoverTrack = null;
3236
3237 const removeTrackClass = (tid, cls) => {
3238 if (!tid) return;
3239 const els = trackEls.get(tid);
3240 if (!els) return;
3241 els.forEach((el) => el.classList.remove(cls));
3242 };
3243
3244 const addTrackClass = (tid, cls) => {
3245 if (!tid) return;
3246 const els = trackEls.get(tid);
3247 if (!els) return;
3248 els.forEach((el) => el.classList.add(cls));
3249 };
3250
3251 const trackSize = (tid) => {
3252 const els = tid ? trackEls.get(tid) : null;
3253 return els ? els.length : 0;
3254 };
3255
3256 const getTrackSelectionText = (tid) => {
3257 if (!tid) return 'track: - (untracked)';
3258 const row = document.querySelector(`#tracks-table tr[data-tid='${tid}']`);
3259 if (!row) return `track T${tid}`;
3260 const cells = row.querySelectorAll('td');
3261 const canonical = (cells[1]?.textContent || '').trim();
3262 const etype = (cells[2]?.textContent || '').trim();
3263 const count = (cells[3]?.textContent || '').trim();
3264 const sigs = (cells[4]?.textContent || '').trim();
3265 const lines = [];
3266 lines.push(`track T${tid} canonical="${canonical}" type="${etype}" mentions=${count}`);
3267 if (sigs) lines.push(`track signals: ${sigs}`);
3268 return lines.join('\n');
3269 };
3270
3271 const renderTrackSelection = (tid) => setSelection(getTrackSelectionText(tid));
3272
3273 const renderSignalSelectionBySid = (sid) => {
3274 const meta = signalMeta.get(sid);
3275 const label = meta ? (meta.label || '') : '';
3276 const conf = meta ? (meta.conf || '') : '';
3277 const start = meta ? meta.start : null;
3278 const end = meta ? meta.end : null;
3279 const tid = meta ? meta.track : null;
3280 const lines = [];
3281 if (start !== null && end !== null) {
3282 lines.push(`signal ${sid} label=${label} conf=${conf} span=[${start},${end})`);
3283 } else {
3284 lines.push(`signal ${sid} label=${label} conf=${conf}`);
3285 }
3286 if (meta && meta.surface) lines.push(`surface: ${meta.surface}`);
3287 lines.push('');
3288 lines.push(getTrackSelectionText(tid));
3289 setSelection(lines.join('\n'));
3290 };
3291
3292 const setActiveTrack = (tid) => {
3293 const next = tid || null;
3294 if (activeTrack === next) return;
3295 removeTrackClass(activeTrack, 'e-track');
3296 activeTrack = next;
3297 if (activeTrack) addTrackClass(activeTrack, 'e-track');
3298 if (hoverTrack && activeTrack && hoverTrack === activeTrack) {
3299 removeTrackClass(hoverTrack, 'e-track-hover');
3300 }
3301 };
3302
3303 const setHoverTrack = (tid) => {
3304 const next = tid || null;
3305 if (hoverTrack === next) return;
3306 removeTrackClass(hoverTrack, 'e-track-hover');
3307 hoverTrack = next;
3308 if (!hoverTrack) {
3309 setHint('');
3310 return;
3311 }
3312 if (activeTrack && hoverTrack === activeTrack) {
3313 setHint(`selected track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3314 return;
3315 }
3316 addTrackClass(hoverTrack, 'e-track-hover');
3317 setHint(`hover track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3318 };
3319
3320 const emitToParentSpan = (start, end) => {
3321 try {
3322 if (!window.parent || window.parent === window) return;
3323 if (start === null || end === null) return;
3324 window.parent.postMessage({ type: 'anno:activate-span', start: Number(start), end: Number(end) }, '*');
3325 } catch (_) {
3326 // ignore: best-effort bridge for iframe containers
3327 }
3328 };
3329
3330 const activateBySpan = (start, end, emit) => {
3331 if (start === null || end === null || start === undefined || end === undefined) return;
3332 // Prefer an exact signal span if present; otherwise fall back to the table row metadata.
3333 const el = document.querySelector(`span.e[data-sid][data-start='${start}'][data-end='${end}']`);
3334 if (el) {
3335 const sid = el.getAttribute('data-sid');
3336 if (sid) activateSignal(sid, emit);
3337 return;
3338 }
3339 const row = document.querySelector(`#signals-table tr[data-start='${start}'][data-end='${end}']`);
3340 if (!row) return;
3341 const sid = row.getAttribute('data-sid');
3342 if (!sid) return;
3343 activateSignal(sid, emit);
3344 };
3345
3346 const activateSignal = (sid, emit) => {
3347 clearActive();
3348 const els = signalEls.get(sid) || [];
3349 if (!els.length) return;
3350 els.forEach((el) => el.classList.add('e-active'));
3351 activeSignalEls = els;
3352 const row = document.querySelector(`#signals-table tr[data-sid='${sid}']`);
3353 if (row) {
3354 row.classList.add('e-active');
3355 activeSignalRow = row;
3356 }
3357 const primaryEl = els[0];
3358 primaryEl.scrollIntoView({ block: 'center', behavior: 'smooth' });
3359 const meta = signalMeta.get(sid);
3360 const tid = meta ? meta.track : primaryEl.getAttribute('data-track');
3361 setActiveTrack(tid);
3362 renderSignalSelectionBySid(sid);
3363 if (emit && meta && meta.start !== null && meta.end !== null) {
3364 emitToParentSpan(meta.start, meta.end);
3365 }
3366 };
3367
3368 // Table click
3369 const signalsTable = document.getElementById('signals-table');
3370 if (signalsTable) {
3371 signalsTable.addEventListener('click', (ev) => {
3372 const a = ev.target && ev.target.closest ? ev.target.closest("a[href^='#S']") : null;
3373 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3374 const sid = (a && a.getAttribute('href') ? a.getAttribute('href').slice(1) : null) || (row ? row.getAttribute('data-sid') : null);
3375 if (!sid) return;
3376 ev.preventDefault();
3377 activateSignal(sid, true);
3378 history.replaceState(null, '', '#' + sid);
3379 });
3380
3381 // Hover a signals row → preview track highlight
3382 signalsTable.addEventListener('mouseover', (ev) => {
3383 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3384 if (!row) return;
3385 const tid = row.getAttribute('data-track');
3386 setHoverTrack(tid);
3387 });
3388 signalsTable.addEventListener('mouseout', (ev) => {
3389 const to = ev.relatedTarget;
3390 if (to && signalsTable.contains(to)) return;
3391 setHoverTrack(null);
3392 });
3393 }
3394
3395 // Clicking an inline entity should also toggle active highlight.
3396 const pickPrimarySid = (el) => {
3397 if (!el) return null;
3398 const p = el.getAttribute('data-primary');
3399 if (p) return p;
3400 const raw = (el.getAttribute('data-sids') || '').trim();
3401 if (!raw) return null;
3402 const sids = raw.split(/\s+/).filter(Boolean);
3403 if (!sids.length) return null;
3404 // Prefer the shortest mention span from metadata.
3405 let best = sids[0];
3406 let bestLen = null;
3407 for (const sid of sids) {
3408 const meta = signalMeta.get(sid);
3409 const s = meta && meta.start !== null ? Number(meta.start) : null;
3410 const e = meta && meta.end !== null ? Number(meta.end) : null;
3411 const len = (s !== null && e !== null) ? (e - s) : null;
3412 if (len === null) continue;
3413 if (bestLen === null || len < bestLen) {
3414 best = sid;
3415 bestLen = len;
3416 }
3417 }
3418 return best;
3419 };
3420
3421 document.addEventListener('click', (ev) => {
3422 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3423 if (span) {
3424 activateSignal(span.getAttribute('data-sid'), true);
3425 return;
3426 }
3427 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3428 if (!seg) return;
3429 activateSignal(pickPrimarySid(seg), true);
3430 });
3431
3432 // Hover an inline entity → preview highlight its track
3433 document.addEventListener('mouseover', (ev) => {
3434 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3435 if (span) {
3436 setHoverTrack(span.getAttribute('data-track'));
3437 return;
3438 }
3439 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3440 if (!seg) return;
3441 const sid = pickPrimarySid(seg);
3442 const meta = sid ? signalMeta.get(sid) : null;
3443 setHoverTrack(meta ? meta.track : null);
3444 });
3445 document.addEventListener('mouseout', (ev) => {
3446 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3447 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3448 if (!span && !seg) return;
3449 const to = ev.relatedTarget;
3450 if (to && to.closest && (to.closest('span.e[data-sid]') || to.closest('span.seg[data-sids]'))) return;
3451 setHoverTrack(null);
3452 });
3453
3454 // Clicking a track row → select track (highlight + details)
3455 const tracksTable = document.getElementById('tracks-table');
3456 if (tracksTable) {
3457 tracksTable.addEventListener('click', (ev) => {
3458 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3459 if (!row) return;
3460 const tid = row.getAttribute('data-tid');
3461 setActiveTrack(tid);
3462 renderTrackSelection(tid);
3463 });
3464 tracksTable.addEventListener('mouseover', (ev) => {
3465 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3466 if (!row) return;
3467 setHoverTrack(row.getAttribute('data-tid'));
3468 });
3469 tracksTable.addEventListener('mouseout', (ev) => {
3470 const to = ev.relatedTarget;
3471 if (to && tracksTable.contains(to)) return;
3472 setHoverTrack(null);
3473 });
3474 }
3475
3476 // Filter
3477 const input = document.getElementById('signal-filter');
3478 const countEl = document.getElementById('signal-filter-count');
3479 if (input && signalsTable) {
3480 const update = () => {
3481 const q = (input.value || '').trim().toLowerCase();
3482 let shown = 0;
3483 const rows = signalsTable.querySelectorAll('tr[data-sid]');
3484 rows.forEach(row => {
3485 const sid = (row.getAttribute('data-sid') || '').toLowerCase();
3486 const label = (row.getAttribute('data-label') || '').toLowerCase();
3487 const surface = (row.getAttribute('data-surface') || '').toLowerCase();
3488 const ok = !q || sid.includes(q) || label.includes(q) || surface.includes(q);
3489 row.style.display = ok ? '' : 'none';
3490 if (ok) shown += 1;
3491 });
3492 if (countEl) countEl.textContent = shown + ' shown';
3493 };
3494 input.addEventListener('input', update);
3495 update();
3496 }
3497
3498 // Panel toggles
3499 document.querySelectorAll('[data-toggle]').forEach(btn => {
3500 btn.addEventListener('click', () => {
3501 const id = btn.getAttribute('data-toggle');
3502 const panel = id ? document.getElementById(id) : null;
3503 if (!panel) return;
3504 panel.classList.toggle('panel-collapsed');
3505 });
3506 });
3507
3508 // If URL hash is #S123, focus it.
3509 const hash = (location.hash || '').slice(1);
3510 if (hash && hash.startsWith('S')) activateSignal(hash, false);
3511
3512 // Optional: allow parent pages (e.g., dataset explorers) to sync selection across iframes.
3513 window.addEventListener('message', (ev) => {
3514 const data = ev && ev.data ? ev.data : null;
3515 if (!data || data.type !== 'anno:activate-span') return;
3516 if (typeof data.start !== 'number' || typeof data.end !== 'number') return;
3517 activateBySpan(data.start, data.end, false);
3518 });
3519})();
3520</script>"#);
3521
3522 html.push_str(r#"</body></html>"#);
3523 html
3524}
3525
/// Escapes the characters that are significant in HTML text and in
/// double-quoted attribute values.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and `&quot;`
/// respectively; every other character is copied through unchanged. Single
/// quotes are not escaped, so the result must only be placed in text content
/// or inside double-quoted attributes.
///
/// The input is scanned once, so an `&` that is produced by the escaping
/// itself is never escaped a second time.
fn html_escape(s: &str) -> String {
    // Escaping can only grow the string, so the input length is a lower bound.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3532
3533fn annotate_text_html(
3534 text: &str,
3535 signals: &[Signal<Location>],
3536 signal_to_track: &std::collections::HashMap<SignalId, TrackId>,
3537) -> String {
3538 let char_count = text.chars().count();
3539 if char_count == 0 {
3540 return String::new();
3541 }
3542
3543 #[derive(Debug, Clone)]
3544 struct SigMeta {
3545 sid: String,
3546 label: String,
3547 conf: f32,
3548 track_id: Option<TrackId>,
3549 covered_len: usize,
3550 }
3551
3552 #[derive(Debug, Clone)]
3553 struct Event {
3554 pos: usize,
3555 meta_idx: usize,
3556 delta: i32, }
3558
3559 let mut metas: Vec<SigMeta> = Vec::new();
3561 let mut events: Vec<Event> = Vec::new();
3562 let mut boundaries: Vec<usize> = vec![0, char_count];
3563
3564 for s in signals {
3565 let raw_segments: Vec<(usize, usize)> = match &s.location {
3566 Location::Text { start, end } => vec![(*start, *end)],
3567 Location::TextWithBbox { start, end, .. } => vec![(*start, *end)],
3568 Location::Discontinuous { segments } => segments.clone(),
3569 _ => Vec::new(),
3570 };
3571 if raw_segments.is_empty() {
3572 continue;
3573 }
3574
3575 let mut cleaned: Vec<(usize, usize)> = Vec::new();
3576 let mut covered_len = 0usize;
3577 for (start, end) in raw_segments {
3578 let start = start.min(char_count);
3579 let end = end.min(char_count);
3580 if start >= end {
3581 continue;
3582 }
3583 covered_len = covered_len.saturating_add(end - start);
3584 cleaned.push((start, end));
3585 }
3586 if cleaned.is_empty() {
3587 continue;
3588 }
3589
3590 let meta_idx = metas.len();
3591 let track_id = signal_to_track.get(&s.id).copied();
3592 metas.push(SigMeta {
3593 sid: format!("S{}", s.id),
3594 label: s.label.to_string(),
3595 conf: s.confidence,
3596 track_id,
3597 covered_len,
3598 });
3599
3600 for (start, end) in cleaned {
3601 boundaries.push(start);
3602 boundaries.push(end);
3603 events.push(Event {
3604 pos: start,
3605 meta_idx,
3606 delta: 1,
3607 });
3608 events.push(Event {
3609 pos: end,
3610 meta_idx,
3611 delta: -1,
3612 });
3613 }
3614 }
3615
3616 if metas.is_empty() {
3617 return html_escape(text);
3618 }
3619
3620 boundaries.sort_unstable();
3621 boundaries.dedup();
3622 events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
3623
3624 let mut active_counts: Vec<u32> = vec![0; metas.len()];
3625 let mut active: Vec<usize> = Vec::new();
3626 let mut ev_idx = 0usize;
3627
3628 let mut result = String::new();
3629
3630 for bi in 0..boundaries.len().saturating_sub(1) {
3631 let pos = boundaries[bi];
3632 while ev_idx < events.len() && events[ev_idx].pos == pos {
3634 let e = &events[ev_idx];
3635 let idx = e.meta_idx;
3636 if e.delta < 0 {
3637 if active_counts[idx] > 0 {
3638 active_counts[idx] -= 1;
3639 if active_counts[idx] == 0 {
3640 active.retain(|&x| x != idx);
3641 }
3642 }
3643 } else {
3644 active_counts[idx] += 1;
3645 if active_counts[idx] == 1 {
3646 active.push(idx);
3647 }
3648 }
3649 ev_idx += 1;
3650 }
3651
3652 let next = boundaries[bi + 1];
3653 if next <= pos {
3654 continue;
3655 }
3656
3657 let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
3658 if active.is_empty() {
3659 result.push_str(&html_escape(&seg_text));
3660 continue;
3661 }
3662
3663 let primary_idx = active
3665 .iter()
3666 .copied()
3667 .min_by(|a, b| {
3668 metas[*a]
3669 .covered_len
3670 .cmp(&metas[*b].covered_len)
3671 .then_with(|| {
3672 metas[*b]
3673 .conf
3674 .partial_cmp(&metas[*a].conf)
3675 .unwrap_or(std::cmp::Ordering::Equal)
3676 })
3677 })
3678 .unwrap_or(active[0]);
3679 let primary = &metas[primary_idx];
3680
3681 let class = match primary.label.to_uppercase().as_str() {
3682 "PER" | "PERSON" => "e-per",
3683 "ORG" | "ORGANIZATION" | "COMPANY" => "e-org",
3684 "LOC" | "LOCATION" | "GPE" => "e-loc",
3685 "DATE" | "TIME" => "e-date",
3686 _ => "e-misc",
3687 };
3688
3689 let mut sids: Vec<&str> = active.iter().map(|i| metas[*i].sid.as_str()).collect();
3690 sids.sort_unstable();
3691 let data_sids = sids.join(" ");
3692
3693 let mut title = format!(
3694 "sids=[{}] primary={} [{}..{})",
3695 data_sids, primary.sid, pos, next
3696 );
3697 if let Some(t) = primary.track_id {
3698 title.push_str(&format!(" track=T{}", t));
3699 }
3700
3701 result.push_str(&format!(
3702 r#"<span class="e seg {class}" data-sids="{sids}" data-start="{start}" data-end="{end}" data-primary="{primary}" title="{title}">{text}</span>"#,
3703 class = class,
3704 sids = html_escape(&data_sids),
3705 start = pos,
3706 end = next,
3707 primary = html_escape(&primary.sid),
3708 title = html_escape(&title),
3709 text = html_escape(&seg_text),
3710 ));
3711 }
3712
3713 result
3714}
3715
/// Result of comparing predicted signals against gold signals for one text.
///
/// Built by `EvalComparison::compare`, which stores its inputs together with
/// one `EvalMatch` entry per classified gold/predicted signal.
#[derive(Debug, Clone)]
pub struct EvalComparison {
    /// The text the signals refer to (an owned copy of the `compare` input).
    pub text: String,
    /// Gold (reference) signals, as passed to `compare`.
    pub gold: Vec<Signal<Location>>,
    /// Predicted signals, as passed to `compare`.
    pub predicted: Vec<Signal<Location>>,
    /// Classification of every pairing and every unmatched signal.
    pub matches: Vec<EvalMatch>,
}
3732
/// Outcome of aligning one predicted and/or gold signal in an `EvalComparison`.
#[derive(Debug, Clone)]
pub enum EvalMatch {
    /// Prediction and gold signal have identical text offsets and equal labels.
    Correct {
        /// Id of the matched gold signal.
        gold_id: SignalId,
        /// Id of the matched predicted signal.
        pred_id: SignalId,
    },
    /// Prediction and gold signal have identical text offsets but different labels.
    TypeMismatch {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
        /// Label of the gold signal, rendered as a string.
        gold_label: String,
        /// Label of the predicted signal, rendered as a string.
        pred_label: String,
    },
    /// Prediction and gold signal overlap but their text offsets are not identical.
    BoundaryError {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
        /// Intersection-over-union of the two locations (0.0 when the location
        /// type cannot provide one).
        iou: f64,
    },
    /// A prediction that was not paired with any gold signal (false positive).
    Spurious {
        /// Id of the unmatched predicted signal.
        pred_id: SignalId,
    },
    /// A gold signal that was not paired with any prediction (false negative).
    Missed {
        /// Id of the unmatched gold signal.
        gold_id: SignalId,
    },
}
3774
3775impl EvalComparison {
3776 #[must_use]
3796 pub fn compare(
3797 text: &str,
3798 gold: Vec<Signal<Location>>,
3799 predicted: Vec<Signal<Location>>,
3800 ) -> Self {
3801 let mut matches = Vec::new();
3802 let mut gold_matched = vec![false; gold.len()];
3803 let mut pred_matched = vec![false; predicted.len()];
3804
3805 for (pi, pred) in predicted.iter().enumerate() {
3807 let pred_offsets = match pred.location.text_offsets() {
3808 Some(o) => o,
3809 None => continue,
3810 };
3811
3812 for (gi, g) in gold.iter().enumerate() {
3813 if gold_matched[gi] {
3814 continue;
3815 }
3816 let gold_offsets = match g.location.text_offsets() {
3817 Some(o) => o,
3818 None => continue,
3819 };
3820
3821 if pred_offsets == gold_offsets {
3823 if pred.label == g.label {
3824 matches.push(EvalMatch::Correct {
3825 gold_id: g.id,
3826 pred_id: pred.id,
3827 });
3828 } else {
3829 matches.push(EvalMatch::TypeMismatch {
3830 gold_id: g.id,
3831 pred_id: pred.id,
3832 gold_label: g.label.to_string(),
3833 pred_label: pred.label.to_string(),
3834 });
3835 }
3836 gold_matched[gi] = true;
3837 pred_matched[pi] = true;
3838 break;
3839 }
3840 }
3841 }
3842
3843 for (pi, pred) in predicted.iter().enumerate() {
3845 if pred_matched[pi] {
3846 continue;
3847 }
3848 let pred_offsets = match pred.location.text_offsets() {
3849 Some(o) => o,
3850 None => continue,
3851 };
3852
3853 for (gi, g) in gold.iter().enumerate() {
3854 if gold_matched[gi] {
3855 continue;
3856 }
3857 let gold_offsets = match g.location.text_offsets() {
3858 Some(o) => o,
3859 None => continue,
3860 };
3861
3862 if pred_offsets.0 < gold_offsets.1 && pred_offsets.1 > gold_offsets.0 {
3864 let iou = pred.location.iou(&g.location).unwrap_or(0.0);
3865 matches.push(EvalMatch::BoundaryError {
3866 gold_id: g.id,
3867 pred_id: pred.id,
3868 iou,
3869 });
3870 gold_matched[gi] = true;
3871 pred_matched[pi] = true;
3872 break;
3873 }
3874 }
3875 }
3876
3877 for (pi, pred) in predicted.iter().enumerate() {
3879 if !pred_matched[pi] {
3880 matches.push(EvalMatch::Spurious { pred_id: pred.id });
3881 }
3882 }
3883
3884 for (gi, g) in gold.iter().enumerate() {
3886 if !gold_matched[gi] {
3887 matches.push(EvalMatch::Missed { gold_id: g.id });
3888 }
3889 }
3890
3891 Self {
3892 text: text.to_string(),
3893 gold,
3894 predicted,
3895 matches,
3896 }
3897 }
3898
3899 #[must_use]
3901 pub fn correct_count(&self) -> usize {
3902 self.matches
3903 .iter()
3904 .filter(|m| matches!(m, EvalMatch::Correct { .. }))
3905 .count()
3906 }
3907
3908 #[must_use]
3910 pub fn error_count(&self) -> usize {
3911 self.matches.len() - self.correct_count()
3912 }
3913
3914 #[must_use]
3916 pub fn precision(&self) -> f64 {
3917 if self.predicted.is_empty() {
3918 0.0
3919 } else {
3920 self.correct_count() as f64 / self.predicted.len() as f64
3921 }
3922 }
3923
3924 #[must_use]
3926 pub fn recall(&self) -> f64 {
3927 if self.gold.is_empty() {
3928 0.0
3929 } else {
3930 self.correct_count() as f64 / self.gold.len() as f64
3931 }
3932 }
3933
3934 #[must_use]
3936 pub fn f1(&self) -> f64 {
3937 let p = self.precision();
3938 let r = self.recall();
3939 if p + r > 0.0 {
3940 2.0 * p * r / (p + r)
3941 } else {
3942 0.0
3943 }
3944 }
3945}
3946
3947pub fn render_eval_html(cmp: &EvalComparison) -> String {
3951 render_eval_html_with_title(cmp, "eval comparison")
3952}
3953
3954#[must_use]
3958pub fn render_eval_html_with_title(cmp: &EvalComparison, title: &str) -> String {
3959 let mut html = String::new();
3960 let title = html_escape(title);
3961
3962 html.push_str(
3963 r#"<!DOCTYPE html>
3964<html>
3965<head>
3966<meta charset="UTF-8">
3967<meta name="color-scheme" content="dark light">
3968"#,
3969 );
3970 html.push_str(&format!("<title>{}</title>", title));
3971 html.push_str(r#"
3972:root{
3973 color-scheme: light dark;
3974 --bg:#0a0a0a;
3975 --panel-bg:#0d0d0d;
3976 --text:#b0b0b0;
3977 --text-strong:#fff;
3978 --muted:#666;
3979 --border:#222;
3980 --border-strong:#333;
3981 --hover:#111;
3982 --input-bg:#080808;
3983 --active:#ddd;
3984 /* Eval entity colors (dark) */
3985 --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3986 --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3987 /* Match row borders */
3988 --m-ok:#4a8a4a;
3989 --m-type:#8a8a4a;
3990 --m-bound:#4a8a8a;
3991 --m-fp:#8a4a4a;
3992 --m-fn:#8a4a8a;
3993}
3994@media (prefers-color-scheme: light){
3995 :root{
3996 --bg:#ffffff;
3997 --panel-bg:#f7f7f7;
3998 --text:#222;
3999 --text-strong:#000;
4000 --muted:#555;
4001 --border:#d6d6d6;
4002 --border-strong:#c6c6c6;
4003 --hover:#f0f0f0;
4004 --input-bg:#ffffff;
4005 --active:#000;
4006 --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
4007 --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
4008 --m-ok:#2f8a2f;
4009 --m-type:#8a7a2f;
4010 --m-bound:#2f7a8a;
4011 --m-fp:#8a2f2f;
4012 --m-fn:#6a2f8a;
4013 }
4014}
4015html[data-theme='dark']{
4016 --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
4017 --muted:#666; --border:#222; --border-strong:#333; --hover:#111; --input-bg:#080808; --active:#ddd;
4018 --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
4019 --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
4020 --m-ok:#4a8a4a; --m-type:#8a8a4a; --m-bound:#4a8a8a; --m-fp:#8a4a4a; --m-fn:#8a4a8a;
4021}
4022html[data-theme='light']{
4023 --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
4024 --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0; --input-bg:#ffffff; --active:#000;
4025 --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
4026 --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
4027 --m-ok:#2f8a2f; --m-type:#8a7a2f; --m-bound:#2f7a8a; --m-fp:#8a2f2f; --m-fn:#6a2f8a;
4028}
4029
4030<style>
4031*{box-sizing:border-box;margin:0;padding:0}
4032body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
4033h1,h2{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
4034h1{font-size:14px}h2{font-size:12px}
4035table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
4036th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
4037th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
4038tr:hover{background:var(--hover)}
4039.grid{display:grid;grid-template-columns:1fr 1fr;gap:8px}
4040.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
4041.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
4042.stats{display:flex;gap:24px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
4043.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
4044/* Entities */
4045.e{padding:1px 2px;border-bottom:2px solid}
4046.seg{cursor:pointer}
4047.e-gold{background:var(--gold-bg);border-color:var(--gold-br);color:var(--gold-tx)}
4048.e-pred{background:var(--pred-bg);border-color:var(--pred-br);color:var(--pred-tx)}
4049.e-active{outline:1px solid var(--active);outline-offset:1px}
4050/* Match types */
4051.correct{background:#1a2e1a;border-color:#4a8a4a}
4052.type-err{background:#2e2e1a;border-color:#8a8a4a}
4053.boundary{background:#1a2e2e;border-color:#4a8a8a}
4054.spurious{background:#2e1a1a;border-color:#8a4a4a}
4055.missed{background:#2e1a2e;border-color:#8a4a8a}
4056.match-row.correct{border-left:3px solid var(--m-ok)}
4057.match-row.type-err{border-left:3px solid var(--m-type)}
4058.match-row.boundary{border-left:3px solid var(--m-bound)}
4059.match-row.spurious{border-left:3px solid var(--m-fp)}
4060.match-row.missed{border-left:3px solid var(--m-fn)}
4061.match-row.active{outline:1px solid var(--muted)}
4062.sel{color:var(--muted);margin:6px 0 12px}
4063.metric{font-size:14px;color:var(--muted)}.metric b{color:var(--text-strong)}
4064</style>
4065</head>
4066<body>
4067"#);
4068
4069 html.push_str(&format!(
4071 "<div class=\"panel-h\" style=\"justify-content:space-between\"><h1>{}</h1><span class=\"toggle\" id=\"theme-toggle\" title=\"toggle theme (auto → dark → light)\">theme: auto</span></div>",
4072 title
4073 ));
4074
4075 html.push_str("<div class=\"stats\">");
4077 html.push_str(&format!(
4078 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">gold</div></div>",
4079 cmp.gold.len()
4080 ));
4081 html.push_str(&format!(
4082 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">predicted</div></div>",
4083 cmp.predicted.len()
4084 ));
4085 html.push_str(&format!(
4086 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">correct</div></div>",
4087 cmp.correct_count()
4088 ));
4089 html.push_str(&format!(
4090 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">errors</div></div>",
4091 cmp.error_count()
4092 ));
4093 html.push_str(&format!(
4094 "<div class=\"metric\">P=<b>{:.1}%</b> R=<b>{:.1}%</b> F1=<b>{:.1}%</b></div>",
4095 cmp.precision() * 100.0,
4096 cmp.recall() * 100.0,
4097 cmp.f1() * 100.0
4098 ));
4099 html.push_str("</div>");
4100
4101 html.push_str("<div id=\"selection\" class=\"sel\">click a match row to select spans</div>");
4103
4104 html.push_str("<div class=\"grid\">");
4106
4107 html.push_str("<div class=\"panel\"><h2>gold (ground truth)</h2><div class=\"text-box\">");
4109 let gold_spans: Vec<EvalHtmlSpan> = cmp
4110 .gold
4111 .iter()
4112 .map(|s| {
4113 let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
4114 EvalHtmlSpan {
4115 start,
4116 end,
4117 label: s.label.to_string(),
4118 class: "e-gold",
4119 id: format!("G{}", s.id),
4120 }
4121 })
4122 .collect();
4123 html.push_str(&annotate_text_spans(&cmp.text, &gold_spans));
4124 html.push_str("</div></div>");
4125
4126 html.push_str("<div class=\"panel\"><h2>predicted</h2><div class=\"text-box\">");
4128 let pred_spans: Vec<EvalHtmlSpan> = cmp
4129 .predicted
4130 .iter()
4131 .map(|s| {
4132 let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
4133 EvalHtmlSpan {
4134 start,
4135 end,
4136 label: s.label.to_string(),
4137 class: "e-pred",
4138 id: format!("P{}", s.id),
4139 }
4140 })
4141 .collect();
4142 html.push_str(&annotate_text_spans(&cmp.text, &pred_spans));
4143 html.push_str("</div></div>");
4144
4145 html.push_str("</div>");
4146
4147 html.push_str("<h2>matches</h2><table>");
4149 html.push_str("<tr><th>type</th><th>gold</th><th>predicted</th><th>notes</th></tr>");
4150
4151 for (mi, m) in cmp.matches.iter().enumerate() {
4152 let (class, mtype, gold_text, pred_text, notes, gid, pid) = match m {
4153 EvalMatch::Correct { gold_id, pred_id } => {
4154 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4155 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4156 (
4157 "correct",
4158 "✓",
4159 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4160 .unwrap_or_default(),
4161 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4162 .unwrap_or_default(),
4163 String::new(),
4164 Some(format!("G{}", gold_id)),
4165 Some(format!("P{}", pred_id)),
4166 )
4167 }
4168 EvalMatch::TypeMismatch {
4169 gold_id,
4170 pred_id,
4171 gold_label,
4172 pred_label,
4173 } => {
4174 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4175 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4176 (
4177 "type-err",
4178 "type",
4179 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4180 .unwrap_or_default(),
4181 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4182 .unwrap_or_default(),
4183 format!("{} → {}", gold_label, pred_label),
4184 Some(format!("G{}", gold_id)),
4185 Some(format!("P{}", pred_id)),
4186 )
4187 }
4188 EvalMatch::BoundaryError {
4189 gold_id,
4190 pred_id,
4191 iou,
4192 } => {
4193 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4194 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4195 (
4196 "boundary",
4197 "bound",
4198 g.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4199 .unwrap_or_default(),
4200 p.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4201 .unwrap_or_default(),
4202 format!("IoU={:.2}", iou),
4203 Some(format!("G{}", gold_id)),
4204 Some(format!("P{}", pred_id)),
4205 )
4206 }
4207 EvalMatch::Spurious { pred_id } => {
4208 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4209 (
4210 "spurious",
4211 "FP",
4212 String::new(),
4213 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4214 .unwrap_or_default(),
4215 "false positive".to_string(),
4216 None,
4217 Some(format!("P{}", pred_id)),
4218 )
4219 }
4220 EvalMatch::Missed { gold_id } => {
4221 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4222 (
4223 "missed",
4224 "FN",
4225 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4226 .unwrap_or_default(),
4227 String::new(),
4228 "false negative".to_string(),
4229 Some(format!("G{}", gold_id)),
4230 None,
4231 )
4232 }
4233 };
4234
4235 let mut data_attrs = String::new();
4236 if let Some(gid) = gid.as_deref() {
4237 data_attrs.push_str(&format!(" data-gid=\"{}\"", html_escape(gid)));
4238 }
4239 if let Some(pid) = pid.as_deref() {
4240 data_attrs.push_str(&format!(" data-pid=\"{}\"", html_escape(pid)));
4241 }
4242
4243 html.push_str(&format!(
4244 "<tr id=\"M{mid}\" class=\"match-row {class}\"{attrs}><td><a class=\"match-link\" href=\"#M{mid}\">{mtype}</a></td><td>{gold}</td><td>{pred}</td><td>{notes}</td></tr>",
4245 mid = mi,
4246 class = class,
4247 attrs = data_attrs,
4248 mtype = html_escape(mtype),
4249 gold = html_escape(&gold_text),
4250 pred = html_escape(&pred_text),
4251 notes = html_escape(¬es)
4252 ));
4253 }
4254 html.push_str("</table>");
4255
4256 html.push_str(
4257 r#"<script>
4258(() => {
4259 // Theme toggle: auto (prefers-color-scheme) → dark → light.
4260 const themeBtn = document.getElementById('theme-toggle');
4261 const themeKey = 'anno-theme';
4262 const applyTheme = (theme) => {
4263 const t = theme || 'auto';
4264 if (t === 'auto') {
4265 delete document.documentElement.dataset.theme;
4266 } else {
4267 document.documentElement.dataset.theme = t;
4268 }
4269 if (themeBtn) themeBtn.textContent = `theme: ${t}`;
4270 };
4271 const readTheme = () => {
4272 try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
4273 };
4274 const writeTheme = (t) => {
4275 try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
4276 };
4277 applyTheme(readTheme());
4278 if (themeBtn) {
4279 themeBtn.addEventListener('click', () => {
4280 const cur = readTheme();
4281 const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
4282 writeTheme(next);
4283 applyTheme(next);
4284 });
4285 }
4286
4287 function clearActive() {
4288 document.querySelectorAll(".e-active").forEach((el) => el.classList.remove("e-active"));
4289 document.querySelectorAll("tr.match-row.active").forEach((el) => el.classList.remove("active"));
4290 }
4291
4292 function findSpanEls(eid) {
4293 if (!eid) return [];
4294 // New segmented renderer: one span can be split across multiple elements.
4295 const els = Array.from(document.querySelectorAll(`span.e[data-eids~='${eid}']`));
4296 if (els.length) return els;
4297 // Back-compat: older HTML used a single element id.
4298 const single = document.getElementById(eid);
4299 return single ? [single] : [];
4300 }
4301
4302 function activate(gid, pid, row) {
4303 clearActive();
4304 const gEls = findSpanEls(gid);
4305 const pEls = findSpanEls(pid);
4306 const sel = document.getElementById("selection");
4307 gEls.forEach((el) => el.classList.add("e-active"));
4308 pEls.forEach((el) => el.classList.add("e-active"));
4309 if (row) row.classList.add("active");
4310 if (sel) {
4311 const parts = [];
4312 if (gEls.length) {
4313 const lbl = gEls[0].dataset && gEls[0].dataset.label ? ` [${gEls[0].dataset.label}]` : "";
4314 parts.push(`gold ${gid}${lbl}`);
4315 }
4316 if (pEls.length) {
4317 const lbl = pEls[0].dataset && pEls[0].dataset.label ? ` [${pEls[0].dataset.label}]` : "";
4318 parts.push(`pred ${pid}${lbl}`);
4319 }
4320 sel.textContent = parts.length ? parts.join(" | ") : "no selection";
4321 }
4322 if (row && row.id) {
4323 // Keep deep links stable without triggering navigation jump.
4324 // NOTE: single quotes avoid the Rust raw-string delimiter issue with quote+hash.
4325 history.replaceState(null, "", '#' + row.id);
4326 }
4327 const target = gEls[0] || pEls[0];
4328 if (target) target.scrollIntoView({ behavior: "smooth", block: "center" });
4329 }
4330
4331 document.querySelectorAll("tr.match-row[data-gid], tr.match-row[data-pid]").forEach((tr) => {
4332 tr.addEventListener("click", () => activate(tr.dataset.gid, tr.dataset.pid, tr));
4333 });
4334
4335 document.querySelectorAll("a.match-link").forEach((a) => {
4336 a.addEventListener("click", (ev) => {
4337 ev.preventDefault();
4338 const tr = a.closest("tr.match-row");
4339 if (!tr) return;
4340 activate(tr.dataset.gid, tr.dataset.pid, tr);
4341 });
4342 });
4343
4344 // Auto-select a match row if the URL has a deep link (e.g. #M12).
4345 const hash = (location.hash || "").slice(1);
4346 if (hash && hash.startsWith("M")) {
4347 const tr = document.getElementById(hash);
4348 if (tr && tr.classList && tr.classList.contains("match-row")) {
4349 activate(tr.dataset.gid, tr.dataset.pid, tr);
4350 }
4351 }
4352})();
4353</script>"#,
4354 );
4355
4356 html.push_str("</body></html>");
4357 html
4358}
4359
/// A labelled character range to highlight when rendering evaluation HTML.
///
/// Consumed by `annotate_text_spans`, which clamps `start`/`end` to the
/// character count of the text and skips spans that end up empty.
#[derive(Debug, Clone)]
struct EvalHtmlSpan {
    /// Start offset in characters (not bytes), inclusive.
    start: usize,
    /// End offset in characters (not bytes), exclusive.
    end: usize,
    /// Label written to `data-label` when this span is the primary span of a segment.
    label: String,
    /// CSS class added to the segment's `class` attribute when this span is primary.
    class: &'static str,
    /// Identifier listed in the `data-eids` attribute of every segment this span covers.
    id: String,
}
4369
4370fn annotate_text_spans(text: &str, spans: &[EvalHtmlSpan]) -> String {
4371 let char_count = text.chars().count();
4372 if char_count == 0 || spans.is_empty() {
4373 return html_escape(text);
4374 }
4375
4376 #[derive(Debug, Clone)]
4377 struct Meta {
4378 id: String,
4379 label: String,
4380 class: &'static str,
4381 len: usize,
4382 }
4383 #[derive(Debug, Clone)]
4384 struct Event {
4385 pos: usize,
4386 meta_idx: usize,
4387 delta: i32,
4388 }
4389
4390 let mut metas: Vec<Meta> = Vec::with_capacity(spans.len());
4391 let mut events: Vec<Event> = Vec::new();
4392 let mut boundaries: Vec<usize> = vec![0, char_count];
4393
4394 for s in spans {
4395 let start = s.start.min(char_count);
4396 let end = s.end.min(char_count);
4397 if start >= end {
4398 continue;
4399 }
4400 let meta_idx = metas.len();
4401 metas.push(Meta {
4402 id: s.id.clone(),
4403 label: s.label.to_string(),
4404 class: s.class,
4405 len: end - start,
4406 });
4407 boundaries.push(start);
4408 boundaries.push(end);
4409 events.push(Event {
4410 pos: start,
4411 meta_idx,
4412 delta: 1,
4413 });
4414 events.push(Event {
4415 pos: end,
4416 meta_idx,
4417 delta: -1,
4418 });
4419 }
4420
4421 if metas.is_empty() {
4422 return html_escape(text);
4423 }
4424
4425 boundaries.sort_unstable();
4426 boundaries.dedup();
4427 events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
4428
4429 let mut active_counts: Vec<u32> = vec![0; metas.len()];
4430 let mut active: Vec<usize> = Vec::new();
4431 let mut ev_idx = 0usize;
4432 let mut result = String::new();
4433
4434 for bi in 0..boundaries.len().saturating_sub(1) {
4435 let pos = boundaries[bi];
4436 while ev_idx < events.len() && events[ev_idx].pos == pos {
4437 let e = &events[ev_idx];
4438 let idx = e.meta_idx;
4439 if e.delta < 0 {
4440 if active_counts[idx] > 0 {
4441 active_counts[idx] -= 1;
4442 if active_counts[idx] == 0 {
4443 active.retain(|&x| x != idx);
4444 }
4445 }
4446 } else {
4447 active_counts[idx] += 1;
4448 if active_counts[idx] == 1 {
4449 active.push(idx);
4450 }
4451 }
4452 ev_idx += 1;
4453 }
4454
4455 let next = boundaries[bi + 1];
4456 if next <= pos {
4457 continue;
4458 }
4459
4460 let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
4461 if active.is_empty() {
4462 result.push_str(&html_escape(&seg_text));
4463 continue;
4464 }
4465
4466 let primary_idx = active
4467 .iter()
4468 .copied()
4469 .min_by_key(|i| metas[*i].len)
4470 .unwrap_or(active[0]);
4471 let primary = &metas[primary_idx];
4472 let mut eids: Vec<&str> = active.iter().map(|i| metas[*i].id.as_str()).collect();
4473 eids.sort_unstable();
4474 let data_eids = eids.join(" ");
4475
4476 let title = format!(
4477 "eids=[{}] primary={} [{}..{})",
4478 data_eids, primary.id, pos, next
4479 );
4480 result.push_str(&format!(
4481 "<span class=\"e seg {class}\" data-eids=\"{eids}\" data-label=\"{label}\" data-start=\"{start}\" data-end=\"{end}\" title=\"{title}\">{text}</span>",
4482 class = primary.class,
4483 eids = html_escape(&data_eids),
4484 label = html_escape(&primary.label),
4485 start = pos,
4486 end = next,
4487 title = html_escape(&title),
4488 text = html_escape(&seg_text)
4489 ));
4490 }
4491
4492 result
4493}
4494
/// Options bundle holding a label list and a numeric threshold.
///
/// NOTE(review): no consumer is visible in this part of the file; presumably
/// `labels` selects which entity labels to extract and `threshold` is a
/// minimum confidence. Confirm against the caller.
#[derive(Debug, Clone, Default)]
pub struct ProcessOptions {
    /// Label names (empty with `Default`).
    pub labels: Vec<String>,
    /// Numeric threshold (`0.0` with `Default`).
    pub threshold: f32,
}
4507
/// Result bundle pairing a grounded document with a validity flag and errors.
///
/// NOTE(review): the producer is not visible here; presumably `valid` is true
/// exactly when `errors` is empty. Confirm against the code that builds it.
#[derive(Debug)]
pub struct ProcessResult {
    /// The grounded document; this is what `to_html` renders.
    pub document: GroundedDocument,
    /// Validity flag for the result.
    pub valid: bool,
    /// Signal validation errors collected for the document.
    pub errors: Vec<SignalValidationError>,
}
4518
4519impl ProcessResult {
4520 #[must_use]
4522 pub fn to_html(&self) -> String {
4523 render_document_html(&self.document)
4524 }
4525}
4526
/// A collection of grounded documents plus the identities shared across them.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Documents keyed by their own `id` (see `add_document`).
    documents: std::collections::HashMap<String, GroundedDocument>,
    /// Identities keyed by the id assigned when they were added.
    identities: std::collections::HashMap<IdentityId, Identity>,
    /// Id that will be handed to the next identity created by this corpus.
    next_identity_id: IdentityId,
}
4541
4542impl Corpus {
4543 #[must_use]
4545 pub fn new() -> Self {
4546 Self {
4547 documents: std::collections::HashMap::new(),
4548 identities: std::collections::HashMap::new(),
4549 next_identity_id: IdentityId::ZERO,
4550 }
4551 }
4552
4553 #[must_use]
4555 pub fn identities(&self) -> &std::collections::HashMap<IdentityId, Identity> {
4556 &self.identities
4557 }
4558
4559 #[must_use]
4561 pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
4562 self.identities.get(&id)
4563 }
4564
4565 pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
4570 let id = self.next_identity_id;
4571 identity.id = id;
4572 self.identities.insert(id, identity);
4573 self.next_identity_id += 1;
4574 id
4575 }
4576
4577 #[must_use]
4581 pub fn next_identity_id(&self) -> IdentityId {
4582 self.next_identity_id
4583 }
4584
4585 pub fn documents(&self) -> impl Iterator<Item = &GroundedDocument> {
4589 self.documents.values()
4590 }
4591
4592 #[must_use]
4596 pub fn get_document(&self, doc_id: &str) -> Option<&GroundedDocument> {
4597 self.documents.get(doc_id)
4598 }
4599
4600 pub fn get_document_mut(&mut self, doc_id: &str) -> Option<&mut GroundedDocument> {
4604 self.documents.get_mut(doc_id)
4605 }
4606
4607 pub fn add_document(&mut self, document: GroundedDocument) -> String {
4612 let doc_id = document.id.clone();
4613 self.documents.insert(doc_id.clone(), document);
4614 doc_id
4615 }
4616
4617 pub fn link_track_to_kb(
4639 &mut self,
4640 track_ref: &TrackRef,
4641 kb_name: impl Into<String>,
4642 kb_id: impl Into<String>,
4643 canonical_name: impl Into<String>,
4644 ) -> super::Result<IdentityId> {
4645 use super::error::Error;
4646
4647 let doc = self.documents.get_mut(&track_ref.doc_id).ok_or_else(|| {
4648 Error::track_ref(format!(
4649 "Document '{}' not found in corpus",
4650 track_ref.doc_id
4651 ))
4652 })?;
4653 let track = doc.get_track(track_ref.track_id).ok_or_else(|| {
4654 Error::track_ref(format!(
4655 "Track {} not found in document '{}'",
4656 track_ref.track_id, track_ref.doc_id
4657 ))
4658 })?;
4659
4660 let kb_name_str = kb_name.into();
4661 let kb_id_str = kb_id.into();
4662 let canonical_name_str = canonical_name.into();
4663
4664 let identity_id = if let Some(existing_id) = track.identity_id {
4666 if let Some(identity) = self.identities.get_mut(&existing_id) {
4668 identity.kb_id = Some(kb_id_str.clone());
4669 identity.kb_name = Some(kb_name_str.clone());
4670 identity.canonical_name = canonical_name_str.clone();
4671
4672 identity.source = Some(match identity.source.take() {
4674 Some(IdentitySource::CrossDocCoref { track_refs }) => IdentitySource::Hybrid {
4675 track_refs,
4676 kb_name: kb_name_str.clone(),
4677 kb_id: kb_id_str.clone(),
4678 },
4679 _ => IdentitySource::KnowledgeBase {
4680 kb_name: kb_name_str.clone(),
4681 kb_id: kb_id_str.clone(),
4682 },
4683 });
4684
4685 existing_id
4686 } else {
4687 let new_id = self.next_identity_id;
4695 self.next_identity_id += 1;
4696
4697 let identity = Identity {
4698 id: new_id,
4699 canonical_name: canonical_name_str,
4700 entity_type: track.entity_type.clone(),
4701 kb_id: Some(kb_id_str.clone()),
4702 kb_name: Some(kb_name_str.clone()),
4703 description: None,
4704 embedding: track.embedding.clone(),
4705 aliases: Vec::new(),
4706 confidence: track.cluster_confidence,
4707 source: Some(IdentitySource::KnowledgeBase {
4708 kb_name: kb_name_str,
4709 kb_id: kb_id_str,
4710 }),
4711 };
4712
4713 self.identities.insert(new_id, identity);
4714 doc.link_track_to_identity(track_ref.track_id, new_id);
4717 new_id
4718 }
4719 } else {
4720 let new_id = self.next_identity_id;
4722 self.next_identity_id += 1;
4723
4724 let identity = Identity {
4725 id: new_id,
4726 canonical_name: canonical_name_str,
4727 entity_type: track.entity_type.clone(),
4728 kb_id: Some(kb_id_str.clone()),
4729 kb_name: Some(kb_name_str.clone()),
4730 description: None,
4731 embedding: track.embedding.clone(),
4732 aliases: Vec::new(),
4733 confidence: track.cluster_confidence,
4734 source: Some(IdentitySource::KnowledgeBase {
4735 kb_name: kb_name_str,
4736 kb_id: kb_id_str,
4737 }),
4738 };
4739
4740 self.identities.insert(new_id, identity);
4741 doc.link_track_to_identity(track_ref.track_id, new_id);
4742 new_id
4743 };
4744
4745 Ok(identity_id)
4746 }
4747}
4748
4749impl Default for Corpus {
4750 fn default() -> Self {
4751 Self::new()
4752 }
4753}
4754
4755#[cfg(test)]
4756mod tests {
4757 #![allow(clippy::unwrap_used)] use super::*;
4759 use crate::EntityCategory;
4760
4761 #[test]
4762 fn test_render_eval_html_has_interactive_hooks_and_is_unicode_safe() {
4763 let text = "習近平在北京會見了普京。";
4765
4766 let gold: Vec<Signal<Location>> = vec![
4767 Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 1.0),
4768 Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "LOC", 1.0),
4769 ];
4770
4771 let predicted: Vec<Signal<Location>> = vec![
4773 Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 0.9),
4774 Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "PER", 0.7),
4775 ];
4776
4777 let cmp = EvalComparison::compare(text, gold, predicted);
4778 let html = render_eval_html_with_title(&cmp, "test");
4779
4780 assert!(html.contains("id=\"selection\""));
4782
4783 assert!(html.contains("data-eids=\"G0\""));
4785 assert!(html.contains("data-eids=\"P0\""));
4786
4787 assert!(html.contains("class=\"match-link\""));
4789 assert!(html.contains("href=\"#M0\""));
4790 assert!(html.contains("data-gid=\"G0\""));
4791 assert!(html.contains("data-pid=\"P0\""));
4792
4793 assert!(html.contains("北京"));
4795 }
4796
4797 fn find_char_span(text: &str, needle: &str) -> Option<(usize, usize)> {
4798 let hay: Vec<char> = text.chars().collect();
4799 let pat: Vec<char> = needle.chars().collect();
4800 if pat.is_empty() || hay.len() < pat.len() {
4801 return None;
4802 }
4803 for i in 0..=(hay.len() - pat.len()) {
4804 if hay[i..(i + pat.len())] == pat[..] {
4805 return Some((i, i + pat.len()));
4806 }
4807 }
4808 None
4809 }
4810
4811 #[test]
4812 fn test_annotate_text_html_supports_overlaps_discontinuous_and_unicode() {
4813 let text = "Marie Curie met Cher in Paris. 習近平在北京會見了普京。 \
4815التقى محمد بن سلمان في الرياض. Путин встретился с Си Цзиньпином в Москве. \
4816प्रधान मंत्री शर्मा दिल्ली में मिले। severe pain ... in abdomen.";
4817
4818 let (m0s, m0e) = find_char_span(text, "Marie Curie").unwrap();
4820 let (m1s, m1e) = find_char_span(text, "Curie").unwrap();
4821
4822 let pain = find_char_span(text, "pain").unwrap();
4824 let abdomen = find_char_span(text, "abdomen").unwrap();
4825
4826 let signals: Vec<Signal<Location>> = vec![
4827 Signal::new(
4828 SignalId::new(0),
4829 Location::text(m0s, m0e),
4830 "Marie Curie",
4831 "PER",
4832 0.9,
4833 ),
4834 Signal::new(
4835 SignalId::new(1),
4836 Location::text(m1s, m1e),
4837 "Curie",
4838 "PER",
4839 0.8,
4840 ),
4841 Signal::new(
4842 SignalId::new(2),
4843 Location::Discontinuous {
4844 segments: vec![pain, abdomen],
4845 },
4846 "pain … abdomen",
4847 "SYMPTOM",
4848 0.7,
4849 ),
4850 ];
4851
4852 let html = annotate_text_html(text, &signals, &std::collections::HashMap::new());
4853
4854 assert!(html.contains("data-sids=\"S0 S1\"") || html.contains("data-sids=\"S1 S0\""));
4856
4857 assert!(html.contains("data-sids=\"S2\""));
4859
4860 assert!(html.contains("北京"));
4862 assert!(html.contains("Москве"));
4863 assert!(html.contains("शर्मा"));
4864 assert!(html.contains("محمد"));
4865 }
4866
4867 #[test]
4868 fn test_location_text_iou() {
4869 let l1 = Location::text(0, 10);
4870 let l2 = Location::text(5, 15);
4871 let iou = l1.iou(&l2).unwrap();
4872 assert!((iou - 0.333).abs() < 0.01);
4876 }
4877
4878 #[test]
4879 fn test_location_bbox_iou() {
4880 let b1 = Location::bbox(0.0, 0.0, 0.5, 0.5);
4881 let b2 = Location::bbox(0.25, 0.25, 0.5, 0.5);
4882 let iou = b1.iou(&b2).unwrap();
4883 assert!((iou - 0.143).abs() < 0.01);
4887 }
4888
4889 #[test]
4890 fn test_location_different_types_no_iou() {
4891 let text = Location::text(0, 10);
4892 let bbox = Location::bbox(0.0, 0.0, 0.5, 0.5);
4893 assert!(text.iou(&bbox).is_none());
4894 }
4895
4896 #[test]
4897 fn test_signal_creation() {
4898 let signal: Signal<Location> =
4899 Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95);
4900 assert_eq!(signal.surface, "Marie Curie");
4901 assert_eq!(signal.label, "Person".into());
4902 assert!((signal.confidence - 0.95).abs() < 0.001);
4903 assert!(!signal.negated);
4904 }
4905
4906 #[test]
4907 fn test_signal_with_linguistic_features() {
4908 let signal: Signal<Location> =
4909 Signal::new(0, Location::text(0, 10), "not a doctor", "Occupation", 0.8)
4910 .negated()
4911 .with_quantifier(Quantifier::Existential)
4912 .with_modality(Modality::Symbolic);
4913
4914 assert!(signal.negated);
4915 assert_eq!(signal.quantifier, Some(Quantifier::Existential));
4916 assert!(signal.modality.supports_linguistic_features());
4917 }
4918
4919 #[test]
4920 fn test_track_formation() {
4921 let mut track = Track::new(0, "Marie Curie");
4922 track.add_signal(0, 0);
4923 track.add_signal(1, 1);
4924 track.add_signal(2, 2);
4925
4926 assert_eq!(track.len(), 3);
4927 assert!(!track.is_singleton());
4928 assert!(!track.is_empty());
4929 }
4930
4931 #[test]
4932 fn test_identity_creation() {
4933 let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186")
4934 .with_type("Person")
4935 .with_embedding(vec![0.1, 0.2, 0.3]);
4936
4937 assert_eq!(identity.canonical_name, "Marie Curie");
4938 assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4939 assert_eq!(identity.kb_name, Some("wikidata".to_string()));
4940 assert!(identity.embedding.is_some());
4941 }
4942
4943 #[test]
4944 fn test_grounded_document_hierarchy() {
4945 let mut doc = GroundedDocument::new(
4946 "doc1",
4947 "Marie Curie won the Nobel Prize. She was a physicist.",
4948 );
4949
4950 let s1 = doc.add_signal(Signal::new(
4952 0,
4953 Location::text(0, 12),
4954 "Marie Curie",
4955 "Person",
4956 0.95,
4957 ));
4958 let s2 = doc.add_signal(Signal::new(
4959 1,
4960 Location::text(38, 41),
4961 "She",
4962 "Person",
4963 0.88,
4964 ));
4965 let s3 = doc.add_signal(Signal::new(
4966 2,
4967 Location::text(17, 29),
4968 "Nobel Prize",
4969 "Award",
4970 0.92,
4971 ));
4972
4973 let mut track1 = Track::new(0, "Marie Curie");
4975 track1.add_signal(s1, 0);
4976 track1.add_signal(s2, 1);
4977 let track1_id = doc.add_track(track1);
4978
4979 let mut track2 = Track::new(1, "Nobel Prize");
4980 track2.add_signal(s3, 0);
4981 doc.add_track(track2);
4982
4983 let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
4985 let identity_id = doc.add_identity(identity);
4986 doc.link_track_to_identity(track1_id, identity_id);
4987
4988 assert_eq!(doc.signals().len(), 3);
4990 assert_eq!(doc.tracks().count(), 2);
4991 assert_eq!(doc.identities().count(), 1);
4992
4993 let track = doc.track_for_signal(s1).unwrap();
4995 assert_eq!(track.canonical_surface, "Marie Curie");
4996 assert_eq!(track.len(), 2);
4997
4998 let identity = doc.identity_for_track(track1_id).unwrap();
5000 assert_eq!(identity.kb_id, Some("Q7186".to_string()));
5001
5002 let identity = doc.identity_for_signal(s1).unwrap();
5004 assert_eq!(identity.canonical_name, "Marie Curie");
5005 }
5006
5007 #[test]
5008 fn test_modality_features() {
5009 assert!(Modality::Symbolic.supports_linguistic_features());
5010 assert!(!Modality::Symbolic.supports_geometric_features());
5011
5012 assert!(!Modality::Iconic.supports_linguistic_features());
5013 assert!(Modality::Iconic.supports_geometric_features());
5014
5015 assert!(Modality::Hybrid.supports_linguistic_features());
5016 assert!(Modality::Hybrid.supports_geometric_features());
5017 }
5018
5019 #[test]
5020 fn test_location_from_span() {
5021 let span = Span::Text { start: 0, end: 10 };
5022 let location = Location::from(&span);
5023 assert_eq!(location.text_offsets(), Some((0, 10)));
5024
5025 let span = Span::BoundingBox {
5026 x: 0.1,
5027 y: 0.2,
5028 width: 0.3,
5029 height: 0.4,
5030 page: Some(1),
5031 };
5032 let location = Location::from(&span);
5033 assert!(matches!(location, Location::BoundingBox { .. }));
5034 }
5035
5036 #[test]
5037 fn test_entity_roundtrip() {
5038 use super::EntityType;
5039
5040 let entities = vec![
5041 Entity::new("Marie Curie", EntityType::Person, 0, 12, 0.95),
5042 Entity::new(
5043 "Nobel Prize",
5044 EntityType::custom("Award", EntityCategory::Creative),
5045 17,
5046 29,
5047 0.92,
5048 ),
5049 ];
5050
5051 let doc =
5052 GroundedDocument::from_entities("doc1", "Marie Curie won the Nobel Prize.", &entities);
5053 let converted = doc.to_entities();
5054
5055 assert_eq!(converted.len(), 2);
5056 assert_eq!(converted[0].text, "Marie Curie");
5057 assert_eq!(converted[1].text, "Nobel Prize");
5058 }
5059
5060 #[test]
5061 fn test_signal_confidence_threshold() {
5062 let signal: Signal<Location> = Signal::new(0, Location::text(0, 10), "test", "Type", 0.75);
5063 assert!(signal.is_confident(0.5));
5064 assert!(signal.is_confident(0.75));
5065 assert!(!signal.is_confident(0.8));
5066 }
5067
5068 #[test]
5069 fn test_document_filtering() {
5070 let mut doc = GroundedDocument::new("doc1", "Test text");
5071
5072 doc.add_signal(Signal::new(0, Location::text(0, 4), "high", "Person", 0.95));
5074 doc.add_signal(Signal::new(1, Location::text(5, 8), "low", "Person", 0.3));
5075 doc.add_signal(Signal::new(
5076 2,
5077 Location::text(9, 12),
5078 "org",
5079 "Organization",
5080 0.8,
5081 ));
5082
5083 let confident = doc.confident_signals(0.5);
5085 assert_eq!(confident.len(), 2);
5086
5087 let persons = doc.signals_with_label("Person");
5089 assert_eq!(persons.len(), 2);
5090
5091 let orgs = doc.signals_with_label("Organization");
5092 assert_eq!(orgs.len(), 1);
5093 }
5094
5095 #[test]
5096 fn test_untracked_signals() {
5097 let mut doc = GroundedDocument::new("doc1", "Test");
5098
5099 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
5100 let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
5101 let _s3 = doc.add_signal(Signal::new(2, Location::text(9, 12), "c", "T", 0.9));
5102
5103 let mut track = Track::new(0, "a");
5105 track.add_signal(s1, 0);
5106 track.add_signal(s2, 1);
5107 doc.add_track(track);
5108
5109 assert_eq!(doc.untracked_signal_count(), 1);
5111 let untracked = doc.untracked_signals();
5112 assert_eq!(untracked.len(), 1);
5113 assert_eq!(untracked[0].surface, "c");
5114 }
5115
5116 #[test]
5117 fn test_linked_unlinked_tracks() {
5118 let mut doc = GroundedDocument::new("doc1", "Test");
5119
5120 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
5121 let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
5122
5123 let mut track1 = Track::new(0, "a");
5124 track1.add_signal(s1, 0);
5125 let track1_id = doc.add_track(track1);
5126
5127 let mut track2 = Track::new(1, "b");
5128 track2.add_signal(s2, 0);
5129 doc.add_track(track2);
5130
5131 let identity = Identity::new(0, "Entity A");
5133 let identity_id = doc.add_identity(identity);
5134 doc.link_track_to_identity(track1_id, identity_id);
5135
5136 assert_eq!(doc.linked_tracks().count(), 1);
5137 assert_eq!(doc.unlinked_tracks().count(), 1);
5138 }
5139
5140 #[test]
5141 fn test_location_overlaps() {
5142 let l1 = Location::text(0, 10);
5143 let l2 = Location::text(5, 15);
5144 let l3 = Location::text(15, 20);
5145
5146 assert!(l1.overlaps(&l2));
5147 assert!(!l1.overlaps(&l3));
5148 assert!(!l2.overlaps(&l3)); let b1 = Location::bbox(0.0, 0.0, 0.5, 0.5);
5152 let b2 = Location::bbox(0.4, 0.4, 0.5, 0.5);
5153 let b3 = Location::bbox(0.6, 0.6, 0.2, 0.2);
5154
5155 assert!(b1.overlaps(&b2));
5156 assert!(!b1.overlaps(&b3));
5157 }
5158
5159 #[test]
5160 fn test_iou_edge_cases() {
5161 let l1 = Location::text(0, 5);
5163 let l2 = Location::text(10, 15);
5164 assert_eq!(l1.iou(&l2), Some(0.0));
5165
5166 let l3 = Location::text(0, 10);
5168 let l4 = Location::text(0, 10);
5169 assert_eq!(l3.iou(&l4), Some(1.0));
5170
5171 let l5 = Location::text(0, 20);
5173 let l6 = Location::text(5, 15);
5174 let iou = l5.iou(&l6).unwrap();
5175 assert!((iou - 0.5).abs() < 0.001);
5177 }
5178
5179 #[test]
5183 fn test_document_stats() {
5184 let mut doc = GroundedDocument::new("doc1", "Test document with entities.");
5185
5186 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9));
5188 let mut negated = Signal::new(0, Location::text(5, 13), "document", "Type", 0.8);
5189 negated.negated = true;
5190 let s2 = doc.add_signal(negated);
5191 let _s3 = doc.add_signal(Signal::new(
5192 0,
5193 Location::text(19, 27),
5194 "entities",
5195 "Type",
5196 0.7,
5197 ));
5198
5199 let mut track = Track::new(0, "Test");
5201 track.add_signal(s1, 0);
5202 track.add_signal(s2, 1);
5203 doc.add_track(track);
5204
5205 let identity = Identity::new(0, "Test Entity");
5207 let identity_id = doc.add_identity(identity);
5208 doc.link_track_to_identity(0, identity_id);
5209
5210 let stats = doc.stats();
5211
5212 assert_eq!(stats.signal_count, 3);
5213 assert_eq!(stats.track_count, 1);
5214 assert_eq!(stats.identity_count, 1);
5215 assert_eq!(stats.linked_track_count, 1);
5216 assert_eq!(stats.untracked_count, 1); assert_eq!(stats.negated_count, 1);
5218 assert!((stats.avg_confidence - 0.8).abs() < 0.01); assert!((stats.avg_track_size - 2.0).abs() < 0.01);
5220 }
5221
5222 #[test]
5223 fn test_batch_operations() {
5224 let mut doc = GroundedDocument::new("doc1", "Test document.");
5225
5226 let signals = vec![
5228 Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9),
5229 Signal::new(0, Location::text(5, 13), "document", "Type", 0.8),
5230 ];
5231 let ids = doc.add_signals(signals);
5232
5233 assert_eq!(ids.len(), 2);
5234 assert_eq!(doc.signals().len(), 2);
5235
5236 let track_id = doc.create_track_from_signals("Test", &ids);
5238 assert!(track_id.is_some());
5239
5240 let track = doc.get_track(track_id.unwrap()).unwrap();
5241 assert_eq!(track.len(), 2);
5242 assert_eq!(track.canonical_surface, "Test");
5243 }
5244
5245 #[test]
5246 fn test_merge_tracks() {
5247 let mut doc = GroundedDocument::new("doc1", "John Smith works at Acme. He is great.");
5248
5249 let s1 = doc.add_signal(Signal::new(
5251 0,
5252 Location::text(0, 10),
5253 "John Smith",
5254 "Person",
5255 0.9,
5256 ));
5257 let s2 = doc.add_signal(Signal::new(0, Location::text(26, 28), "He", "Person", 0.8));
5258
5259 let mut track1 = Track::new(0, "John Smith");
5261 track1.add_signal(s1, 0);
5262 let track1_id = doc.add_track(track1);
5263
5264 let mut track2 = Track::new(0, "He");
5265 track2.add_signal(s2, 0);
5266 let track2_id = doc.add_track(track2);
5267
5268 assert_eq!(doc.tracks().count(), 2);
5269
5270 let merged_id = doc.merge_tracks(&[track1_id, track2_id]);
5272 assert!(merged_id.is_some());
5273
5274 assert_eq!(doc.tracks().count(), 1);
5276 let merged = doc.get_track(merged_id.unwrap()).unwrap();
5277 assert_eq!(merged.len(), 2);
5278 assert_eq!(merged.canonical_surface, "John Smith"); }
5280
5281 #[test]
5282 fn test_find_overlapping_pairs() {
5283 let mut doc = GroundedDocument::new("doc1", "New York City is great.");
5284
5285 doc.add_signal(Signal::new(
5287 0,
5288 Location::text(0, 13),
5289 "New York City",
5290 "Location",
5291 0.9,
5292 ));
5293 doc.add_signal(Signal::new(
5294 0,
5295 Location::text(0, 8),
5296 "New York",
5297 "Location",
5298 0.85,
5299 ));
5300 doc.add_signal(Signal::new(0, Location::text(17, 22), "great", "Adj", 0.7)); let pairs = doc.find_overlapping_signal_pairs();
5303
5304 assert_eq!(pairs.len(), 1);
5306 }
5307
5308 #[test]
5309 fn test_signals_in_range() {
5310 let mut doc = GroundedDocument::new("doc1", "John went to Paris and Berlin last year.");
5311
5312 doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.9));
5313 doc.add_signal(Signal::new(
5314 0,
5315 Location::text(13, 18),
5316 "Paris",
5317 "Location",
5318 0.9,
5319 ));
5320 doc.add_signal(Signal::new(
5321 0,
5322 Location::text(23, 29),
5323 "Berlin",
5324 "Location",
5325 0.9,
5326 ));
5327 doc.add_signal(Signal::new(
5328 0,
5329 Location::text(30, 39),
5330 "last year",
5331 "Date",
5332 0.8,
5333 ));
5334
5335 let in_range = doc.signals_in_range(10, 30);
5337 assert_eq!(in_range.len(), 2); let surfaces: Vec<_> = in_range.iter().map(|s| &s.surface).collect();
5340 assert!(surfaces.contains(&&"Paris".to_string()));
5341 assert!(surfaces.contains(&&"Berlin".to_string()));
5342 }
5343
5344 #[test]
5345 fn test_modality_filtering() {
5346 let mut doc = GroundedDocument::new("doc1", "Test");
5347
5348 let mut text_signal = Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9);
5350 text_signal.modality = Modality::Symbolic;
5351 doc.add_signal(text_signal);
5352
5353 let mut visual_signal =
5355 Signal::new(0, Location::bbox(0.0, 0.0, 0.5, 0.5), "Box", "Type", 0.8);
5356 visual_signal.modality = Modality::Iconic;
5357 doc.add_signal(visual_signal);
5358
5359 assert_eq!(doc.text_signals().len(), 1);
5360 assert_eq!(doc.visual_signals().len(), 1);
5361 assert_eq!(doc.signals_by_modality(Modality::Hybrid).len(), 0);
5362 }
5363
5364 #[test]
5365 fn test_quantifier_variants() {
5366 let quantifiers = [
5368 Quantifier::Universal,
5369 Quantifier::Existential,
5370 Quantifier::None,
5371 Quantifier::Definite,
5372 Quantifier::Bare,
5373 Quantifier::Approximate,
5374 Quantifier::MinBound,
5375 Quantifier::MaxBound,
5376 ];
5377
5378 for q in quantifiers {
5379 let signal: Signal<Location> =
5380 Signal::new(0, Location::text(0, 5), "test", "Type", 0.9).with_quantifier(q);
5381
5382 assert_eq!(signal.quantifier, Some(q));
5383 }
5384 }
5385
5386 #[test]
5387 fn test_location_modality_derivation() {
5388 assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
5389 assert_eq!(
5390 Location::bbox(0.0, 0.0, 0.5, 0.5).modality(),
5391 Modality::Iconic
5392 );
5393
5394 let temporal = Location::Temporal {
5395 start_sec: 0.0,
5396 end_sec: 5.0,
5397 frame: None,
5398 };
5399 assert_eq!(temporal.modality(), Modality::Iconic);
5400
5401 let genomic = Location::Genomic {
5402 contig: "chr1".into(),
5403 start: 0,
5404 end: 1000,
5405 strand: Some('+'),
5406 };
5407 assert_eq!(genomic.modality(), Modality::Symbolic);
5408
5409 let hybrid = Location::TextWithBbox {
5410 start: 0,
5411 end: 10,
5412 bbox: Box::new(Location::bbox(0.0, 0.0, 0.5, 0.5)),
5413 };
5414 assert_eq!(hybrid.modality(), Modality::Hybrid);
5415 }
5416
5417 }
5420
5421#[cfg(test)]
5429mod proptests {
5430 #![allow(clippy::unwrap_used)] use super::*;
5432 use proptest::prelude::*;
5433
5434 fn confidence_strategy() -> impl Strategy<Value = f32> {
5440 0.0f32..=1.0
5441 }
5442
5443 fn label_strategy() -> impl Strategy<Value = String> {
5445 prop_oneof![
5446 Just("Person".to_string()),
5447 Just("Organization".to_string()),
5448 Just("Location".to_string()),
5449 Just("Date".to_string()),
5450 "[A-Z][a-z]{2,10}".prop_map(|s| s),
5451 ]
5452 }
5453
5454 fn surface_strategy() -> impl Strategy<Value = String> {
5456 "[A-Za-z ]{1,50}".prop_map(|s| s.trim().to_string())
5457 }
5458
5459 proptest! {
5464 #[test]
5466 fn iou_symmetric(
5467 start1 in 0usize..1000,
5468 len1 in 1usize..500,
5469 start2 in 0usize..1000,
5470 len2 in 1usize..500,
5471 ) {
5472 let a = Location::text(start1, start1 + len1);
5473 let b = Location::text(start2, start2 + len2);
5474
5475 let iou_ab = a.iou(&b);
5476 let iou_ba = b.iou(&a);
5477
5478 prop_assert_eq!(iou_ab, iou_ba, "IoU must be symmetric");
5479 }
5480
5481 #[test]
5483 fn iou_bounded(
5484 start1 in 0usize..1000,
5485 len1 in 1usize..500,
5486 start2 in 0usize..1000,
5487 len2 in 1usize..500,
5488 ) {
5489 let a = Location::text(start1, start1 + len1);
5490 let b = Location::text(start2, start2 + len2);
5491
5492 if let Some(iou) = a.iou(&b) {
5493 prop_assert!(iou >= 0.0, "IoU must be non-negative: got {}", iou);
5494 prop_assert!(iou <= 1.0, "IoU must be at most 1: got {}", iou);
5495 }
5496 }
5497
5498 #[test]
5500 fn iou_self_identity(start in 0usize..1000, len in 1usize..500) {
5501 let loc = Location::text(start, start + len);
5502 let iou = loc.iou(&loc).unwrap();
5503 prop_assert!(
5504 (iou - 1.0).abs() < 1e-6,
5505 "Self-IoU must be 1.0, got {}",
5506 iou
5507 );
5508 }
5509
5510 #[test]
5512 fn iou_non_overlapping_zero(
5513 start1 in 0usize..500,
5514 len1 in 1usize..100,
5515 ) {
5516 let end1 = start1 + len1;
5517 let start2 = end1 + 100; let len2 = 50;
5519
5520 let a = Location::text(start1, end1);
5521 let b = Location::text(start2, start2 + len2);
5522
5523 let iou = a.iou(&b).expect("bbox iou should be defined");
5524 prop_assert!(
5525 iou.abs() < 1e-6,
5526 "Non-overlapping IoU must be 0, got {}",
5527 iou
5528 );
5529 }
5530
5531 #[test]
5533 fn bbox_iou_symmetric_bounded(
5534 x1 in 0.0f32..0.8,
5535 y1 in 0.0f32..0.8,
5536 w1 in 0.05f32..0.2,
5537 h1 in 0.05f32..0.2,
5538 x2 in 0.0f32..0.8,
5539 y2 in 0.0f32..0.8,
5540 w2 in 0.05f32..0.2,
5541 h2 in 0.05f32..0.2,
5542 ) {
5543 let a = Location::bbox(x1, y1, w1, h1);
5544 let b = Location::bbox(x2, y2, w2, h2);
5545
5546 let iou_ab = a.iou(&b);
5547 let iou_ba = b.iou(&a);
5548
5549 prop_assert_eq!(iou_ab, iou_ba, "BBox IoU must be symmetric");
5551
5552 if let Some(iou) = iou_ab {
5554 prop_assert!(
5555 (0.0..=1.0).contains(&iou),
5556 "BBox IoU out of bounds: {}",
5557 iou
5558 );
5559 }
5560 }
5561 }
5562
5563 proptest! {
5568 #[test]
5570 fn signal_confidence_clamped(raw_conf in -10.0f32..10.0) {
5571 let signal: Signal<Location> = Signal::new(
5572 0,
5573 Location::text(0, 10),
5574 "test",
5575 "Type",
5576 raw_conf,
5577 );
5578
5579 prop_assert!(signal.confidence >= 0.0, "Confidence below 0: {}", signal.confidence);
5580 prop_assert!(signal.confidence <= 1.0, "Confidence above 1: {}", signal.confidence);
5581 }
5582
5583 #[test]
5585 fn signal_preserves_data(
5586 surface in surface_strategy(),
5587 label in label_strategy(),
5588 conf in confidence_strategy(),
5589 start in 0usize..1000,
5590 len in 1usize..100,
5591 ) {
5592 let signal: Signal<Location> = Signal::new(
5593 0,
5594 Location::text(start, start + len),
5595 &surface,
5596 label.as_str(),
5597 conf,
5598 );
5599
5600 prop_assert_eq!(&signal.surface, &surface);
5601 let want = crate::TypeLabel::from(label.as_str());
5602 prop_assert_eq!(signal.label, want);
5603 }
5604
5605 #[test]
5609 fn signal_negation_stable(conf in confidence_strategy()) {
5610 let signal: Signal<Location> = Signal::new(
5611 0,
5612 Location::text(0, 10),
5613 "test",
5614 "Type",
5615 conf,
5616 )
5617 .negated();
5618
5619 prop_assert!(signal.negated, "Signal should be negated after .negated()");
5620 }
5621
5622 #[test]
5624 fn symbolic_supports_linguistic(
5625 start in 0usize..1000,
5626 len in 1usize..100,
5627 ) {
5628 let loc = Location::text(start, start + len);
5629 prop_assert!(
5630 loc.modality().supports_linguistic_features(),
5631 "Text locations must support linguistic features"
5632 );
5633 }
5634
5635 #[test]
5637 fn iconic_supports_geometric(
5638 x in 0.0f32..0.9,
5639 y in 0.0f32..0.9,
5640 w in 0.01f32..0.5,
5641 h in 0.01f32..0.5,
5642 ) {
5643 let loc = Location::bbox(x, y, w, h);
5644 prop_assert!(
5645 loc.modality().supports_geometric_features(),
5646 "BBox locations must support geometric features"
5647 );
5648 }
5649 }
5650
5651 proptest! {
5656 #[test]
5658 fn track_length_monotonic(signal_count in 1usize..20) {
5659 let mut track = Track::new(0, "test");
5660
5661 for i in 0..signal_count {
5662 track.add_signal(i, i as u32);
5663 prop_assert_eq!(
5664 track.len(),
5665 i + 1,
5666 "Track length should be {} after adding {} signals",
5667 i + 1,
5668 i + 1
5669 );
5670 }
5671 }
5672
5673 #[test]
5675 fn track_not_empty_after_add(canonical in surface_strategy()) {
5676 let mut track = Track::new(0, &canonical);
5677 prop_assert!(track.is_empty(), "New track should be empty");
5678
5679 track.add_signal(0, 0);
5680 prop_assert!(!track.is_empty(), "Track should not be empty after add");
5681 }
5682
5683 #[test]
5685 fn track_positions_stored(signal_count in 1usize..10) {
5686 let mut track = Track::new(0, "test");
5687
5688 for i in 0..signal_count {
5689 track.add_signal(i, i as u32);
5690 }
5691
5692 for (idx, signal_ref) in track.signals.iter().enumerate() {
5693 prop_assert_eq!(
5694 signal_ref.position as usize,
5695 idx,
5696 "Signal position mismatch at index {}",
5697 idx
5698 );
5699 }
5700 }
5701 }
5702
5703 proptest! {
5708 #[test]
5710 fn document_signal_ids_monotonic(signal_count in 1usize..20) {
5711 let mut doc = GroundedDocument::new("test", "test text");
5712
5713 let mut prev_id: Option<SignalId> = None;
5714 for i in 0..signal_count {
5715 let id = doc.add_signal(Signal::new(
5716 999, Location::text(i * 10, i * 10 + 5),
5718 format!("entity_{}", i),
5719 "Type",
5720 0.9,
5721 ));
5722
5723 if let Some(prev) = prev_id {
5724 prop_assert!(id > prev, "Signal IDs should be monotonically increasing");
5725 }
5726 prev_id = Some(id);
5727 }
5728 }
5729
5730 #[test]
5732 fn document_track_membership_consistent(signal_count in 1usize..5) {
5733 let mut doc = GroundedDocument::new("test", "test text");
5734
5735 let mut signal_ids = Vec::new();
5737 for i in 0..signal_count {
5738 let id = doc.add_signal(Signal::new(
5739 0,
5740 Location::text(i * 10, i * 10 + 5),
5741 format!("entity_{}", i),
5742 "Type",
5743 0.9,
5744 ));
5745 signal_ids.push(id);
5746 }
5747
5748 let mut track = Track::new(0, "canonical");
5750 for (pos, &id) in signal_ids.iter().enumerate() {
5751 track.add_signal(id, pos as u32);
5752 }
5753 let track_id = doc.add_track(track);
5754
5755 for &signal_id in &signal_ids {
5757 let found_track = doc.track_for_signal(signal_id);
5758 prop_assert!(found_track.is_some(), "Signal should be in a track");
5759 prop_assert_eq!(
5760 found_track.unwrap().id,
5761 track_id,
5762 "Signal should be in the correct track"
5763 );
5764 }
5765 }
5766
5767 #[test]
5769 fn document_identity_transitivity(signal_count in 1usize..3) {
5770 let mut doc = GroundedDocument::new("test", "test text");
5771
5772 let mut signal_ids = Vec::new();
5774 for i in 0..signal_count {
5775 let id = doc.add_signal(Signal::new(
5776 0,
5777 Location::text(i * 10, i * 10 + 5),
5778 format!("entity_{}", i),
5779 "Type",
5780 0.9,
5781 ));
5782 signal_ids.push(id);
5783 }
5784
5785 let mut track = Track::new(0, "canonical");
5787 for (pos, &id) in signal_ids.iter().enumerate() {
5788 track.add_signal(id, pos as u32);
5789 }
5790 let track_id = doc.add_track(track);
5791
5792 let identity = Identity::from_kb(0, "Entity", "wikidata", "Q123");
5793 let identity_id = doc.add_identity(identity);
5794 doc.link_track_to_identity(track_id, identity_id);
5795
5796 for &signal_id in &signal_ids {
5798 let identity = doc.identity_for_signal(signal_id);
5799 prop_assert!(identity.is_some(), "Should find identity through signal");
5800 prop_assert_eq!(
5801 identity.unwrap().id,
5802 identity_id,
5803 "Should find correct identity"
5804 );
5805 }
5806 }
5807
5808 #[test]
5810 fn document_untracked_signals(total in 2usize..10, tracked in 0usize..10) {
5811 let tracked = tracked.min(total - 1); let mut doc = GroundedDocument::new("test", "test text");
5813
5814 let mut signal_ids = Vec::new();
5816 for i in 0..total {
5817 let id = doc.add_signal(Signal::new(
5818 0,
5819 Location::text(i * 10, i * 10 + 5),
5820 format!("entity_{}", i),
5821 "Type",
5822 0.9,
5823 ));
5824 signal_ids.push(id);
5825 }
5826
5827 let mut track = Track::new(0, "canonical");
5829 for (pos, &id) in signal_ids.iter().take(tracked).enumerate() {
5830 track.add_signal(id, pos as u32);
5831 }
5832 if tracked > 0 {
5833 doc.add_track(track);
5834 }
5835
5836 prop_assert_eq!(
5838 doc.untracked_signal_count(),
5839 total - tracked,
5840 "Wrong untracked count"
5841 );
5842 }
5843 }
5844
5845 proptest! {
5850 #[test]
5852 fn entity_roundtrip_preserves_text(
5853 text in surface_strategy(),
5854 start in 0usize..1000,
5855 len in 1usize..100,
5856 conf in 0.0f64..=1.0,
5857 ) {
5858 use super::EntityType;
5859
5860 let end = start + len;
5861 let entity = super::Entity::new(&text, EntityType::Person, start, end, conf);
5862
5863 let doc = GroundedDocument::from_entities("test", "x".repeat(end + 10), &[entity]);
5864 let converted = doc.to_entities();
5865
5866 prop_assert_eq!(converted.len(), 1, "Should have exactly one entity");
5867 prop_assert_eq!(&converted[0].text, &text, "Text should be preserved");
5868 prop_assert_eq!(converted[0].start, start, "Start should be preserved");
5869 prop_assert_eq!(converted[0].end, end, "End should be preserved");
5870 }
5871
5872 }
5875
5876 proptest! {
5881 #[test]
5883 fn modality_feature_consistency(_dummy in 0..1) {
5884 prop_assert!(Modality::Iconic.supports_geometric_features());
5886 prop_assert!(!Modality::Iconic.supports_linguistic_features());
5887
5888 prop_assert!(Modality::Symbolic.supports_linguistic_features());
5890 prop_assert!(!Modality::Symbolic.supports_geometric_features());
5891
5892 prop_assert!(Modality::Hybrid.supports_linguistic_features());
5894 prop_assert!(Modality::Hybrid.supports_geometric_features());
5895 }
5896 }
5897
5898 proptest! {
5903 #[test]
5905 fn overlap_symmetric(
5906 start1 in 0usize..1000,
5907 len1 in 1usize..100,
5908 start2 in 0usize..1000,
5909 len2 in 1usize..100,
5910 ) {
5911 let a = Location::text(start1, start1 + len1);
5912 let b = Location::text(start2, start2 + len2);
5913
5914 prop_assert_eq!(
5915 a.overlaps(&b),
5916 b.overlaps(&a),
5917 "Overlap must be symmetric"
5918 );
5919 }
5920
5921 #[test]
5923 fn overlap_reflexive(start in 0usize..1000, len in 1usize..100) {
5924 let loc = Location::text(start, start + len);
5925 prop_assert!(loc.overlaps(&loc), "Location must overlap with itself");
5926 }
5927
5928 #[test]
5930 fn iou_implies_overlap(
5931 start1 in 0usize..500,
5932 len1 in 1usize..100,
5933 start2 in 0usize..500,
5934 len2 in 1usize..100,
5935 ) {
5936 let a = Location::text(start1, start1 + len1);
5937 let b = Location::text(start2, start2 + len2);
5938
5939 if let Some(iou) = a.iou(&b) {
5940 if iou > 0.0 {
5941 prop_assert!(
5942 a.overlaps(&b),
5943 "IoU > 0 should imply overlap"
5944 );
5945 }
5946 }
5947 }
5948 }
5949
5950 proptest! {
5955 #[test]
5957 fn stats_signal_count_accurate(signal_count in 0usize..20) {
5958 let mut doc = GroundedDocument::new("test", "test");
5959 for i in 0..signal_count {
5960 doc.add_signal(Signal::new(
5961 0,
5962 Location::text(i * 10, i * 10 + 5),
5963 "entity",
5964 "Type",
5965 0.9,
5966 ));
5967 }
5968
5969 let stats = doc.stats();
5970 prop_assert_eq!(stats.signal_count, signal_count);
5971 }
5972
5973 #[test]
5975 fn stats_track_count_accurate(track_count in 0usize..10) {
5976 let mut doc = GroundedDocument::new("test", "test");
5977 for i in 0..track_count {
5978 let id = doc.add_signal(Signal::new(
5979 0,
5980 Location::text(i * 10, i * 10 + 5),
5981 "entity",
5982 "Type",
5983 0.9,
5984 ));
5985 let mut track = Track::new(0, format!("track_{}", i));
5986 track.add_signal(id, 0);
5987 doc.add_track(track);
5988 }
5989
5990 let stats = doc.stats();
5991 prop_assert_eq!(stats.track_count, track_count);
5992 }
5993
5994 #[test]
5996 fn stats_avg_confidence_bounded(
5997 confidences in proptest::collection::vec(0.0f32..=1.0, 1..10)
5998 ) {
5999 let mut doc = GroundedDocument::new("test", "test");
6000 for (i, conf) in confidences.iter().enumerate() {
6001 doc.add_signal(Signal::new(
6002 0,
6003 Location::text(i * 10, i * 10 + 5),
6004 "entity",
6005 "Type",
6006 *conf,
6007 ));
6008 }
6009
6010 let stats = doc.stats();
6011 prop_assert!(stats.avg_confidence >= 0.0);
6012 prop_assert!(stats.avg_confidence <= 1.0);
6013 }
6014 }
6015
6016 proptest! {
6021 #[test]
6023 fn batch_add_returns_all_ids(count in 1usize..10) {
6024 let mut doc = GroundedDocument::new("test", "test");
6025 let signals: Vec<Signal<Location>> = (0..count)
6026 .map(|i| Signal::new(0, Location::text(i * 10, i * 10 + 5), "e", "T", 0.9))
6027 .collect();
6028
6029 let ids = doc.add_signals(signals);
6030 prop_assert_eq!(ids.len(), count);
6031 prop_assert_eq!(doc.signals().len(), count);
6032 }
6033
6034 #[test]
6036 fn create_track_valid(signal_count in 1usize..5) {
6037 let mut doc = GroundedDocument::new("test", "test");
6038 let mut signal_ids = Vec::new();
6039 for i in 0..signal_count {
6040 let id = doc.add_signal(Signal::new(
6041 0,
6042 Location::text(i * 10, i * 10 + 5),
6043 "entity",
6044 "Type",
6045 0.9,
6046 ));
6047 signal_ids.push(id);
6048 }
6049
6050 let track_id = doc.create_track_from_signals("canonical", &signal_ids);
6051 prop_assert!(track_id.is_some());
6052
6053 let track = doc.get_track(track_id.unwrap());
6054 prop_assert!(track.is_some());
6055 prop_assert_eq!(track.unwrap().len(), signal_count);
6056 }
6057
6058 #[test]
6060 fn create_track_empty_returns_none(_dummy in 0..1) {
6061 let mut doc = GroundedDocument::new("test", "test");
6062 let track_id = doc.create_track_from_signals("canonical", &[]);
6063 prop_assert!(track_id.is_none());
6064 }
6065 }
6066
6067 proptest! {
6072 #[test]
6074 fn signals_in_range_within_bounds(
6075 range_start in 0usize..100,
6076 range_len in 10usize..50,
6077 ) {
6078 let range_end = range_start + range_len;
6079 let mut doc = GroundedDocument::new("test", "x".repeat(200));
6080
6081 doc.add_signal(Signal::new(0, Location::text(range_start + 2, range_start + 5), "inside", "T", 0.9));
6083 doc.add_signal(Signal::new(0, Location::text(0, 5), "before", "T", 0.9));
6084 doc.add_signal(Signal::new(0, Location::text(190, 195), "after", "T", 0.9));
6085
6086 let in_range = doc.signals_in_range(range_start, range_end);
6087
6088 for signal in &in_range {
6089 if let Some((start, end)) = signal.location.text_offsets() {
6090 prop_assert!(start >= range_start, "Signal start {} < range start {}", start, range_start);
6091 prop_assert!(end <= range_end, "Signal end {} > range end {}", end, range_end);
6092 }
6093 }
6094 }
6095
6096 #[test]
6098 fn overlapping_signals_symmetric(
6099 start1 in 10usize..50,
6100 len1 in 5usize..20,
6101 start2 in 10usize..50,
6102 len2 in 5usize..20,
6103 ) {
6104 let mut doc = GroundedDocument::new("test", "x".repeat(100));
6105
6106 let loc1 = Location::text(start1, start1 + len1);
6107 let loc2 = Location::text(start2, start2 + len2);
6108
6109 doc.add_signal(Signal::new(0, loc1.clone(), "A", "T", 0.9));
6110 doc.add_signal(Signal::new(0, loc2.clone(), "B", "T", 0.9));
6111
6112 let overlaps_loc1 = doc.overlapping_signals(&loc1);
6113 let overlaps_loc2 = doc.overlapping_signals(&loc2);
6114
6115 if loc1.overlaps(&loc2) {
6117 prop_assert!(overlaps_loc1.len() >= 2, "Should find both when overlapping");
6118 prop_assert!(overlaps_loc2.len() >= 2, "Should find both when overlapping");
6119 }
6120 }
6121 }
6122
6123 proptest! {
6128 #[test]
6130 fn modality_counts_sum_to_total(
6131 symbolic_count in 0usize..5,
6132 iconic_count in 0usize..5,
6133 ) {
6134 let mut doc = GroundedDocument::new("test", "test");
6135
6136 for i in 0..symbolic_count {
6138 let mut signal = Signal::new(
6139 0,
6140 Location::text(i * 10, i * 10 + 5),
6141 "entity",
6142 "Type",
6143 0.9,
6144 );
6145 signal.modality = Modality::Symbolic;
6146 doc.add_signal(signal);
6147 }
6148
6149 for i in 0..iconic_count {
6151 let mut signal = Signal::new(
6152 0,
6153 Location::bbox(i as f32 * 0.1, 0.0, 0.05, 0.05),
6154 "entity",
6155 "Type",
6156 0.9,
6157 );
6158 signal.modality = Modality::Iconic;
6159 doc.add_signal(signal);
6160 }
6161
6162 let stats = doc.stats();
6163 prop_assert_eq!(
6164 stats.symbolic_count + stats.iconic_count + stats.hybrid_count,
6165 stats.signal_count,
6166 "Modality counts should sum to total"
6167 );
6168 }
6169 }
6170
6171 proptest! {
6176 #[test]
6178 fn from_text_always_valid(
6179 text in "[a-zA-Z ]{20,100}",
6180 surface_start in 0usize..15,
6181 surface_len in 1usize..8,
6182 ) {
6183 let text_char_len = text.chars().count();
6184 let surface_end = (surface_start + surface_len).min(text_char_len);
6185 let surface_start = surface_start.min(surface_end.saturating_sub(1));
6186
6187 if surface_start < surface_end && surface_end <= text_char_len {
6188 let surface: String = text.chars()
6189 .skip(surface_start)
6190 .take(surface_end - surface_start)
6191 .collect();
6192
6193 if !surface.is_empty() {
6194 if let Some(signal) = Signal::<Location>::from_text(&text, &surface, "Test", 0.9) {
6196 prop_assert!(
6198 signal.validate_against(&text).is_none(),
6199 "Signal created via from_text must be valid"
6200 );
6201 }
6202 }
6203 }
6204 }
6205
6206 #[test]
6208 fn validated_add_rejects_invalid(
6209 text in "[a-z]{10,50}",
6210 wrong_surface in "[A-Z]{3,10}",
6211 ) {
6212 let mut doc = GroundedDocument::new("test", &text);
6213
6214 let signal = Signal::new(
6216 0,
6217 Location::text(0, wrong_surface.chars().count().min(text.chars().count())),
6218 wrong_surface.clone(),
6219 "Test",
6220 0.9,
6221 );
6222
6223 let expected: String = text.chars().take(wrong_surface.chars().count()).collect();
6226 if expected != wrong_surface {
6227 let result = doc.add_signal_validated(signal);
6228 prop_assert!(result.is_err(), "Should reject signal with mismatched surface");
6229 }
6230 }
6231
6232 #[test]
6234 fn round_trip_signal_from_text(
6235 prefix in "[a-z]{5,20}",
6236 entity in "[A-Z][a-z]{3,10}",
6237 suffix in "[a-z]{5,20}",
6238 ) {
6239 let text = format!("{} {} {}", prefix, entity, suffix);
6240 let mut doc = GroundedDocument::new("test", &text);
6241
6242 let id = doc.add_signal_from_text(&entity, "Entity", 0.9);
6243 prop_assert!(id.is_some(), "Should find entity in text");
6244
6245 let signal = doc.signals().iter().find(|s| s.id == id.unwrap());
6246 prop_assert!(signal.is_some(), "Should retrieve added signal");
6247
6248 let signal = signal.unwrap();
6249 prop_assert_eq!(signal.surface(), entity.as_str(), "Surface should match");
6250
6251 prop_assert!(
6253 doc.is_valid(),
6254 "Document should be valid after from_text add"
6255 );
6256 }
6257
6258 #[test]
6260 fn nth_occurrence_finds_correct(
6261 entity in "[A-Z][a-z]{2,5}",
6262 sep in " [a-z]+ ",
6263 ) {
6264 let text = format!("{}{}{}{}{}", entity, sep, entity, sep, entity);
6266 let mut doc = GroundedDocument::new("test", &text);
6267
6268 for n in 0..3 {
6270 let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, n);
6271 prop_assert!(id.is_some(), "Should find occurrence {}", n);
6272 }
6273
6274 let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, 3);
6276 prop_assert!(id.is_none(), "Should NOT find 4th occurrence");
6277
6278 prop_assert!(doc.is_valid(), "All signals should be valid");
6280
6281 let offsets: Vec<_> = doc.signals()
6283 .iter()
6284 .filter_map(|s| s.text_offsets())
6285 .collect();
6286 let unique: std::collections::HashSet<_> = offsets.iter().collect();
6287 prop_assert_eq!(offsets.len(), unique.len(), "Each occurrence should have distinct offset");
6288 }
6289 }
6290
/// Track statistics for a two-mention track: chain length, surface
/// variation, spread and the confidence summary.
#[test]
fn test_track_stats_basic() {
    let text = "John met Mary. He said hello. John left.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    // Two "John" signals, at offsets 0..4 and 30..34.
    let first_mention = doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.95));
    let second_mention = doc.add_signal(Signal::new(
        0,
        Location::text(30, 34),
        "John",
        "Person",
        0.90,
    ));

    // Register an empty track, then attach both mentions in order.
    let track_id = doc.add_track(Track::new(0, "John".to_string()));
    doc.add_signal_to_track(first_mention, track_id, 0);
    doc.add_signal_to_track(second_mention, track_id, 1);

    let stats = doc.get_track(track_id).unwrap().compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 2, "Two mentions");
    assert_eq!(stats.variation_count, 1, "One unique surface form");
    assert!(stats.spread > 0, "Spread should be positive");
    assert!(stats.relative_spread > 0.0 && stats.relative_spread < 1.0);
    assert!((stats.min_confidence - 0.90).abs() < 0.01);
    assert!((stats.max_confidence - 0.95).abs() < 0.01);
    assert!((stats.mean_confidence - 0.925).abs() < 0.01);
}
6328
/// Track statistics for a single-mention track: zero spread, identical
/// first and last positions, and matching min and max confidence.
#[test]
fn test_track_stats_singleton() {
    let text = "Paris is beautiful.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    let only_mention = doc.add_signal(Signal::new(
        0,
        Location::text(0, 5),
        "Paris",
        "Location",
        0.88,
    ));

    // A track holding just that one mention.
    let track_id = doc.add_track(Track::new(0, "Paris".to_string()));
    doc.add_signal_to_track(only_mention, track_id, 0);

    let stats = doc.get_track(track_id).unwrap().compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 1);
    assert_eq!(stats.spread, 0, "Singleton has zero spread");
    assert_eq!(stats.first_position, stats.last_position);
    assert!((stats.min_confidence - stats.max_confidence).abs() < 0.001);
}
6353}