1use super::confidence::Confidence;
90use super::entity::{
91 DiscontinuousSpan, Entity, EntityType, HierarchicalConfidence, Provenance, Span,
92};
93use serde::{Deserialize, Serialize};
94use std::collections::HashMap;
95
/// Coarse classification of how a piece of evidence is grounded.
///
/// [`Location::modality`] maps every location variant onto one of these values.
/// [`Modality::Symbolic`] is the `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum Modality {
    /// Assigned to bounding-box, cuboid and temporal locations. Reports geometric but
    /// not linguistic feature support.
    Iconic,
    /// Assigned to text, genomic and discontinuous-text locations. Reports linguistic
    /// but not geometric feature support. This is the default variant.
    #[default]
    Symbolic,
    /// Assigned to text-with-bounding-box locations. Reports both linguistic and
    /// geometric feature support.
    Hybrid,
}
134
135impl Modality {
136 #[must_use]
138 pub const fn supports_linguistic_features(&self) -> bool {
139 matches!(self, Self::Symbolic | Self::Hybrid)
140 }
141
142 #[must_use]
144 pub const fn supports_geometric_features(&self) -> bool {
145 matches!(self, Self::Iconic | Self::Hybrid)
146 }
147}
148
/// Where a signal sits in its source material.
///
/// Each variant carries the coordinates for one kind of source. See
/// [`Location::modality`] for how variants map onto [`Modality`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Location {
    /// A contiguous text span.
    Text {
        /// Start offset. `Signal::validate_against` interprets it as a character
        /// (not byte) index into the source text.
        start: usize,
        /// End offset; treated as exclusive by `overlaps`, `iou` and validation.
        end: usize,
    },
    /// A rectangle described by an origin and an extent, optionally tied to a page.
    BoundingBox {
        /// Horizontal coordinate that `width` extends from.
        /// NOTE(review): units/normalization are not visible here — confirm with producers.
        x: f32,
        /// Vertical coordinate that `height` extends from.
        y: f32,
        /// Horizontal extent; `overlaps`/`iou` treat the box as `x .. x + width`.
        width: f32,
        /// Vertical extent; `overlaps`/`iou` treat the box as `y .. y + height`.
        height: f32,
        /// Optional page number. Boxes whose `page` values differ never overlap.
        page: Option<u32>,
    },
    /// A time interval, optionally tied to a frame.
    Temporal {
        /// Interval start. Presumably seconds, going by the field name — TODO confirm.
        start_sec: f64,
        /// Interval end. Presumably seconds, going by the field name — TODO confirm.
        end_sec: f64,
        /// Optional frame number.
        frame: Option<u64>,
    },
    /// A box in three dimensions.
    Cuboid {
        /// Centre point (three components).
        center: [f32; 3],
        /// Extent along each of the three axes.
        dimensions: [f32; 3],
        /// Orientation as four components. Presumably a quaternion; component order is
        /// not visible here — TODO confirm.
        rotation: [f32; 4],
    },
    /// An interval on a named sequence.
    Genomic {
        /// Name of the contig the interval lies on.
        contig: String,
        /// Interval start.
        start: u64,
        /// Interval end.
        end: u64,
        /// Optional strand marker. NOTE(review): accepted characters are not visible here.
        strand: Option<char>,
    },
    /// A text mention made of several separate pieces.
    Discontinuous {
        /// `(start, end)` offset pairs, one per piece.
        segments: Vec<(usize, usize)>,
    },
    /// A text span paired with a second, boxed location.
    TextWithBbox {
        /// Start offset of the text span.
        start: usize,
        /// End offset of the text span.
        end: usize,
        /// The attached location. The type permits any variant, but
        /// `GroundedDocument::to_entities` only extracts it when it is a `BoundingBox`.
        bbox: Box<Location>,
    },
}
249
250impl Location {
251 #[must_use]
253 pub const fn text(start: usize, end: usize) -> Self {
254 Self::Text { start, end }
255 }
256
257 #[must_use]
259 pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
260 Self::BoundingBox {
261 x,
262 y,
263 width,
264 height,
265 page: None,
266 }
267 }
268
269 #[must_use]
271 pub const fn modality(&self) -> Modality {
272 match self {
273 Self::Text { .. } | Self::Genomic { .. } | Self::Discontinuous { .. } => {
274 Modality::Symbolic
275 }
276 Self::BoundingBox { .. } | Self::Cuboid { .. } => Modality::Iconic,
277 Self::Temporal { .. } => Modality::Iconic, Self::TextWithBbox { .. } => Modality::Hybrid,
279 }
280 }
281
282 #[must_use]
284 pub fn text_offsets(&self) -> Option<(usize, usize)> {
285 match self {
286 Self::Text { start, end } => Some((*start, *end)),
287 Self::TextWithBbox { start, end, .. } => Some((*start, *end)),
288 Self::Discontinuous { segments } => {
289 let start = segments.iter().map(|(s, _)| *s).min()?;
290 let end = segments.iter().map(|(_, e)| *e).max()?;
291 Some((start, end))
292 }
293 _ => None,
294 }
295 }
296
297 #[must_use]
299 pub fn overlaps(&self, other: &Self) -> bool {
300 match (self, other) {
301 (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
302 s1 < e2 && s2 < e1
303 }
304 (
305 Self::BoundingBox {
306 x: x1,
307 y: y1,
308 width: w1,
309 height: h1,
310 page: p1,
311 },
312 Self::BoundingBox {
313 x: x2,
314 y: y2,
315 width: w2,
316 height: h2,
317 page: p2,
318 },
319 ) => {
320 if p1 != p2 {
322 return false;
323 }
324 x1 < &(x2 + w2) && &(x1 + w1) > x2 && y1 < &(y2 + h2) && &(y1 + h1) > y2
326 }
327 _ => false, }
329 }
330
331 #[must_use]
335 pub fn iou(&self, other: &Self) -> Option<f64> {
336 match (self, other) {
337 (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
338 let intersection_start = (*s1).max(*s2);
339 let intersection_end = (*e1).min(*e2);
340 if intersection_start >= intersection_end {
341 return Some(0.0);
342 }
343 let intersection = (intersection_end - intersection_start) as f64;
344 let union = ((*e1).max(*e2) - (*s1).min(*s2)) as f64;
345 if union == 0.0 {
346 Some(0.0)
347 } else {
348 Some(intersection / union)
349 }
350 }
351 (
352 Self::BoundingBox {
353 x: x1,
354 y: y1,
355 width: w1,
356 height: h1,
357 page: p1,
358 },
359 Self::BoundingBox {
360 x: x2,
361 y: y2,
362 width: w2,
363 height: h2,
364 page: p2,
365 },
366 ) => {
367 if p1 != p2 {
368 return Some(0.0);
369 }
370 let x_overlap = (x1 + w1).min(x2 + w2) - x1.max(*x2);
371 let y_overlap = (y1 + h1).min(y2 + h2) - y1.max(*y2);
372 if x_overlap <= 0.0 || y_overlap <= 0.0 {
373 return Some(0.0);
374 }
375 let intersection = (x_overlap * y_overlap) as f64;
376 let area1 = (*w1 * *h1) as f64;
377 let area2 = (*w2 * *h2) as f64;
378 let union = area1 + area2 - intersection;
379 if union == 0.0 {
380 Some(0.0)
381 } else {
382 Some(intersection / union)
383 }
384 }
385 _ => None,
386 }
387 }
388}
389
390impl Default for Location {
391 fn default() -> Self {
392 Self::Text { start: 0, end: 0 }
393 }
394}
395
396impl From<&Span> for Location {
397 fn from(span: &Span) -> Self {
398 match span {
399 Span::Text { start, end } => Self::Text {
400 start: *start,
401 end: *end,
402 },
403 Span::BoundingBox {
404 x,
405 y,
406 width,
407 height,
408 page,
409 } => Self::BoundingBox {
410 x: *x,
411 y: *y,
412 width: *width,
413 height: *height,
414 page: *page,
415 },
416 Span::Hybrid { start, end, bbox } => Self::TextWithBbox {
417 start: *start,
418 end: *end,
419 bbox: Box::new(Location::from(bbox.as_ref())),
420 },
421 }
422 }
423}
424
425impl From<Span> for Location {
426 fn from(span: Span) -> Self {
427 Self::from(&span)
428 }
429}
430
431impl Location {
440 #[must_use]
445 pub fn to_span(&self) -> Option<Span> {
446 match self {
447 Self::Text { start, end } => Some(Span::Text {
448 start: *start,
449 end: *end,
450 }),
451 Self::BoundingBox {
452 x,
453 y,
454 width,
455 height,
456 page,
457 } => Some(Span::BoundingBox {
458 x: *x,
459 y: *y,
460 width: *width,
461 height: *height,
462 page: *page,
463 }),
464 Self::TextWithBbox { start, end, bbox } => {
465 let inner_span = bbox.to_span()?;
466 Some(Span::Hybrid {
467 start: *start,
468 end: *end,
469 bbox: Box::new(inner_span),
470 })
471 }
472 Self::Temporal { .. }
474 | Self::Cuboid { .. }
475 | Self::Genomic { .. }
476 | Self::Discontinuous { .. } => None,
477 }
478 }
479}
480
481pub use super::types::SignalId;
487
/// A single located observation: a surface string, a label and a confidence, anchored
/// at a location of type `L` ([`Location`] by default).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Signal<L = Location> {
    /// Identifier of this signal. `GroundedDocument::add_signal` overwrites it with a
    /// document-assigned id.
    pub id: SignalId,
    /// Where the signal was observed.
    pub location: L,
    /// The surface text of the signal. `validate_against` compares it with the source
    /// text at the signal's text offsets.
    pub surface: String,
    /// Type label attached to the signal.
    pub label: super::types::TypeLabel,
    /// Confidence score; `Signal::new` clamps it to `[0.0, 1.0]`.
    pub confidence: f32,
    /// Optional hierarchical confidence breakdown.
    pub hierarchical: Option<HierarchicalConfidence>,
    /// Optional provenance record.
    pub provenance: Option<Provenance>,
    /// Modality of the signal. `Signal::new` sets `Modality::default()`; it is not
    /// derived from `location` automatically.
    pub modality: Modality,
    /// Optional normalized form of the surface.
    pub normalized: Option<String>,
    /// Whether the signal is marked as negated (`false` from `Signal::new`).
    pub negated: bool,
    /// Optional quantifier attached to the signal.
    pub quantifier: Option<Quantifier>,
}
544
/// Quantification attached to a signal via `Signal::with_quantifier`.
///
/// Marked `#[non_exhaustive]`, so downstream matches need a wildcard arm.
/// NOTE(review): the variant meanings below are read from their names only — confirm
/// against the producing components.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Quantifier {
    /// Presumably universal quantification ("every", "all").
    Universal,
    /// Presumably existential quantification ("some", "a").
    Existential,
    /// Presumably negative quantification ("no", "none"). Not to be confused with
    /// `Option::None`; always write it as `Quantifier::None`.
    None,
    /// Presumably definite reference ("the").
    Definite,
    /// Presumably approximate quantity ("about", "roughly").
    Approximate,
    /// Presumably a bare form with no explicit quantifier.
    Bare,
}
565
566impl<L> Signal<L> {
567 #[must_use]
577 pub fn new(
578 id: impl Into<SignalId>,
579 location: L,
580 surface: impl Into<String>,
581 label: impl Into<super::types::TypeLabel>,
582 confidence: f32,
583 ) -> Self {
584 Self {
585 id: id.into(),
586 location,
587 surface: surface.into(),
588 label: label.into(),
589 confidence: confidence.clamp(0.0, 1.0),
590 hierarchical: None,
591 provenance: None,
592 modality: Modality::default(),
593 normalized: None,
594 negated: false,
595 quantifier: None,
596 }
597 }
598
599 #[must_use]
601 pub fn label(&self) -> &str {
602 self.label.as_str()
603 }
604
605 #[must_use]
607 pub fn type_label(&self) -> super::types::TypeLabel {
608 self.label.clone()
609 }
610
611 #[must_use]
613 pub fn surface(&self) -> &str {
614 &self.surface
615 }
616
617 #[must_use]
619 pub fn is_confident(&self, threshold: f32) -> bool {
620 self.confidence >= threshold
621 }
622
623 #[must_use]
625 pub fn with_modality(mut self, modality: Modality) -> Self {
626 self.modality = modality;
627 self
628 }
629
630 #[must_use]
632 pub fn negated(mut self) -> Self {
633 self.negated = true;
634 self
635 }
636
637 #[must_use]
639 pub fn with_quantifier(mut self, q: Quantifier) -> Self {
640 self.quantifier = Some(q);
641 self
642 }
643
644 #[must_use]
646 pub fn with_provenance(mut self, p: Provenance) -> Self {
647 self.provenance = Some(p);
648 self
649 }
650}
651
652impl Signal<Location> {
653 #[must_use]
655 pub fn text_offsets(&self) -> Option<(usize, usize)> {
656 self.location.text_offsets()
657 }
658
659 #[must_use]
676 pub fn validate_against(&self, source_text: &str) -> Option<SignalValidationError> {
677 let (start, end) = self.location.text_offsets()?;
678
679 let char_count = source_text.chars().count();
680
681 if end > char_count {
683 return Some(SignalValidationError::OutOfBounds {
684 signal_id: self.id,
685 end,
686 text_len: char_count,
687 });
688 }
689
690 if start >= end {
691 return Some(SignalValidationError::InvalidSpan {
692 signal_id: self.id,
693 start,
694 end,
695 });
696 }
697
698 let actual: String = source_text.chars().skip(start).take(end - start).collect();
700
701 if actual != self.surface {
702 return Some(SignalValidationError::TextMismatch {
703 signal_id: self.id,
704 expected: self.surface.clone(),
705 actual,
706 start,
707 end,
708 });
709 }
710
711 None
712 }
713
714 #[must_use]
716 pub fn is_valid(&self, source_text: &str) -> bool {
717 self.validate_against(source_text).is_none()
718 }
719
720 #[must_use]
735 pub fn from_text(
736 source: &str,
737 surface: &str,
738 label: impl Into<super::types::TypeLabel>,
739 confidence: f32,
740 ) -> Option<Self> {
741 Self::from_text_nth(source, surface, label, confidence, 0)
742 }
743
744 #[must_use]
746 pub fn from_text_nth(
747 source: &str,
748 surface: &str,
749 label: impl Into<super::types::TypeLabel>,
750 confidence: f32,
751 occurrence: usize,
752 ) -> Option<Self> {
753 for (count, (byte_idx, _)) in source.match_indices(surface).enumerate() {
755 if count == occurrence {
756 let start = source[..byte_idx].chars().count();
758 let end = start + surface.chars().count();
759
760 return Some(Self::new(
761 SignalId::ZERO,
762 Location::text(start, end),
763 surface,
764 label,
765 confidence,
766 ));
767 }
768 }
769
770 None
771 }
772}
773
/// Problems reported by `Signal::validate_against`.
#[derive(Debug, Clone, PartialEq)]
pub enum SignalValidationError {
    /// The span's end offset is past the end of the source text.
    OutOfBounds {
        /// Id of the offending signal.
        signal_id: SignalId,
        /// The end offset that was out of range.
        end: usize,
        /// Length of the source text in characters.
        text_len: usize,
    },
    /// The span is empty or inverted (`start >= end`).
    InvalidSpan {
        /// Id of the offending signal.
        signal_id: SignalId,
        /// Start offset of the span.
        start: usize,
        /// End offset of the span.
        end: usize,
    },
    /// The source text at the span differs from the signal's surface.
    TextMismatch {
        /// Id of the offending signal.
        signal_id: SignalId,
        /// The signal's surface text.
        expected: String,
        /// The text actually found in the source at `[start, end)`.
        actual: String,
        /// Start offset of the span.
        start: usize,
        /// End offset of the span.
        end: usize,
    },
}
809
810impl std::fmt::Display for SignalValidationError {
811 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
812 match self {
813 Self::OutOfBounds {
814 signal_id,
815 end,
816 text_len,
817 } => {
818 write!(
819 f,
820 "S{}: end offset {} exceeds text length {}",
821 signal_id, end, text_len
822 )
823 }
824 Self::InvalidSpan {
825 signal_id,
826 start,
827 end,
828 } => {
829 write!(f, "S{}: invalid span [{}, {})", signal_id, start, end)
830 }
831 Self::TextMismatch {
832 signal_id,
833 expected,
834 actual,
835 start,
836 end,
837 } => {
838 write!(
839 f,
840 "S{}: text mismatch at [{}, {}): expected '{}', found '{}'",
841 signal_id, start, end, expected, actual
842 )
843 }
844 }
845 }
846}
847
// Marker impl: relies on the `Debug` and `Display` impls and defines no error source.
impl std::error::Error for SignalValidationError {}
849
850impl From<&Entity> for Signal<Location> {
857 fn from(e: &Entity) -> Self {
858 let mut signal = Signal::new(
859 SignalId::ZERO,
860 Location::text(e.start, e.end),
861 &e.text,
862 e.entity_type.as_label(),
863 f32::from(e.confidence),
864 );
865 signal.normalized = e.normalized.clone();
866 signal.provenance = e.provenance.clone();
867 signal.hierarchical = e.hierarchical_confidence;
868 signal
869 }
870}
871
872pub use super::types::TrackId;
878
/// A track's reference to one of its signals.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SignalRef {
    /// Id of the referenced signal.
    pub signal_id: SignalId,
    /// Caller-supplied position of the signal within the track.
    /// `GroundedDocument::from_entities` uses the signal's index within its group.
    pub position: u32,
}
887
/// A reference to a track that also names the document it belongs to.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TrackRef {
    /// Id of the document that owns the track.
    pub doc_id: String,
    /// Id of the track within that document.
    pub track_id: TrackId,
}
900
/// A group of signals within one document, optionally linked to an [`Identity`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Track {
    /// Id of the track. `GroundedDocument::add_track` overwrites it with a
    /// document-assigned id.
    pub id: TrackId,
    /// References to the signals that make up the track.
    pub signals: Vec<SignalRef>,
    /// Optional type label for the track.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Representative surface form for the track.
    pub canonical_surface: String,
    /// Id of the linked identity, if any.
    pub identity_id: Option<IdentityId>,
    /// Confidence in the grouping; `Track::new` starts it at `1.0`.
    pub cluster_confidence: f32,
    /// Optional embedding vector. NOTE(review): dimensionality and model are not
    /// visible here.
    pub embedding: Option<Vec<f32>>,
}
940
941impl Track {
942 #[must_use]
944 pub fn new(id: impl Into<TrackId>, canonical_surface: impl Into<String>) -> Self {
945 Self {
946 id: id.into(),
947 signals: Vec::new(),
948 entity_type: None,
949 canonical_surface: canonical_surface.into(),
950 identity_id: None,
951 cluster_confidence: 1.0,
952 embedding: None,
953 }
954 }
955
956 pub fn add_signal(&mut self, signal_id: impl Into<SignalId>, position: u32) {
958 let signal_id = signal_id.into();
959 self.signals.push(SignalRef {
960 signal_id,
961 position,
962 });
963 }
964
965 #[must_use]
967 pub fn len(&self) -> usize {
968 self.signals.len()
969 }
970
971 #[must_use]
973 pub fn is_empty(&self) -> bool {
974 self.signals.is_empty()
975 }
976
977 #[must_use]
979 pub fn is_singleton(&self) -> bool {
980 self.signals.len() == 1
981 }
982
983 #[must_use]
985 pub const fn id(&self) -> TrackId {
986 self.id
987 }
988
989 #[must_use]
991 pub fn signals(&self) -> &[SignalRef] {
992 &self.signals
993 }
994
995 #[must_use]
997 pub fn canonical_surface(&self) -> &str {
998 &self.canonical_surface
999 }
1000
1001 #[must_use]
1003 pub const fn identity_id(&self) -> Option<IdentityId> {
1004 self.identity_id
1005 }
1006
1007 #[must_use]
1009 pub const fn cluster_confidence(&self) -> f32 {
1010 self.cluster_confidence
1011 }
1012
1013 pub fn set_cluster_confidence(&mut self, confidence: f32) {
1015 self.cluster_confidence = confidence.clamp(0.0, 1.0);
1016 }
1017
1018 pub fn set_identity_id(&mut self, identity_id: IdentityId) {
1020 self.identity_id = Some(identity_id);
1021 }
1022
1023 pub fn clear_identity_id(&mut self) {
1025 self.identity_id = None;
1026 }
1027
1028 #[must_use]
1030 pub fn with_identity(mut self, identity_id: IdentityId) -> Self {
1031 self.identity_id = Some(identity_id);
1032 self
1033 }
1034
1035 #[must_use]
1039 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1040 let s = entity_type.into();
1041 self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1042 self
1043 }
1044
1045 #[must_use]
1059 pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1060 self.entity_type = Some(label);
1061 self
1062 }
1063
1064 #[must_use]
1069 pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1070 self.entity_type.clone()
1071 }
1072
1073 #[must_use]
1075 pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1076 self.embedding = Some(embedding);
1077 self
1078 }
1079
1080 pub fn compute_spread(&self, doc: &GroundedDocument) -> Option<usize> {
1084 if self.signals.is_empty() {
1085 return Some(0);
1086 }
1087
1088 let positions: Vec<usize> = self
1089 .signals
1090 .iter()
1091 .filter_map(|sr| {
1092 doc.signals
1093 .iter()
1094 .find(|s| s.id == sr.signal_id)
1095 .and_then(|s| s.location.text_offsets())
1096 .map(|(start, _)| start)
1097 })
1098 .collect();
1099
1100 if positions.is_empty() {
1101 return None;
1102 }
1103
1104 let min_pos = *positions.iter().min().expect("positions non-empty");
1105 let max_pos = *positions.iter().max().expect("positions non-empty");
1106 Some(max_pos.saturating_sub(min_pos))
1107 }
1108
1109 pub fn collect_variations(&self, doc: &GroundedDocument) -> Vec<String> {
1113 let mut variations: std::collections::HashSet<String> = std::collections::HashSet::new();
1114
1115 for sr in &self.signals {
1116 if let Some(signal) = doc.signals.iter().find(|s| s.id == sr.signal_id) {
1117 variations.insert(signal.surface.clone());
1118 }
1119 }
1120
1121 variations.into_iter().collect()
1122 }
1123
1124 pub fn confidence_stats(&self, doc: &GroundedDocument) -> Option<(f32, f32, f32)> {
1128 let confidences: Vec<f32> = self
1129 .signals
1130 .iter()
1131 .filter_map(|sr| {
1132 doc.signals
1133 .iter()
1134 .find(|s| s.id == sr.signal_id)
1135 .map(|s| s.confidence)
1136 })
1137 .collect();
1138
1139 if confidences.is_empty() {
1140 return None;
1141 }
1142
1143 let min = confidences.iter().cloned().fold(f32::INFINITY, f32::min);
1144 let max = confidences
1145 .iter()
1146 .cloned()
1147 .fold(f32::NEG_INFINITY, f32::max);
1148 let mean = confidences.iter().sum::<f32>() / confidences.len() as f32;
1149
1150 Some((min, max, mean))
1151 }
1152
1153 pub fn compute_stats(&self, doc: &GroundedDocument, text_len: usize) -> TrackStats {
1157 let chain_length = self.signals.len();
1158 let spread = self.compute_spread(doc).unwrap_or(0);
1159 let variations = self.collect_variations(doc);
1160 let (min_conf, max_conf, mean_conf) = self.confidence_stats(doc).unwrap_or((0.0, 0.0, 0.0));
1161
1162 let positions: Vec<usize> = self
1164 .signals
1165 .iter()
1166 .filter_map(|sr| {
1167 doc.signals
1168 .iter()
1169 .find(|s| s.id == sr.signal_id)
1170 .and_then(|s| s.location.text_offsets())
1171 .map(|(start, _)| start)
1172 })
1173 .collect();
1174
1175 let first_position = positions.iter().min().copied().unwrap_or(0);
1176 let last_position = positions.iter().max().copied().unwrap_or(0);
1177 let relative_spread = if text_len > 0 {
1178 spread as f64 / text_len as f64
1179 } else {
1180 0.0
1181 };
1182
1183 TrackStats {
1184 chain_length,
1185 variation_count: variations.len(),
1186 variations,
1187 spread,
1188 relative_spread,
1189 first_position,
1190 last_position,
1191 min_confidence: min_conf,
1192 max_confidence: max_conf,
1193 mean_confidence: mean_conf,
1194 has_embedding: self.embedding.is_some(),
1195 }
1196 }
1197}
1198
/// Summary statistics for a track, as produced by `Track::compute_stats`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TrackStats {
    /// Number of signal references in the track.
    pub chain_length: usize,
    /// Number of distinct surface forms (`variations.len()`).
    pub variation_count: usize,
    /// The distinct surface forms.
    pub variations: Vec<String>,
    /// Distance between the smallest and largest text start offset.
    pub spread: usize,
    /// `spread` divided by the text length passed to `compute_stats` (`0.0` if that
    /// length is zero).
    pub relative_spread: f64,
    /// Smallest text start offset among the track's signals (`0` if none).
    pub first_position: usize,
    /// Largest text start offset among the track's signals (`0` if none).
    pub last_position: usize,
    /// Lowest signal confidence (`0.0` if no signal resolved).
    pub min_confidence: f32,
    /// Highest signal confidence (`0.0` if no signal resolved).
    pub max_confidence: f32,
    /// Mean signal confidence (`0.0` if no signal resolved).
    pub mean_confidence: f32,
    /// Whether the track carries an embedding.
    pub has_embedding: bool,
}
1225
1226pub use super::types::IdentityId;
1232
/// Records what an [`Identity`] was derived from.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum IdentitySource {
    /// Derived from tracks, possibly spanning several documents.
    CrossDocCoref {
        /// The contributing tracks, each with its document id.
        track_refs: Vec<TrackRef>,
    },
    /// Taken from a knowledge base entry (this is what `Identity::from_kb` records).
    KnowledgeBase {
        /// Name of the knowledge base.
        kb_name: String,
        /// Entry id within that knowledge base.
        kb_id: String,
    },
    /// Backed by both tracks and a knowledge base entry.
    Hybrid {
        /// The contributing tracks, each with its document id.
        track_refs: Vec<TrackRef>,
        /// Name of the knowledge base.
        kb_name: String,
        /// Entry id within that knowledge base.
        kb_id: String,
    },
}
1264
/// An identity that tracks can be linked to.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Identity {
    /// Id of the identity. `GroundedDocument::add_identity` overwrites it with a
    /// document-assigned id.
    pub id: IdentityId,
    /// Canonical name of the identity.
    pub canonical_name: String,
    /// Optional type label.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Optional knowledge-base entry id.
    pub kb_id: Option<String>,
    /// Optional knowledge-base name.
    pub kb_name: Option<String>,
    /// Optional free-text description.
    pub description: Option<String>,
    /// Optional embedding vector. NOTE(review): dimensionality and model are not
    /// visible here.
    pub embedding: Option<Vec<f32>>,
    /// Alternative names; `add_alias` appends without de-duplicating.
    pub aliases: Vec<String>,
    /// Confidence in the identity; constructors start it at `1.0`.
    pub confidence: f32,
    /// Where the identity came from. Omitted from serialized output when `None` and
    /// defaulted to `None` when missing on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<IdentitySource>,
}
1313
1314impl Identity {
1315 #[must_use]
1317 pub fn new(id: impl Into<IdentityId>, canonical_name: impl Into<String>) -> Self {
1318 Self {
1319 id: id.into(),
1320 canonical_name: canonical_name.into(),
1321 entity_type: None,
1322 kb_id: None,
1323 kb_name: None,
1324 description: None,
1325 embedding: None,
1326 aliases: Vec::new(),
1327 confidence: 1.0,
1328 source: None,
1329 }
1330 }
1331
1332 #[must_use]
1334 pub fn from_kb(
1335 id: impl Into<IdentityId>,
1336 canonical_name: impl Into<String>,
1337 kb_name: impl Into<String>,
1338 kb_id: impl Into<String>,
1339 ) -> Self {
1340 let kb_name_str = kb_name.into();
1341 let kb_id_str = kb_id.into();
1342 Self {
1343 id: id.into(),
1344 canonical_name: canonical_name.into(),
1345 entity_type: None,
1346 kb_id: Some(kb_id_str.clone()),
1347 kb_name: Some(kb_name_str.clone()),
1348 description: None,
1349 embedding: None,
1350 aliases: Vec::new(),
1351 confidence: 1.0,
1352 source: Some(IdentitySource::KnowledgeBase {
1353 kb_name: kb_name_str,
1354 kb_id: kb_id_str,
1355 }),
1356 }
1357 }
1358
1359 pub fn add_alias(&mut self, alias: impl Into<String>) {
1361 self.aliases.push(alias.into());
1362 }
1363
1364 #[must_use]
1366 pub const fn id(&self) -> IdentityId {
1367 self.id
1368 }
1369
1370 #[must_use]
1372 pub fn canonical_name(&self) -> &str {
1373 &self.canonical_name
1374 }
1375
1376 #[must_use]
1378 pub fn kb_id(&self) -> Option<&str> {
1379 self.kb_id.as_deref()
1380 }
1381
1382 #[must_use]
1384 pub fn kb_name(&self) -> Option<&str> {
1385 self.kb_name.as_deref()
1386 }
1387
1388 #[must_use]
1390 pub fn aliases(&self) -> &[String] {
1391 &self.aliases
1392 }
1393
1394 #[must_use]
1396 pub const fn confidence(&self) -> f32 {
1397 self.confidence
1398 }
1399
1400 pub fn set_confidence(&mut self, confidence: f32) {
1402 self.confidence = confidence.clamp(0.0, 1.0);
1403 }
1404
1405 #[must_use]
1407 pub fn source(&self) -> Option<&IdentitySource> {
1408 self.source.as_ref()
1409 }
1410
1411 #[must_use]
1413 pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1414 self.embedding = Some(embedding);
1415 self
1416 }
1417
1418 #[must_use]
1422 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1423 let s = entity_type.into();
1424 self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1425 self
1426 }
1427
1428 #[must_use]
1433 pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1434 self.entity_type = Some(label);
1435 self
1436 }
1437
1438 #[must_use]
1443 pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1444 self.entity_type.clone()
1445 }
1446
1447 #[must_use]
1449 pub fn with_description(mut self, description: impl Into<String>) -> Self {
1450 self.description = Some(description.into());
1451 self
1452 }
1453
1454 }
1456
/// Deserialization-only mirror of the persisted fields of `GroundedDocument`.
///
/// `GroundedDocument` deserializes through this type (`#[serde(from = ...)]`) so that
/// its skipped index fields can be rebuilt afterwards.
#[derive(Deserialize)]
struct GroundedDocumentWire {
    // Document identifier.
    id: String,
    // Source text of the document.
    text: String,
    // All signals, in stored order.
    signals: Vec<Signal<Location>>,
    // Tracks keyed by track id.
    tracks: HashMap<TrackId, Track>,
    // Identities keyed by identity id.
    identities: HashMap<IdentityId, Identity>,
}
1472
1473impl From<GroundedDocumentWire> for GroundedDocument {
1474 fn from(wire: GroundedDocumentWire) -> Self {
1475 let mut doc = Self {
1476 id: wire.id,
1477 text: wire.text,
1478 signals: wire.signals,
1479 tracks: wire.tracks,
1480 identities: wire.identities,
1481 signal_to_track: HashMap::new(),
1482 track_to_identity: HashMap::new(),
1483 next_signal_id: SignalId::ZERO,
1484 next_track_id: TrackId::ZERO,
1485 next_identity_id: IdentityId::ZERO,
1486 };
1487 doc.rebuild_indexes();
1488 doc
1489 }
1490}
1491
/// A document together with its signals, tracks and identities.
///
/// The private fields are lookup indexes and id counters. They are skipped by serde and
/// rebuilt from the public fields on deserialization (via `GroundedDocumentWire` and
/// `rebuild_indexes`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(from = "GroundedDocumentWire")]
pub struct GroundedDocument {
    /// Document identifier.
    pub id: String,
    /// Source text of the document.
    pub text: String,
    /// All signals, in insertion order.
    pub signals: Vec<Signal<Location>>,
    /// Tracks keyed by track id.
    pub tracks: HashMap<TrackId, Track>,
    /// Identities keyed by identity id.
    pub identities: HashMap<IdentityId, Identity>,
    // Index: which track each signal belongs to.
    #[serde(skip)]
    signal_to_track: HashMap<SignalId, TrackId>,
    // Index: which identity each track is linked to.
    #[serde(skip)]
    track_to_identity: HashMap<TrackId, IdentityId>,
    // Id that the next added signal will receive.
    #[serde(skip)]
    next_signal_id: SignalId,
    // Id that the next added track will receive.
    #[serde(skip)]
    next_track_id: TrackId,
    // Id that the next added identity will receive.
    #[serde(skip)]
    next_identity_id: IdentityId,
}
1589
1590impl GroundedDocument {
1591 #[must_use]
1593 pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
1594 Self {
1595 id: id.into(),
1596 text: text.into(),
1597 signals: Vec::new(),
1598 tracks: HashMap::new(),
1599 identities: HashMap::new(),
1600 signal_to_track: HashMap::new(),
1601 track_to_identity: HashMap::new(),
1602 next_signal_id: SignalId::ZERO,
1603 next_track_id: TrackId::ZERO,
1604 next_identity_id: IdentityId::ZERO,
1605 }
1606 }
1607
1608 pub fn rebuild_indexes(&mut self) {
1616 self.signal_to_track.clear();
1617 self.track_to_identity.clear();
1618
1619 for (&track_id, track) in &self.tracks {
1620 for sig_ref in &track.signals {
1621 self.signal_to_track.insert(sig_ref.signal_id, track_id);
1622 }
1623 if let Some(identity_id) = track.identity_id {
1624 self.track_to_identity.insert(track_id, identity_id);
1625 }
1626 }
1627
1628 self.next_signal_id = self
1629 .signals
1630 .iter()
1631 .map(|s| s.id)
1632 .max()
1633 .map_or(SignalId::ZERO, |id| id + 1);
1634 self.next_track_id = self
1635 .tracks
1636 .keys()
1637 .copied()
1638 .max()
1639 .map_or(TrackId::ZERO, |id| id + 1);
1640 self.next_identity_id = self
1641 .identities
1642 .keys()
1643 .copied()
1644 .max()
1645 .map_or(IdentityId::ZERO, |id| id + 1);
1646 }
1647
1648 pub fn add_signal(&mut self, mut signal: Signal<Location>) -> SignalId {
1654 let id = self.next_signal_id;
1655 signal.id = id;
1656 self.signals.push(signal);
1657 self.next_signal_id += 1;
1658 id
1659 }
1660
1661 #[must_use]
1663 pub fn get_signal(&self, id: impl Into<SignalId>) -> Option<&Signal<Location>> {
1664 let id = id.into();
1665 self.signals.iter().find(|s| s.id == id)
1666 }
1667
1668 pub fn signals(&self) -> &[Signal<Location>] {
1670 &self.signals
1671 }
1672
1673 pub fn add_track(&mut self, mut track: Track) -> TrackId {
1679 let id = self.next_track_id;
1680 track.id = id;
1681
1682 for signal_ref in &track.signals {
1684 self.signal_to_track.insert(signal_ref.signal_id, id);
1685 }
1686
1687 self.tracks.insert(id, track);
1688 self.next_track_id += 1;
1689 id
1690 }
1691
1692 #[must_use]
1694 pub fn get_track(&self, id: impl Into<TrackId>) -> Option<&Track> {
1695 self.tracks.get(&id.into())
1696 }
1697
1698 #[must_use]
1700 pub fn get_track_mut(&mut self, id: impl Into<TrackId>) -> Option<&mut Track> {
1701 self.tracks.get_mut(&id.into())
1702 }
1703
1704 pub fn add_signal_to_track(
1709 &mut self,
1710 signal_id: impl Into<SignalId>,
1711 track_id: impl Into<TrackId>,
1712 position: u32,
1713 ) -> bool {
1714 let signal_id = signal_id.into();
1715 let track_id = track_id.into();
1716 if let Some(track) = self.tracks.get_mut(&track_id) {
1717 track.add_signal(signal_id, position);
1718 self.signal_to_track.insert(signal_id, track_id);
1719 true
1720 } else {
1721 false
1722 }
1723 }
1724
1725 #[must_use]
1727 pub fn track_for_signal(&self, signal_id: SignalId) -> Option<&Track> {
1728 let track_id = self.signal_to_track.get(&signal_id)?;
1729 self.tracks.get(track_id)
1730 }
1731
1732 pub fn tracks(&self) -> impl Iterator<Item = &Track> {
1734 self.tracks.values()
1735 }
1736
1737 pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
1743 let id = self.next_identity_id;
1744 identity.id = id;
1745 self.identities.insert(id, identity);
1746 self.next_identity_id += 1;
1747 id
1748 }
1749
1750 pub fn link_track_to_identity(
1752 &mut self,
1753 track_id: impl Into<TrackId>,
1754 identity_id: impl Into<IdentityId>,
1755 ) {
1756 let track_id = track_id.into();
1757 let identity_id = identity_id.into();
1758 if let Some(track) = self.tracks.get_mut(&track_id) {
1759 track.identity_id = Some(identity_id);
1760 self.track_to_identity.insert(track_id, identity_id);
1761 }
1762 }
1763
1764 #[must_use]
1766 pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
1767 self.identities.get(&id)
1768 }
1769
1770 #[must_use]
1772 pub fn identity_for_track(&self, track_id: TrackId) -> Option<&Identity> {
1773 let identity_id = self.track_to_identity.get(&track_id)?;
1774 self.identities.get(identity_id)
1775 }
1776
1777 #[must_use]
1779 pub fn identity_for_signal(&self, signal_id: SignalId) -> Option<&Identity> {
1780 let track_id = self.signal_to_track.get(&signal_id)?;
1781 self.identity_for_track(*track_id)
1782 }
1783
1784 pub fn identities(&self) -> impl Iterator<Item = &Identity> {
1786 self.identities.values()
1787 }
1788
1789 #[must_use]
1794 pub fn track_ref(&self, track_id: TrackId) -> Option<TrackRef> {
1795 if self.tracks.contains_key(&track_id) {
1797 Some(TrackRef {
1798 doc_id: self.id.clone(),
1799 track_id,
1800 })
1801 } else {
1802 None
1803 }
1804 }
1805
1806 #[must_use]
1812 pub fn to_entities(&self) -> Vec<Entity> {
1813 self.signals
1814 .iter()
1815 .map(|signal| {
1816 let (start, end) = signal.location.text_offsets().unwrap_or((0, 0));
1817 let track = self.track_for_signal(signal.id);
1818 let identity = track.and_then(|t| self.identity_for_track(t.id));
1819
1820 Entity {
1821 text: signal.surface.clone(),
1822 entity_type: EntityType::from_label(signal.label.as_str()),
1823 start,
1824 end,
1825 confidence: Confidence::from(signal.confidence),
1826 normalized: signal.normalized.clone(),
1827 provenance: signal.provenance.clone(),
1828 kb_id: identity.and_then(|i| i.kb_id.clone()),
1829 canonical_id: track.map(|t| super::types::CanonicalId::new(t.id.get())),
1830 hierarchical_confidence: signal.hierarchical,
1831 visual_span: match &signal.location {
1832 Location::BoundingBox {
1833 x,
1834 y,
1835 width,
1836 height,
1837 page,
1838 } => Some(Span::BoundingBox {
1839 x: *x,
1840 y: *y,
1841 width: *width,
1842 height: *height,
1843 page: *page,
1844 }),
1845 Location::TextWithBbox { bbox, .. } => {
1846 if let Location::BoundingBox {
1847 x,
1848 y,
1849 width,
1850 height,
1851 page,
1852 } = bbox.as_ref()
1853 {
1854 Some(Span::BoundingBox {
1855 x: *x,
1856 y: *y,
1857 width: *width,
1858 height: *height,
1859 page: *page,
1860 })
1861 } else {
1862 None
1863 }
1864 }
1865 _ => None,
1866 },
1867 discontinuous_span: match &signal.location {
1868 Location::Discontinuous { segments } => Some(DiscontinuousSpan::new(
1869 segments.iter().map(|(s, e)| (*s)..(*e)).collect(),
1870 )),
1871 _ => None,
1872 },
1873 valid_from: None,
1874 valid_until: None,
1875 viewport: None,
1876 phi_features: None,
1877 mention_type: None,
1878 }
1879 })
1880 .collect()
1881 }
1882
1883 #[must_use]
1885 pub fn from_entities(
1886 id: impl Into<String>,
1887 text: impl Into<String>,
1888 entities: &[Entity],
1889 ) -> Self {
1890 let mut doc = Self::new(id, text);
1891
1892 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1898 enum TrackKey {
1899 Canonical(super::types::CanonicalId),
1900 Singleton(usize),
1901 }
1902
1903 let mut tracks_map: HashMap<TrackKey, Vec<SignalId>> = HashMap::new();
1904 let mut signal_to_entity_idx: HashMap<SignalId, usize> = HashMap::new();
1905
1906 for (idx, entity) in entities.iter().enumerate() {
1907 let location = if let Some(disc) = &entity.discontinuous_span {
1908 Location::Discontinuous {
1909 segments: disc.segments().iter().map(|r| (r.start, r.end)).collect(),
1910 }
1911 } else if let Some(visual) = &entity.visual_span {
1912 Location::from(visual)
1913 } else {
1914 Location::text(entity.start, entity.end)
1915 };
1916
1917 let mut signal = Signal::new(
1918 SignalId::new(idx as u64),
1919 location,
1920 &entity.text,
1921 entity.entity_type.as_label(),
1922 f32::from(entity.confidence),
1923 );
1924 signal.normalized = entity.normalized.clone();
1925 signal.provenance = entity.provenance.clone();
1926 signal.hierarchical = entity.hierarchical_confidence;
1927
1928 let signal_id = doc.add_signal(signal);
1929 signal_to_entity_idx.insert(signal_id, idx);
1930
1931 let key = match entity.canonical_id {
1932 Some(cid) => TrackKey::Canonical(cid),
1933 None => TrackKey::Singleton(idx),
1934 };
1935 tracks_map.entry(key).or_default().push(signal_id);
1936 }
1937
1938 for (_key, signal_ids) in tracks_map {
1940 if let Some(first_signal) = signal_ids.first().and_then(|id| doc.get_signal(*id)) {
1941 let mut track = Track::new(doc.next_track_id, &first_signal.surface);
1942 track.entity_type =
1943 Some(super::types::TypeLabel::from(first_signal.label.as_str()));
1944
1945 for (pos, &signal_id) in signal_ids.iter().enumerate() {
1946 track.add_signal(signal_id, pos as u32);
1947 }
1948
1949 let kb_id = signal_ids.iter().find_map(|sid| {
1952 let ent_idx = signal_to_entity_idx.get(sid).copied()?;
1953 entities.get(ent_idx)?.kb_id.clone()
1954 });
1955 if let Some(kb_id) = kb_id {
1956 let identity = Identity::from_kb(
1957 doc.next_identity_id,
1958 &track.canonical_surface,
1959 "unknown",
1960 kb_id,
1961 );
1962 let identity_id = doc.add_identity(identity);
1963 track = track.with_identity(identity_id);
1964 }
1965
1966 doc.add_track(track);
1967 }
1968 }
1969
1970 doc
1971 }
1972
1973 #[must_use]
1975 pub fn signals_with_label(&self, label: &str) -> Vec<&Signal<Location>> {
1976 let want = super::types::TypeLabel::from(label);
1977 self.signals.iter().filter(|s| s.label == want).collect()
1978 }
1979
1980 #[must_use]
1982 pub fn confident_signals(&self, threshold: f32) -> Vec<&Signal<Location>> {
1983 self.signals
1984 .iter()
1985 .filter(|s| s.confidence >= threshold)
1986 .collect()
1987 }
1988
1989 pub fn linked_tracks(&self) -> impl Iterator<Item = &Track> {
1991 self.tracks.values().filter(|t| t.identity_id.is_some())
1992 }
1993
1994 pub fn unlinked_tracks(&self) -> impl Iterator<Item = &Track> {
1996 self.tracks.values().filter(|t| t.identity_id.is_none())
1997 }
1998
1999 #[must_use]
2001 pub fn untracked_signal_count(&self) -> usize {
2002 self.signals
2003 .iter()
2004 .filter(|s| !self.signal_to_track.contains_key(&s.id))
2005 .count()
2006 }
2007
2008 #[must_use]
2010 pub fn untracked_signals(&self) -> Vec<&Signal<Location>> {
2011 self.signals
2012 .iter()
2013 .filter(|s| !self.signal_to_track.contains_key(&s.id))
2014 .collect()
2015 }
2016
2017 #[must_use]
2023 pub fn signals_by_modality(&self, modality: Modality) -> Vec<&Signal<Location>> {
2024 self.signals
2025 .iter()
2026 .filter(|s| s.modality == modality)
2027 .collect()
2028 }
2029
2030 #[must_use]
2032 pub fn text_signals(&self) -> Vec<&Signal<Location>> {
2033 self.signals_by_modality(Modality::Symbolic)
2034 }
2035
2036 #[must_use]
2038 pub fn visual_signals(&self) -> Vec<&Signal<Location>> {
2039 self.signals_by_modality(Modality::Iconic)
2040 }
2041
2042 #[must_use]
2044 pub fn overlapping_signals(&self, location: &Location) -> Vec<&Signal<Location>> {
2045 self.signals
2046 .iter()
2047 .filter(|s| s.location.overlaps(location))
2048 .collect()
2049 }
2050
2051 #[must_use]
2053 pub fn signals_in_range(&self, start: usize, end: usize) -> Vec<&Signal<Location>> {
2054 self.signals
2055 .iter()
2056 .filter(|s| {
2057 if let Some((s_start, s_end)) = s.location.text_offsets() {
2058 s_start >= start && s_end <= end
2059 } else {
2060 false
2061 }
2062 })
2063 .collect()
2064 }
2065
2066 #[must_use]
2068 pub fn negated_signals(&self) -> Vec<&Signal<Location>> {
2069 self.signals.iter().filter(|s| s.negated).collect()
2070 }
2071
2072 #[must_use]
2074 pub fn quantified_signals(&self, quantifier: Quantifier) -> Vec<&Signal<Location>> {
2075 self.signals
2076 .iter()
2077 .filter(|s| s.quantifier == Some(quantifier))
2078 .collect()
2079 }
2080
2081 #[must_use]
2103 pub fn validate(&self) -> Vec<SignalValidationError> {
2104 self.signals
2105 .iter()
2106 .filter_map(|s| s.validate_against(&self.text))
2107 .collect()
2108 }
2109
2110 #[must_use]
2134 pub fn validate_invariants(&self) -> Vec<String> {
2135 let mut errors = Vec::new();
2136
2137 let mut seen_ids = std::collections::HashSet::new();
2139 for signal in &self.signals {
2140 if !seen_ids.insert(signal.id) {
2141 errors.push(format!("Duplicate signal ID: {}", signal.id));
2142 }
2143 }
2144
2145 let signal_ids: std::collections::HashSet<_> = self.signals.iter().map(|s| s.id).collect();
2147
2148 for (track_id, track) in &self.tracks {
2150 for signal_ref in &track.signals {
2151 if !signal_ids.contains(&signal_ref.signal_id) {
2152 errors.push(format!(
2153 "Track {} references non-existent signal {}",
2154 track_id, signal_ref.signal_id
2155 ));
2156 }
2157 }
2158 }
2159
2160 for (signal_id, track_id) in &self.signal_to_track {
2162 if let Some(track) = self.tracks.get(track_id) {
2164 if !track.signals.iter().any(|r| r.signal_id == *signal_id) {
2166 errors.push(format!(
2167 "signal_to_track[{}] = {} but track doesn't contain signal",
2168 signal_id, track_id
2169 ));
2170 }
2171 } else {
2172 errors.push(format!(
2173 "signal_to_track[{}] = {} but track doesn't exist",
2174 signal_id, track_id
2175 ));
2176 }
2177 }
2178
2179 for (track_id, identity_id) in &self.track_to_identity {
2181 if let Some(track) = self.tracks.get(track_id) {
2183 if track.identity_id != Some(*identity_id) {
2184 errors.push(format!(
2185 "track_to_identity[{}] = {} but track.identity_id = {:?}",
2186 track_id, identity_id, track.identity_id
2187 ));
2188 }
2189 } else {
2190 errors.push(format!(
2191 "track_to_identity[{}] = {} but track doesn't exist",
2192 track_id, identity_id
2193 ));
2194 }
2195
2196 if !self.identities.contains_key(identity_id) {
2198 errors.push(format!(
2199 "track_to_identity[{}] = {} but identity doesn't exist",
2200 track_id, identity_id
2201 ));
2202 }
2203 }
2204
2205 for (track_id, track) in &self.tracks {
2207 if let Some(identity_id) = track.identity_id {
2208 if !self.identities.contains_key(&identity_id) {
2209 errors.push(format!(
2210 "Track {} references non-existent identity {}",
2211 track_id, identity_id
2212 ));
2213 }
2214 }
2215 }
2216
2217 errors
2218 }
2219
2220 #[must_use]
2222 pub fn invariants_hold(&self) -> bool {
2223 self.validate_invariants().is_empty()
2224 }
2225
2226 #[must_use]
2228 pub fn is_valid(&self) -> bool {
2229 self.signals.iter().all(|s| s.is_valid(&self.text))
2230 }
2231
2232 pub fn add_signal_validated(
2236 &mut self,
2237 signal: Signal<Location>,
2238 ) -> Result<SignalId, SignalValidationError> {
2239 if let Some(err) = signal.validate_against(&self.text) {
2240 return Err(err);
2241 }
2242 Ok(self.add_signal(signal))
2243 }
2244
2245 pub fn add_signal_from_text(
2259 &mut self,
2260 surface: &str,
2261 label: impl Into<super::types::TypeLabel>,
2262 confidence: f32,
2263 ) -> Option<SignalId> {
2264 let signal = Signal::from_text(&self.text, surface, label, confidence)?;
2265 Some(self.add_signal(signal))
2266 }
2267
2268 pub fn add_signal_from_text_nth(
2270 &mut self,
2271 surface: &str,
2272 label: impl Into<super::types::TypeLabel>,
2273 confidence: f32,
2274 occurrence: usize,
2275 ) -> Option<SignalId> {
2276 let signal = Signal::from_text_nth(&self.text, surface, label, confidence, occurrence)?;
2277 Some(self.add_signal(signal))
2278 }
2279
2280 #[must_use]
2286 pub fn stats(&self) -> DocumentStats {
2287 let signal_count = self.signals.len();
2288 let track_count = self.tracks.len();
2289 let identity_count = self.identities.len();
2290
2291 let linked_track_count = self
2292 .tracks
2293 .values()
2294 .filter(|t| t.identity_id.is_some())
2295 .count();
2296 let untracked_count = self.untracked_signal_count();
2297
2298 let avg_track_size = if track_count > 0 {
2299 self.tracks.values().map(|t| t.len()).sum::<usize>() as f32 / track_count as f32
2300 } else {
2301 0.0
2302 };
2303
2304 let singleton_count = self.tracks.values().filter(|t| t.is_singleton()).count();
2305
2306 let avg_confidence = if signal_count > 0 {
2307 self.signals.iter().map(|s| s.confidence).sum::<f32>() / signal_count as f32
2308 } else {
2309 0.0
2310 };
2311
2312 let negated_count = self.signals.iter().filter(|s| s.negated).count();
2313
2314 let symbolic_count = self
2316 .signals
2317 .iter()
2318 .filter(|s| s.modality == Modality::Symbolic)
2319 .count();
2320 let iconic_count = self
2321 .signals
2322 .iter()
2323 .filter(|s| s.modality == Modality::Iconic)
2324 .count();
2325 let hybrid_count = self
2326 .signals
2327 .iter()
2328 .filter(|s| s.modality == Modality::Hybrid)
2329 .count();
2330
2331 DocumentStats {
2332 signal_count,
2333 track_count,
2334 identity_count,
2335 linked_track_count,
2336 untracked_count,
2337 avg_track_size,
2338 singleton_count,
2339 avg_confidence,
2340 negated_count,
2341 symbolic_count,
2342 iconic_count,
2343 hybrid_count,
2344 }
2345 }
2346
2347 pub fn add_signals(
2355 &mut self,
2356 signals: impl IntoIterator<Item = Signal<Location>>,
2357 ) -> Vec<SignalId> {
2358 signals.into_iter().map(|s| self.add_signal(s)).collect()
2359 }
2360
2361 pub fn create_track_from_signals(
2365 &mut self,
2366 canonical: impl Into<String>,
2367 signal_ids: &[SignalId],
2368 ) -> Option<TrackId> {
2369 if signal_ids.is_empty() {
2370 return None;
2371 }
2372
2373 let mut track = Track::new(TrackId::ZERO, canonical);
2374 for (pos, &id) in signal_ids.iter().enumerate() {
2375 track.add_signal(id, pos as u32);
2376 }
2377 Some(self.add_track(track))
2378 }
2379
2380 pub fn merge_tracks(&mut self, track_ids: &[TrackId]) -> Option<TrackId> {
2385 if track_ids.is_empty() {
2386 return None;
2387 }
2388
2389 let mut all_signals: Vec<SignalRef> = Vec::new();
2391 let mut canonical = String::new();
2392 let mut entity_type = None;
2393
2394 for &track_id in track_ids {
2395 if let Some(track) = self.tracks.get(&track_id) {
2396 if canonical.is_empty() {
2397 canonical = track.canonical_surface.clone();
2398 entity_type = track.entity_type.clone();
2399 }
2400 all_signals.extend(track.signals.iter().cloned());
2401 }
2402 }
2403
2404 if all_signals.is_empty() {
2405 return None;
2406 }
2407
2408 all_signals.sort_by_key(|s| s.position);
2410
2411 for &track_id in track_ids {
2413 self.tracks.remove(&track_id);
2414 }
2415
2416 let mut new_track = Track::new(TrackId::ZERO, canonical);
2418 new_track.entity_type = entity_type;
2419 for (pos, signal_ref) in all_signals.iter().enumerate() {
2420 new_track.add_signal(signal_ref.signal_id, pos as u32);
2421 }
2422
2423 Some(self.add_track(new_track))
2424 }
2425
2426 #[must_use]
2428 pub fn find_overlapping_signal_pairs(&self) -> Vec<(SignalId, SignalId)> {
2429 let mut pairs = Vec::new();
2430 let signals: Vec<_> = self.signals.iter().collect();
2431
2432 for i in 0..signals.len() {
2433 for j in (i + 1)..signals.len() {
2434 if signals[i].location.overlaps(&signals[j].location) {
2435 pairs.push((signals[i].id, signals[j].id));
2436 }
2437 }
2438 }
2439
2440 pairs
2441 }
2442}
2443
/// Summary statistics for a grounded document, as produced by
/// `GroundedDocument::stats`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocumentStats {
    /// Total number of signals in the document.
    pub signal_count: usize,
    /// Total number of tracks in the document.
    pub track_count: usize,
    /// Total number of identities in the document.
    pub identity_count: usize,
    /// Number of tracks whose `identity_id` is set.
    pub linked_track_count: usize,
    /// Number of signals with no entry in the signal-to-track map.
    pub untracked_count: usize,
    /// Mean number of signals per track (`0.0` when there are no tracks).
    pub avg_track_size: f32,
    /// Number of tracks for which `is_singleton()` is true.
    pub singleton_count: usize,
    /// Mean signal confidence (`0.0` when there are no signals).
    pub avg_confidence: f32,
    /// Number of signals flagged as negated.
    pub negated_count: usize,
    /// Number of signals with `Modality::Symbolic`.
    pub symbolic_count: usize,
    /// Number of signals with `Modality::Iconic`.
    pub iconic_count: usize,
    /// Number of signals with `Modality::Hybrid`.
    pub hybrid_count: usize,
}
2472
2473impl std::fmt::Display for DocumentStats {
2474 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2475 writeln!(f, "Document Statistics:")?;
2476 writeln!(
2477 f,
2478 " Signals: {} (avg confidence: {:.2})",
2479 self.signal_count, self.avg_confidence
2480 )?;
2481 writeln!(
2482 f,
2483 " Tracks: {} (avg size: {:.1}, singletons: {})",
2484 self.track_count, self.avg_track_size, self.singleton_count
2485 )?;
2486 writeln!(
2487 f,
2488 " Identities: {} ({} tracks linked)",
2489 self.identity_count, self.linked_track_count
2490 )?;
2491 writeln!(f, " Untracked signals: {}", self.untracked_count)?;
2492 writeln!(
2493 f,
2494 " Modalities: {} symbolic, {} iconic, {} hybrid",
2495 self.symbolic_count, self.iconic_count, self.hybrid_count
2496 )?;
2497 if self.negated_count > 0 {
2498 writeln!(f, " Negated: {}", self.negated_count)?;
2499 }
2500 Ok(())
2501 }
2502}
2503
/// Node of the binary search tree that backs `TextSpatialIndex`.
///
/// Nodes are ordered by `start`, and each node caches the largest `end`
/// seen in its subtree so queries can skip subtrees that cannot match.
/// The tree is not rebalanced on insert.
#[derive(Debug, Clone)]
struct IntervalNode {
    /// Id of the signal this span belongs to.
    signal_id: SignalId,
    /// Start offset of the span; this is the ordering key of the tree.
    start: usize,
    /// End offset of the span.
    end: usize,
    /// Largest `end` among this node and every node inserted beneath it.
    max_end: usize,
    /// Subtree holding spans whose `start` is smaller than this node's.
    left: Option<Box<IntervalNode>>,
    /// Subtree holding spans whose `start` is greater than or equal to this node's.
    right: Option<Box<IntervalNode>>,
}
2528
2529impl IntervalNode {
2530 fn new(signal_id: SignalId, start: usize, end: usize) -> Self {
2531 Self {
2532 signal_id,
2533 start,
2534 end,
2535 max_end: end,
2536 left: None,
2537 right: None,
2538 }
2539 }
2540
2541 fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2542 self.max_end = self.max_end.max(end);
2543
2544 if start < self.start {
2545 if let Some(ref mut left) = self.left {
2546 left.insert(signal_id, start, end);
2547 } else {
2548 self.left = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2549 }
2550 } else if let Some(ref mut right) = self.right {
2551 right.insert(signal_id, start, end);
2552 } else {
2553 self.right = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2554 }
2555 }
2556
2557 fn query_overlap(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2558 if self.start < query_end && query_start < self.end {
2560 results.push(self.signal_id);
2561 }
2562
2563 if let Some(ref left) = self.left {
2565 if left.max_end > query_start {
2566 left.query_overlap(query_start, query_end, results);
2567 }
2568 }
2569
2570 if let Some(ref right) = self.right {
2572 if self.start < query_end {
2573 right.query_overlap(query_start, query_end, results);
2574 }
2575 }
2576 }
2577
2578 fn query_containing(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2579 if self.start <= query_start && self.end >= query_end {
2581 results.push(self.signal_id);
2582 }
2583
2584 if let Some(ref left) = self.left {
2586 if left.max_end >= query_end {
2587 left.query_containing(query_start, query_end, results);
2588 }
2589 }
2590
2591 if let Some(ref right) = self.right {
2593 if self.start <= query_start {
2594 right.query_containing(query_start, query_end, results);
2595 }
2596 }
2597 }
2598
2599 fn query_contained_in(
2600 &self,
2601 range_start: usize,
2602 range_end: usize,
2603 results: &mut Vec<SignalId>,
2604 ) {
2605 if self.start >= range_start && self.end <= range_end {
2607 results.push(self.signal_id);
2608 }
2609
2610 if let Some(ref left) = self.left {
2612 left.query_contained_in(range_start, range_end, results);
2613 }
2614
2615 if let Some(ref right) = self.right {
2617 if self.start < range_end {
2618 right.query_contained_in(range_start, range_end, results);
2619 }
2620 }
2621 }
2622}
2623
/// Index over the text spans of signals.
///
/// Spans are stored in a tree of `IntervalNode`s and can be queried for
/// overlap with, containment of, or containment within an offset range;
/// every query returns the matching `SignalId`s.
#[derive(Debug, Clone, Default)]
pub struct TextSpatialIndex {
    /// Root of the span tree; `None` until the first span is inserted.
    root: Option<IntervalNode>,
    /// Number of spans inserted so far.
    size: usize,
}
2644
2645impl TextSpatialIndex {
2646 #[must_use]
2648 pub fn new() -> Self {
2649 Self::default()
2650 }
2651
2652 #[must_use]
2654 pub fn from_signals(signals: &[Signal<Location>]) -> Self {
2655 let mut index = Self::new();
2656 for signal in signals {
2657 if let Some((start, end)) = signal.location.text_offsets() {
2658 index.insert(signal.id, start, end);
2659 }
2660 }
2661 index
2662 }
2663
2664 pub fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2666 if let Some(ref mut root) = self.root {
2667 root.insert(signal_id, start, end);
2668 } else {
2669 self.root = Some(IntervalNode::new(signal_id, start, end));
2670 }
2671 self.size += 1;
2672 }
2673
2674 #[must_use]
2676 pub fn query_overlap(&self, start: usize, end: usize) -> Vec<SignalId> {
2677 let mut results = Vec::new();
2678 if let Some(ref root) = self.root {
2679 root.query_overlap(start, end, &mut results);
2680 }
2681 results
2682 }
2683
2684 #[must_use]
2686 pub fn query_containing(&self, start: usize, end: usize) -> Vec<SignalId> {
2687 let mut results = Vec::new();
2688 if let Some(ref root) = self.root {
2689 root.query_containing(start, end, &mut results);
2690 }
2691 results
2692 }
2693
2694 #[must_use]
2696 pub fn query_contained_in(&self, start: usize, end: usize) -> Vec<SignalId> {
2697 let mut results = Vec::new();
2698 if let Some(ref root) = self.root {
2699 root.query_contained_in(start, end, &mut results);
2700 }
2701 results
2702 }
2703
2704 #[must_use]
2706 pub fn len(&self) -> usize {
2707 self.size
2708 }
2709
2710 #[must_use]
2712 pub fn is_empty(&self) -> bool {
2713 self.size == 0
2714 }
2715}
2716
2717impl GroundedDocument {
2718 #[must_use]
2737 pub fn build_text_index(&self) -> TextSpatialIndex {
2738 TextSpatialIndex::from_signals(&self.signals)
2739 }
2740
2741 #[must_use]
2746 pub fn query_signals_in_range_indexed(
2747 &self,
2748 start: usize,
2749 end: usize,
2750 ) -> Vec<&Signal<Location>> {
2751 let index = self.build_text_index();
2752 let ids = index.query_contained_in(start, end);
2753 ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2754 }
2755
2756 #[must_use]
2758 pub fn query_overlapping_signals_indexed(
2759 &self,
2760 start: usize,
2761 end: usize,
2762 ) -> Vec<&Signal<Location>> {
2763 let index = self.build_text_index();
2764 let ids = index.query_overlap(start, end);
2765 ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2766 }
2767
2768 #[must_use]
2781 pub fn to_coref_document(&self) -> super::coref::CorefDocument {
2782 use super::coref::{CorefChain, CorefDocument, Mention};
2783 use std::collections::HashMap;
2784
2785 let signal_by_id: HashMap<SignalId, &Signal<Location>> =
2787 self.signals.iter().map(|s| (s.id, s)).collect();
2788
2789 let mut chains: Vec<CorefChain> = Vec::new();
2790
2791 for track in self.tracks.values() {
2792 let mut mentions: Vec<Mention> = Vec::new();
2793
2794 for sref in &track.signals {
2795 let Some(signal) = signal_by_id.get(&sref.signal_id) else {
2796 continue;
2797 };
2798
2799 let Some((start, end)) = signal.location.text_offsets() else {
2800 continue;
2801 };
2802
2803 let mut m = Mention::new(signal.surface.clone(), start, end);
2804 m.entity_type = Some(signal.label.to_string());
2805 mentions.push(m);
2806 }
2807
2808 if mentions.is_empty() {
2809 continue;
2810 }
2811
2812 let mut chain = CorefChain::new(mentions);
2813 chain.entity_type = track.entity_type.as_ref().map(|t| t.to_string());
2814 chains.push(chain);
2815 }
2816
2817 chains.sort_by_key(|c| c.mentions.first().map(|m| m.start).unwrap_or(usize::MAX));
2819
2820 CorefDocument::with_id(&self.text, &self.id, chains)
2821 }
2822}
2823
2824pub fn render_document_html(doc: &GroundedDocument) -> String {
2832 let mut html = String::new();
2833 let stats = doc.stats();
2834
2835 html.push_str(r#"<!DOCTYPE html>
2836<html>
2837<head>
2838<meta charset="UTF-8">
2839<meta name="color-scheme" content="dark light">
2840<title>grounded::GroundedDocument</title>
2841<style>
2842:root{
2843 /* Allow UA widgets (inputs/scrollbars) to match the theme */
2844 color-scheme: light dark;
2845 /* Dark (default) */
2846 --bg:#0a0a0a;
2847 --panel-bg:#0d0d0d;
2848 --text:#b0b0b0;
2849 --text-strong:#fff;
2850 --muted:#666;
2851 --border:#222;
2852 --border-strong:#333;
2853 --hover:#111;
2854 --input-bg:#080808;
2855 --active:#fff;
2856 --track-strong:rgba(255,255,255,0.35);
2857 --track-soft:rgba(255,255,255,0.18);
2858 /* Entity colors (dark) */
2859 --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2860 --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2861 --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2862 --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2863 --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2864 --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2865 --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2866}
2867@media (prefers-color-scheme: light){
2868 :root{
2869 --bg:#ffffff;
2870 --panel-bg:#f7f7f7;
2871 --text:#222;
2872 --text-strong:#000;
2873 --muted:#555;
2874 --border:#d6d6d6;
2875 --border-strong:#c6c6c6;
2876 --hover:#f0f0f0;
2877 --input-bg:#ffffff;
2878 --active:#000;
2879 --track-strong:rgba(0,0,0,0.25);
2880 --track-soft:rgba(0,0,0,0.12);
2881 /* Entity colors (light) */
2882 --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2883 --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2884 --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2885 --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2886 --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2887 --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2888 --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2889 }
2890}
2891html[data-theme='dark']{
2892 --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
2893 --muted:#666; --border:#222; --border-strong:#333; --hover:#111;
2894 --input-bg:#080808; --active:#fff;
2895 --track-strong:rgba(255,255,255,0.35); --track-soft:rgba(255,255,255,0.18);
2896 --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2897 --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2898 --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2899 --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2900 --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2901 --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2902 --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2903}
2904html[data-theme='light']{
2905 --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
2906 --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0;
2907 --input-bg:#ffffff; --active:#000;
2908 --track-strong:rgba(0,0,0,0.25); --track-soft:rgba(0,0,0,0.12);
2909 --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2910 --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2911 --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2912 --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2913 --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2914 --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2915 --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2916}
2917
2918*{box-sizing:border-box;margin:0;padding:0}
2919body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
2920h1,h2,h3{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
2921h1{font-size:14px}h2{font-size:12px}h3{font-size:11px;color:var(--muted)}
2922 a{color:inherit}
2923 a:hover{text-decoration:underline}
2924table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
2925th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
2926th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
2927tr:hover{background:var(--hover)}
2928.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:8px}
2929.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
2930.panel-h{display:flex;align-items:center;gap:8px}
2931.toggle{cursor:pointer;user-select:none;color:var(--muted);border:1px solid var(--border);background:var(--bg);padding:2px 6px;font-size:10px}
2932.panel-collapsed table,.panel-collapsed .panel-body{display:none}
2933.toolbar{display:flex;gap:8px;align-items:center;margin:8px 0 0}
2934.toolbar input{width:100%;max-width:520px;background:var(--input-bg);border:1px solid var(--border);color:var(--text);padding:6px 8px;font:12px monospace}
2935.muted{color:var(--muted)}
2936.panel-body{white-space:pre-wrap;word-break:break-word}
2937.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
2938.e{padding:1px 2px;border-bottom:1px solid}
2939.seg{cursor:pointer}
2940.e-per{background:var(--per-bg);border-color:var(--per-br);color:var(--per-tx)}
2941.e-org{background:var(--org-bg);border-color:var(--org-br);color:var(--org-tx)}
2942.e-loc{background:var(--loc-bg);border-color:var(--loc-br);color:var(--loc-tx)}
2943.e-misc{background:var(--mis-bg);border-color:var(--mis-br);color:var(--mis-tx)}
2944.e-date{background:var(--dat-bg);border-color:var(--dat-br);color:var(--dat-tx)}
2945.e-track{box-shadow:inset 0 0 0 1px var(--track-strong)}
2946.e-track-hover{box-shadow:inset 0 0 0 1px var(--track-soft)}
2947.e-active{outline:2px solid var(--active);outline-offset:1px}
2948.conf{color:var(--muted);font-size:10px}
2949.badge{display:inline-block;padding:1px 4px;font-size:9px;text-transform:uppercase}
2950.badge-y{background:var(--badge-y-bg);color:var(--badge-y-tx);border:1px solid var(--badge-y-br)}
2951.badge-n{background:var(--badge-n-bg);color:var(--badge-n-tx);border:1px solid var(--badge-n-br)}
2952.stats{display:flex;gap:16px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
2953.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
2954.id{color:var(--muted);font-size:9px}
2955.kb{color:var(--muted)}
2956.arrow{color:var(--muted)}
2957</style>
2958</head>
2959<body>
2960"#);
2961
2962 html.push_str(&format!(
2964 r#"<div class="panel-h" style="justify-content:space-between"><h1>doc_id="{}" len={}</h1><span class="toggle" id="theme-toggle" title="toggle theme (auto → dark → light)">theme: auto</span></div>"#,
2965 html_escape(&doc.id),
2966 doc.text.len()
2967 ));
2968
2969 html.push_str(r#"<div class="stats">"#);
2970 html.push_str(&format!(
2971 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">signals</div></div>"#,
2972 stats.signal_count
2973 ));
2974 html.push_str(&format!(
2975 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">tracks</div></div>"#,
2976 stats.track_count
2977 ));
2978 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">identities</div></div>"#, stats.identity_count));
2979 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{:.2}</div><div class="stat-l">avg_conf</div></div>"#, stats.avg_confidence));
2980 html.push_str(&format!(
2981 r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">linked</div></div>"#,
2982 stats.linked_track_count
2983 ));
2984 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">untracked</div></div>"#, stats.untracked_count));
2985 if stats.iconic_count > 0 || stats.hybrid_count > 0 {
2986 html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}/{}/{}</div><div class="stat-l">sym/ico/hyb</div></div>"#,
2987 stats.symbolic_count, stats.iconic_count, stats.hybrid_count));
2988 }
2989 html.push_str(r#"</div>"#);
2990
2991 html.push_str(r#"<h2>text</h2>"#);
2993 html.push_str(r#"<div class="text-box">"#);
2994 html.push_str(&annotate_text_html(
2995 &doc.text,
2996 doc.signals(),
2997 &doc.signal_to_track,
2998 ));
2999 html.push_str(r#"</div>"#);
3000
3001 html.push_str(
3003 r#"<h2>selection</h2><div class="panel" id="selection-panel" role="region" aria-label="selection"><div class="panel-h"><h3>selection</h3><span class="muted" id="selection-hint" role="status" aria-live="polite">click a mention / row to see coref track details</span></div><pre class="panel-body" id="selection-body" role="textbox" aria-readonly="true" aria-label="selection details">—</pre></div>"#,
3004 );
3005
3006 html.push_str(r#"<div class="grid">"#);
3008
3009 html.push_str(r#"<div class="panel" id="panel-signals"><div class="panel-h"><h3>signals (level 1)</h3><span class="toggle" data-toggle="panel-signals">toggle</span></div><div class="toolbar"><input id="signal-filter" type="text" placeholder="filter signals: id / label / surface (e.g. 'PER', 'S12', 'Paris')" /><span class="muted" id="signal-filter-count"></span></div><table id="signals-table">"#);
3011 html.push_str(r#"<tr><th>id</th><th>span</th><th>surface</th><th>label</th><th>conf</th><th>track</th></tr>"#);
3012 for signal in doc.signals() {
3013 let (span, start_opt, end_opt) = if let Some((s, e)) = signal.location.text_offsets() {
3014 (format!("[{},{})", s, e), Some(s), Some(e))
3015 } else {
3016 ("bbox".to_string(), None, None)
3017 };
3018 let track_id_num = doc.signal_to_track.get(&signal.id).copied();
3019 let track_id = track_id_num
3020 .map(|t| format!("T{}", t))
3021 .unwrap_or_else(|| "-".to_string());
3022 let track_attr = track_id_num
3023 .map(|t| format!(r#" data-track="{}""#, t))
3024 .unwrap_or_default();
3025 let offs_attr = match (start_opt, end_opt) {
3026 (Some(s), Some(e)) => format!(r#" data-start="{}" data-end="{}""#, s, e),
3027 _ => String::new(),
3028 };
3029 let neg = if signal.negated { " NEG" } else { "" };
3030 html.push_str(&format!(
3031 r#"<tr data-sid="S{sid}" data-label="{label}" data-surface="{surface}"{track_attr}{offs_attr} data-conf="{conf:.2}"><td class="id"><a href='#S{sid}'>S{sid}</a></td><td>{span}</td><td>{surface}</td><td>{label}{neg}</td><td class="conf">{conf:.2}</td><td class="id">{track}</td></tr>"#,
3032 sid = signal.id,
3033 span = span,
3034 surface = html_escape(&signal.surface),
3035 label = html_escape(signal.label.as_str()),
3036 neg = neg,
3037 conf = signal.confidence,
3038 track = track_id,
3039 track_attr = track_attr,
3040 offs_attr = offs_attr
3041 ));
3042 }
3043 html.push_str(r#"</table></div>"#);
3044
3045 html.push_str(r#"<div class="panel" id="panel-tracks"><div class="panel-h"><h3>tracks (level 2)</h3><span class="toggle" data-toggle="panel-tracks">toggle</span></div><table id="tracks-table">"#);
3047 html.push_str(r#"<tr><th>id</th><th>canonical</th><th>type</th><th>|S|</th><th>signals</th><th>identity</th></tr>"#);
3048 for track in doc.tracks() {
3049 let entity_type = track
3050 .entity_type
3051 .as_ref()
3052 .map(|t| t.as_str())
3053 .unwrap_or("-");
3054 let signals: Vec<String> = track
3055 .signals
3056 .iter()
3057 .map(|s| format!("S{}", s.signal_id))
3058 .collect();
3059 let identity = doc
3060 .identity_for_track(track.id)
3061 .map(|i| format!("I{}", i.id))
3062 .unwrap_or_else(|| "-".to_string());
3063 let linked_badge = if track.identity_id.is_some() {
3064 r#"<span class="badge badge-y">y</span>"#
3065 } else {
3066 r#"<span class="badge badge-n">n</span>"#
3067 };
3068 html.push_str(&format!(
3069 r#"<tr data-tid="{tid}"><td class="id">T{tid}</td><td>{canonical_surface}</td><td>{etype}</td><td>{n}</td><td class="id">{sigs}</td><td class="id">{ident} {badge}</td></tr>"#,
3070 tid = track.id,
3071 canonical_surface = html_escape(&track.canonical_surface),
3072 etype = html_escape(entity_type),
3073 n = track.len(),
3074 sigs = html_escape(&signals.join(" ")),
3075 ident = identity,
3076 badge = linked_badge
3077 ));
3078 }
3079 html.push_str(r#"</table></div>"#);
3080
3081 html.push_str(r#"<div class="panel" id="panel-identities"><div class="panel-h"><h3>identities (level 3)</h3><span class="toggle" data-toggle="panel-identities">toggle</span></div><table>"#);
3083 html.push_str(r#"<tr><th>id</th><th>name</th><th>type</th><th>kb</th><th>kb_id</th><th>aliases</th></tr>"#);
3084 for identity in doc.identities() {
3085 let kb = identity.kb_name.as_deref().unwrap_or("-");
3086 let kb_id = identity.kb_id.as_deref().unwrap_or("-");
3087 let entity_type = identity
3088 .entity_type
3089 .as_ref()
3090 .map(|t| t.as_str())
3091 .unwrap_or("-");
3092 let aliases = if identity.aliases.is_empty() {
3093 "-".to_string()
3094 } else {
3095 identity.aliases.join(", ")
3096 };
3097 html.push_str(&format!(
3098 r#"<tr><td class="id">I{}</td><td>{}</td><td>{}</td><td class="kb">{}</td><td class="kb">{}</td><td>{}</td></tr>"#,
3099 identity.id, html_escape(&identity.canonical_name), entity_type, kb, kb_id, html_escape(&aliases)
3100 ));
3101 }
3102 html.push_str(r#"</table></div>"#);
3103
3104 html.push_str(r#"</div>"#); html.push_str(r#"<h2>hierarchy trace</h2><div class="panel"><table>"#);
3108 html.push_str(r#"<tr><th>signal</th><th></th><th>track</th><th></th><th>identity</th><th>kb_id</th></tr>"#);
3109 for signal in doc.signals() {
3110 let track = doc.track_for_signal(signal.id);
3111 let identity = doc.identity_for_signal(signal.id);
3112
3113 let track_str = track
3114 .map(|t| format!("T{} \"{}\"", t.id, html_escape(&t.canonical_surface)))
3115 .unwrap_or_else(|| "-".to_string());
3116 let identity_str = identity
3117 .map(|i| format!("I{} \"{}\"", i.id, html_escape(&i.canonical_name)))
3118 .unwrap_or_else(|| "-".to_string());
3119 let kb_str = identity
3120 .and_then(|i| i.kb_id.as_ref())
3121 .map(|s| s.as_str())
3122 .unwrap_or("-");
3123
3124 html.push_str(&format!(
3125 r#"<tr><td>S{} "{}"</td><td class="arrow">→</td><td>{}</td><td class="arrow">→</td><td>{}</td><td class="kb">{}</td></tr>"#,
3126 signal.id, html_escape(&signal.surface), track_str, identity_str, kb_str
3127 ));
3128 }
3129 html.push_str(r#"</table></div>"#);
3130
3131 html.push_str(r#"<script>
3134(() => {
3135 // Index signal metadata from the signals table, and map signal/track → text elements.
3136 const signalMeta = new Map();
3137 document.querySelectorAll('#signals-table tr[data-sid]').forEach((row) => {
3138 const sid = row.getAttribute('data-sid');
3139 if (!sid) return;
3140 signalMeta.set(sid, {
3141 sid,
3142 label: row.getAttribute('data-label') || '',
3143 surface: row.getAttribute('data-surface') || '',
3144 conf: row.getAttribute('data-conf') || '',
3145 start: row.getAttribute('data-start'),
3146 end: row.getAttribute('data-end'),
3147 track: row.getAttribute('data-track'),
3148 });
3149 });
3150
3151 const signalEls = new Map();
3152 const addSignalEl = (sid, el) => {
3153 if (!sid || !el) return;
3154 const arr = signalEls.get(sid) || [];
3155 arr.push(el);
3156 signalEls.set(sid, arr);
3157 };
3158 // Old-style inline spans (non-overlapping renderer).
3159 document.querySelectorAll('span.e[data-sid]').forEach((el) => {
3160 addSignalEl(el.getAttribute('data-sid'), el);
3161 });
3162 // Segmented spans (overlap/discontinuous-safe renderer).
3163 document.querySelectorAll('span.seg[data-sids]').forEach((el) => {
3164 const raw = (el.getAttribute('data-sids') || '').trim();
3165 if (!raw) return;
3166 raw.split(/\s+/).filter(Boolean).forEach((sid) => addSignalEl(sid, el));
3167 });
3168
3169 const trackEls = new Map();
3170 for (const [sid, els] of signalEls.entries()) {
3171 const meta = signalMeta.get(sid);
3172 const tid = meta ? meta.track : null;
3173 if (!tid) continue;
3174 const arr = trackEls.get(tid) || [];
3175 els.forEach((el) => arr.push(el));
3176 trackEls.set(tid, arr);
3177 }
3178
3179 const selectionBody = document.getElementById('selection-body');
3180 const selectionHint = document.getElementById('selection-hint');
3181 const defaultHint = selectionHint ? (selectionHint.textContent || '') : '';
3182 const setSelection = (text) => {
3183 if (!selectionBody) return;
3184 selectionBody.textContent = text;
3185 };
3186 const setHint = (text) => {
3187 if (!selectionHint) return;
3188 selectionHint.textContent = text || defaultHint;
3189 };
3190
3191 // Theme toggle: auto (prefers-color-scheme) → dark → light.
3192 const themeBtn = document.getElementById('theme-toggle');
3193 const themeKey = 'anno-theme';
3194 const applyTheme = (theme) => {
3195 const t = theme || 'auto';
3196 if (t === 'auto') {
3197 delete document.documentElement.dataset.theme;
3198 } else {
3199 document.documentElement.dataset.theme = t;
3200 }
3201 if (themeBtn) themeBtn.textContent = `theme: ${t}`;
3202 };
3203 const readTheme = () => {
3204 try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
3205 };
3206 const writeTheme = (t) => {
3207 try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
3208 };
3209 applyTheme(readTheme());
3210 if (themeBtn) {
3211 themeBtn.addEventListener('click', () => {
3212 const cur = readTheme();
3213 const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
3214 writeTheme(next);
3215 applyTheme(next);
3216 });
3217 }
3218
3219 let activeSignalEls = [];
3220 let activeSignalRow = null;
3221 const clearActive = () => {
3222 if (activeSignalEls && activeSignalEls.length) {
3223 activeSignalEls.forEach((el) => el.classList.remove('e-active'));
3224 }
3225 if (activeSignalRow) activeSignalRow.classList.remove('e-active');
3226 activeSignalEls = [];
3227 activeSignalRow = null;
3228 };
3229
3230 let activeTrack = null;
3231 let hoverTrack = null;
3232
3233 const removeTrackClass = (tid, cls) => {
3234 if (!tid) return;
3235 const els = trackEls.get(tid);
3236 if (!els) return;
3237 els.forEach((el) => el.classList.remove(cls));
3238 };
3239
3240 const addTrackClass = (tid, cls) => {
3241 if (!tid) return;
3242 const els = trackEls.get(tid);
3243 if (!els) return;
3244 els.forEach((el) => el.classList.add(cls));
3245 };
3246
3247 const trackSize = (tid) => {
3248 const els = tid ? trackEls.get(tid) : null;
3249 return els ? els.length : 0;
3250 };
3251
3252 const getTrackSelectionText = (tid) => {
3253 if (!tid) return 'track: - (untracked)';
3254 const row = document.querySelector(`#tracks-table tr[data-tid='${tid}']`);
3255 if (!row) return `track T${tid}`;
3256 const cells = row.querySelectorAll('td');
3257 const canonical = (cells[1]?.textContent || '').trim();
3258 const etype = (cells[2]?.textContent || '').trim();
3259 const count = (cells[3]?.textContent || '').trim();
3260 const sigs = (cells[4]?.textContent || '').trim();
3261 const lines = [];
3262 lines.push(`track T${tid} canonical="${canonical}" type="${etype}" mentions=${count}`);
3263 if (sigs) lines.push(`track signals: ${sigs}`);
3264 return lines.join('\n');
3265 };
3266
3267 const renderTrackSelection = (tid) => setSelection(getTrackSelectionText(tid));
3268
3269 const renderSignalSelectionBySid = (sid) => {
3270 const meta = signalMeta.get(sid);
3271 const label = meta ? (meta.label || '') : '';
3272 const conf = meta ? (meta.conf || '') : '';
3273 const start = meta ? meta.start : null;
3274 const end = meta ? meta.end : null;
3275 const tid = meta ? meta.track : null;
3276 const lines = [];
3277 if (start !== null && end !== null) {
3278 lines.push(`signal ${sid} label=${label} conf=${conf} span=[${start},${end})`);
3279 } else {
3280 lines.push(`signal ${sid} label=${label} conf=${conf}`);
3281 }
3282 if (meta && meta.surface) lines.push(`surface: ${meta.surface}`);
3283 lines.push('');
3284 lines.push(getTrackSelectionText(tid));
3285 setSelection(lines.join('\n'));
3286 };
3287
3288 const setActiveTrack = (tid) => {
3289 const next = tid || null;
3290 if (activeTrack === next) return;
3291 removeTrackClass(activeTrack, 'e-track');
3292 activeTrack = next;
3293 if (activeTrack) addTrackClass(activeTrack, 'e-track');
3294 if (hoverTrack && activeTrack && hoverTrack === activeTrack) {
3295 removeTrackClass(hoverTrack, 'e-track-hover');
3296 }
3297 };
3298
3299 const setHoverTrack = (tid) => {
3300 const next = tid || null;
3301 if (hoverTrack === next) return;
3302 removeTrackClass(hoverTrack, 'e-track-hover');
3303 hoverTrack = next;
3304 if (!hoverTrack) {
3305 setHint('');
3306 return;
3307 }
3308 if (activeTrack && hoverTrack === activeTrack) {
3309 setHint(`selected track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3310 return;
3311 }
3312 addTrackClass(hoverTrack, 'e-track-hover');
3313 setHint(`hover track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3314 };
3315
3316 const emitToParentSpan = (start, end) => {
3317 try {
3318 if (!window.parent || window.parent === window) return;
3319 if (start === null || end === null) return;
3320 window.parent.postMessage({ type: 'anno:activate-span', start: Number(start), end: Number(end) }, '*');
3321 } catch (_) {
3322 // ignore: best-effort bridge for iframe containers
3323 }
3324 };
3325
3326 const activateBySpan = (start, end, emit) => {
3327 if (start === null || end === null || start === undefined || end === undefined) return;
3328 // Prefer an exact signal span if present; otherwise fall back to the table row metadata.
3329 const el = document.querySelector(`span.e[data-sid][data-start='${start}'][data-end='${end}']`);
3330 if (el) {
3331 const sid = el.getAttribute('data-sid');
3332 if (sid) activateSignal(sid, emit);
3333 return;
3334 }
3335 const row = document.querySelector(`#signals-table tr[data-start='${start}'][data-end='${end}']`);
3336 if (!row) return;
3337 const sid = row.getAttribute('data-sid');
3338 if (!sid) return;
3339 activateSignal(sid, emit);
3340 };
3341
3342 const activateSignal = (sid, emit) => {
3343 clearActive();
3344 const els = signalEls.get(sid) || [];
3345 if (!els.length) return;
3346 els.forEach((el) => el.classList.add('e-active'));
3347 activeSignalEls = els;
3348 const row = document.querySelector(`#signals-table tr[data-sid='${sid}']`);
3349 if (row) {
3350 row.classList.add('e-active');
3351 activeSignalRow = row;
3352 }
3353 const primaryEl = els[0];
3354 primaryEl.scrollIntoView({ block: 'center', behavior: 'smooth' });
3355 const meta = signalMeta.get(sid);
3356 const tid = meta ? meta.track : primaryEl.getAttribute('data-track');
3357 setActiveTrack(tid);
3358 renderSignalSelectionBySid(sid);
3359 if (emit && meta && meta.start !== null && meta.end !== null) {
3360 emitToParentSpan(meta.start, meta.end);
3361 }
3362 };
3363
3364 // Table click
3365 const signalsTable = document.getElementById('signals-table');
3366 if (signalsTable) {
3367 signalsTable.addEventListener('click', (ev) => {
3368 const a = ev.target && ev.target.closest ? ev.target.closest("a[href^='#S']") : null;
3369 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3370 const sid = (a && a.getAttribute('href') ? a.getAttribute('href').slice(1) : null) || (row ? row.getAttribute('data-sid') : null);
3371 if (!sid) return;
3372 ev.preventDefault();
3373 activateSignal(sid, true);
3374 history.replaceState(null, '', '#' + sid);
3375 });
3376
3377 // Hover a signals row → preview track highlight
3378 signalsTable.addEventListener('mouseover', (ev) => {
3379 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3380 if (!row) return;
3381 const tid = row.getAttribute('data-track');
3382 setHoverTrack(tid);
3383 });
3384 signalsTable.addEventListener('mouseout', (ev) => {
3385 const to = ev.relatedTarget;
3386 if (to && signalsTable.contains(to)) return;
3387 setHoverTrack(null);
3388 });
3389 }
3390
3391 // Clicking an inline entity should also toggle active highlight.
3392 const pickPrimarySid = (el) => {
3393 if (!el) return null;
3394 const p = el.getAttribute('data-primary');
3395 if (p) return p;
3396 const raw = (el.getAttribute('data-sids') || '').trim();
3397 if (!raw) return null;
3398 const sids = raw.split(/\s+/).filter(Boolean);
3399 if (!sids.length) return null;
3400 // Prefer the shortest mention span from metadata.
3401 let best = sids[0];
3402 let bestLen = null;
3403 for (const sid of sids) {
3404 const meta = signalMeta.get(sid);
3405 const s = meta && meta.start !== null ? Number(meta.start) : null;
3406 const e = meta && meta.end !== null ? Number(meta.end) : null;
3407 const len = (s !== null && e !== null) ? (e - s) : null;
3408 if (len === null) continue;
3409 if (bestLen === null || len < bestLen) {
3410 best = sid;
3411 bestLen = len;
3412 }
3413 }
3414 return best;
3415 };
3416
3417 document.addEventListener('click', (ev) => {
3418 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3419 if (span) {
3420 activateSignal(span.getAttribute('data-sid'), true);
3421 return;
3422 }
3423 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3424 if (!seg) return;
3425 activateSignal(pickPrimarySid(seg), true);
3426 });
3427
3428 // Hover an inline entity → preview highlight its track
3429 document.addEventListener('mouseover', (ev) => {
3430 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3431 if (span) {
3432 setHoverTrack(span.getAttribute('data-track'));
3433 return;
3434 }
3435 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3436 if (!seg) return;
3437 const sid = pickPrimarySid(seg);
3438 const meta = sid ? signalMeta.get(sid) : null;
3439 setHoverTrack(meta ? meta.track : null);
3440 });
3441 document.addEventListener('mouseout', (ev) => {
3442 const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3443 const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3444 if (!span && !seg) return;
3445 const to = ev.relatedTarget;
3446 if (to && to.closest && (to.closest('span.e[data-sid]') || to.closest('span.seg[data-sids]'))) return;
3447 setHoverTrack(null);
3448 });
3449
3450 // Clicking a track row → select track (highlight + details)
3451 const tracksTable = document.getElementById('tracks-table');
3452 if (tracksTable) {
3453 tracksTable.addEventListener('click', (ev) => {
3454 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3455 if (!row) return;
3456 const tid = row.getAttribute('data-tid');
3457 setActiveTrack(tid);
3458 renderTrackSelection(tid);
3459 });
3460 tracksTable.addEventListener('mouseover', (ev) => {
3461 const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3462 if (!row) return;
3463 setHoverTrack(row.getAttribute('data-tid'));
3464 });
3465 tracksTable.addEventListener('mouseout', (ev) => {
3466 const to = ev.relatedTarget;
3467 if (to && tracksTable.contains(to)) return;
3468 setHoverTrack(null);
3469 });
3470 }
3471
3472 // Filter
3473 const input = document.getElementById('signal-filter');
3474 const countEl = document.getElementById('signal-filter-count');
3475 if (input && signalsTable) {
3476 const update = () => {
3477 const q = (input.value || '').trim().toLowerCase();
3478 let shown = 0;
3479 const rows = signalsTable.querySelectorAll('tr[data-sid]');
3480 rows.forEach(row => {
3481 const sid = (row.getAttribute('data-sid') || '').toLowerCase();
3482 const label = (row.getAttribute('data-label') || '').toLowerCase();
3483 const surface = (row.getAttribute('data-surface') || '').toLowerCase();
3484 const ok = !q || sid.includes(q) || label.includes(q) || surface.includes(q);
3485 row.style.display = ok ? '' : 'none';
3486 if (ok) shown += 1;
3487 });
3488 if (countEl) countEl.textContent = shown + ' shown';
3489 };
3490 input.addEventListener('input', update);
3491 update();
3492 }
3493
3494 // Panel toggles
3495 document.querySelectorAll('[data-toggle]').forEach(btn => {
3496 btn.addEventListener('click', () => {
3497 const id = btn.getAttribute('data-toggle');
3498 const panel = id ? document.getElementById(id) : null;
3499 if (!panel) return;
3500 panel.classList.toggle('panel-collapsed');
3501 });
3502 });
3503
3504 // If URL hash is #S123, focus it.
3505 const hash = (location.hash || '').slice(1);
3506 if (hash && hash.startsWith('S')) activateSignal(hash, false);
3507
3508 // Optional: allow parent pages (e.g., dataset explorers) to sync selection across iframes.
3509 window.addEventListener('message', (ev) => {
3510 const data = ev && ev.data ? ev.data : null;
3511 if (!data || data.type !== 'anno:activate-span') return;
3512 if (typeof data.start !== 'number' || typeof data.end !== 'number') return;
3513 activateBySpan(data.start, data.end, false);
3514 });
3515})();
3516</script>"#);
3517
3518 html.push_str(r#"</body></html>"#);
3519 html
3520}
3521
/// Escapes the characters that are significant in HTML text and in
/// double-quoted attribute values.
///
/// `&`, `<`, `>` and `"` become `&amp;`, `&lt;`, `&gt;` and `&quot;`; every
/// other character (including `'`) is copied through unchanged. The input is
/// walked once, so an `&` produced by the escaping itself is never escaped a
/// second time.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3528
3529fn annotate_text_html(
3530 text: &str,
3531 signals: &[Signal<Location>],
3532 signal_to_track: &std::collections::HashMap<SignalId, TrackId>,
3533) -> String {
3534 let char_count = text.chars().count();
3535 if char_count == 0 {
3536 return String::new();
3537 }
3538
3539 #[derive(Debug, Clone)]
3540 struct SigMeta {
3541 sid: String,
3542 label: String,
3543 conf: f32,
3544 track_id: Option<TrackId>,
3545 covered_len: usize,
3546 }
3547
3548 #[derive(Debug, Clone)]
3549 struct Event {
3550 pos: usize,
3551 meta_idx: usize,
3552 delta: i32, }
3554
3555 let mut metas: Vec<SigMeta> = Vec::new();
3557 let mut events: Vec<Event> = Vec::new();
3558 let mut boundaries: Vec<usize> = vec![0, char_count];
3559
3560 for s in signals {
3561 let raw_segments: Vec<(usize, usize)> = match &s.location {
3562 Location::Text { start, end } => vec![(*start, *end)],
3563 Location::TextWithBbox { start, end, .. } => vec![(*start, *end)],
3564 Location::Discontinuous { segments } => segments.clone(),
3565 _ => Vec::new(),
3566 };
3567 if raw_segments.is_empty() {
3568 continue;
3569 }
3570
3571 let mut cleaned: Vec<(usize, usize)> = Vec::new();
3572 let mut covered_len = 0usize;
3573 for (start, end) in raw_segments {
3574 let start = start.min(char_count);
3575 let end = end.min(char_count);
3576 if start >= end {
3577 continue;
3578 }
3579 covered_len = covered_len.saturating_add(end - start);
3580 cleaned.push((start, end));
3581 }
3582 if cleaned.is_empty() {
3583 continue;
3584 }
3585
3586 let meta_idx = metas.len();
3587 let track_id = signal_to_track.get(&s.id).copied();
3588 metas.push(SigMeta {
3589 sid: format!("S{}", s.id),
3590 label: s.label.to_string(),
3591 conf: s.confidence,
3592 track_id,
3593 covered_len,
3594 });
3595
3596 for (start, end) in cleaned {
3597 boundaries.push(start);
3598 boundaries.push(end);
3599 events.push(Event {
3600 pos: start,
3601 meta_idx,
3602 delta: 1,
3603 });
3604 events.push(Event {
3605 pos: end,
3606 meta_idx,
3607 delta: -1,
3608 });
3609 }
3610 }
3611
3612 if metas.is_empty() {
3613 return html_escape(text);
3614 }
3615
3616 boundaries.sort_unstable();
3617 boundaries.dedup();
3618 events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
3619
3620 let mut active_counts: Vec<u32> = vec![0; metas.len()];
3621 let mut active: Vec<usize> = Vec::new();
3622 let mut ev_idx = 0usize;
3623
3624 let mut result = String::new();
3625
3626 for bi in 0..boundaries.len().saturating_sub(1) {
3627 let pos = boundaries[bi];
3628 while ev_idx < events.len() && events[ev_idx].pos == pos {
3630 let e = &events[ev_idx];
3631 let idx = e.meta_idx;
3632 if e.delta < 0 {
3633 if active_counts[idx] > 0 {
3634 active_counts[idx] -= 1;
3635 if active_counts[idx] == 0 {
3636 active.retain(|&x| x != idx);
3637 }
3638 }
3639 } else {
3640 active_counts[idx] += 1;
3641 if active_counts[idx] == 1 {
3642 active.push(idx);
3643 }
3644 }
3645 ev_idx += 1;
3646 }
3647
3648 let next = boundaries[bi + 1];
3649 if next <= pos {
3650 continue;
3651 }
3652
3653 let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
3654 if active.is_empty() {
3655 result.push_str(&html_escape(&seg_text));
3656 continue;
3657 }
3658
3659 let primary_idx = active
3661 .iter()
3662 .copied()
3663 .min_by(|a, b| {
3664 metas[*a]
3665 .covered_len
3666 .cmp(&metas[*b].covered_len)
3667 .then_with(|| {
3668 metas[*b]
3669 .conf
3670 .partial_cmp(&metas[*a].conf)
3671 .unwrap_or(std::cmp::Ordering::Equal)
3672 })
3673 })
3674 .unwrap_or(active[0]);
3675 let primary = &metas[primary_idx];
3676
3677 let class = match primary.label.to_uppercase().as_str() {
3678 "PER" | "PERSON" => "e-per",
3679 "ORG" | "ORGANIZATION" | "COMPANY" => "e-org",
3680 "LOC" | "LOCATION" | "GPE" => "e-loc",
3681 "DATE" | "TIME" => "e-date",
3682 _ => "e-misc",
3683 };
3684
3685 let mut sids: Vec<&str> = active.iter().map(|i| metas[*i].sid.as_str()).collect();
3686 sids.sort_unstable();
3687 let data_sids = sids.join(" ");
3688
3689 let mut title = format!(
3690 "sids=[{}] primary={} [{}..{})",
3691 data_sids, primary.sid, pos, next
3692 );
3693 if let Some(t) = primary.track_id {
3694 title.push_str(&format!(" track=T{}", t));
3695 }
3696
3697 result.push_str(&format!(
3698 r#"<span class="e seg {class}" data-sids="{sids}" data-start="{start}" data-end="{end}" data-primary="{primary}" title="{title}">{text}</span>"#,
3699 class = class,
3700 sids = html_escape(&data_sids),
3701 start = pos,
3702 end = next,
3703 primary = html_escape(&primary.sid),
3704 title = html_escape(&title),
3705 text = html_escape(&seg_text),
3706 ));
3707 }
3708
3709 result
3710}
3711
#[derive(Debug, Clone)]
/// Outcome of comparing predicted signals against gold signals for one text.
///
/// Built by [`EvalComparison::compare`]; the counting and scoring helpers on
/// this type are all derived from the fields below.
pub struct EvalComparison {
    /// The text the comparison was run on (an owned copy of the `text` argument).
    pub text: String,
    /// Gold signals, exactly as passed to `compare`.
    pub gold: Vec<Signal<Location>>,
    /// Predicted signals, exactly as passed to `compare`.
    pub predicted: Vec<Signal<Location>>,
    /// Classification results produced by `compare`; entries refer to the
    /// signals above by id.
    pub matches: Vec<EvalMatch>,
}
3728
#[derive(Debug, Clone)]
/// One classification result from [`EvalComparison::compare`].
pub enum EvalMatch {
    /// Gold and prediction have identical text offsets and equal labels.
    Correct {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
    },
    /// Gold and prediction have identical text offsets but different labels.
    TypeMismatch {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
        /// Label of the gold signal, rendered as a string.
        gold_label: String,
        /// Label of the predicted signal, rendered as a string.
        pred_label: String,
    },
    /// Gold and prediction overlap in the text but their offsets differ.
    BoundaryError {
        /// Id of the gold signal.
        gold_id: SignalId,
        /// Id of the predicted signal.
        pred_id: SignalId,
        /// Result of `Location::iou` for the two locations, or `0.0` when it
        /// returned `None`.
        iou: f64,
    },
    /// A prediction that was not paired with any gold signal.
    Spurious {
        /// Id of the predicted signal.
        pred_id: SignalId,
    },
    /// A gold signal that was not paired with any prediction.
    Missed {
        /// Id of the gold signal.
        gold_id: SignalId,
    },
}
3770
3771impl EvalComparison {
3772 #[must_use]
3792 pub fn compare(
3793 text: &str,
3794 gold: Vec<Signal<Location>>,
3795 predicted: Vec<Signal<Location>>,
3796 ) -> Self {
3797 let mut matches = Vec::new();
3798 let mut gold_matched = vec![false; gold.len()];
3799 let mut pred_matched = vec![false; predicted.len()];
3800
3801 for (pi, pred) in predicted.iter().enumerate() {
3803 let pred_offsets = match pred.location.text_offsets() {
3804 Some(o) => o,
3805 None => continue,
3806 };
3807
3808 for (gi, g) in gold.iter().enumerate() {
3809 if gold_matched[gi] {
3810 continue;
3811 }
3812 let gold_offsets = match g.location.text_offsets() {
3813 Some(o) => o,
3814 None => continue,
3815 };
3816
3817 if pred_offsets == gold_offsets {
3819 if pred.label == g.label {
3820 matches.push(EvalMatch::Correct {
3821 gold_id: g.id,
3822 pred_id: pred.id,
3823 });
3824 } else {
3825 matches.push(EvalMatch::TypeMismatch {
3826 gold_id: g.id,
3827 pred_id: pred.id,
3828 gold_label: g.label.to_string(),
3829 pred_label: pred.label.to_string(),
3830 });
3831 }
3832 gold_matched[gi] = true;
3833 pred_matched[pi] = true;
3834 break;
3835 }
3836 }
3837 }
3838
3839 for (pi, pred) in predicted.iter().enumerate() {
3841 if pred_matched[pi] {
3842 continue;
3843 }
3844 let pred_offsets = match pred.location.text_offsets() {
3845 Some(o) => o,
3846 None => continue,
3847 };
3848
3849 for (gi, g) in gold.iter().enumerate() {
3850 if gold_matched[gi] {
3851 continue;
3852 }
3853 let gold_offsets = match g.location.text_offsets() {
3854 Some(o) => o,
3855 None => continue,
3856 };
3857
3858 if pred_offsets.0 < gold_offsets.1 && pred_offsets.1 > gold_offsets.0 {
3860 let iou = pred.location.iou(&g.location).unwrap_or(0.0);
3861 matches.push(EvalMatch::BoundaryError {
3862 gold_id: g.id,
3863 pred_id: pred.id,
3864 iou,
3865 });
3866 gold_matched[gi] = true;
3867 pred_matched[pi] = true;
3868 break;
3869 }
3870 }
3871 }
3872
3873 for (pi, pred) in predicted.iter().enumerate() {
3875 if !pred_matched[pi] {
3876 matches.push(EvalMatch::Spurious { pred_id: pred.id });
3877 }
3878 }
3879
3880 for (gi, g) in gold.iter().enumerate() {
3882 if !gold_matched[gi] {
3883 matches.push(EvalMatch::Missed { gold_id: g.id });
3884 }
3885 }
3886
3887 Self {
3888 text: text.to_string(),
3889 gold,
3890 predicted,
3891 matches,
3892 }
3893 }
3894
3895 #[must_use]
3897 pub fn correct_count(&self) -> usize {
3898 self.matches
3899 .iter()
3900 .filter(|m| matches!(m, EvalMatch::Correct { .. }))
3901 .count()
3902 }
3903
3904 #[must_use]
3906 pub fn error_count(&self) -> usize {
3907 self.matches.len() - self.correct_count()
3908 }
3909
3910 #[must_use]
3912 pub fn precision(&self) -> f64 {
3913 if self.predicted.is_empty() {
3914 0.0
3915 } else {
3916 self.correct_count() as f64 / self.predicted.len() as f64
3917 }
3918 }
3919
3920 #[must_use]
3922 pub fn recall(&self) -> f64 {
3923 if self.gold.is_empty() {
3924 0.0
3925 } else {
3926 self.correct_count() as f64 / self.gold.len() as f64
3927 }
3928 }
3929
3930 #[must_use]
3932 pub fn f1(&self) -> f64 {
3933 let p = self.precision();
3934 let r = self.recall();
3935 if p + r > 0.0 {
3936 2.0 * p * r / (p + r)
3937 } else {
3938 0.0
3939 }
3940 }
3941}
3942
3943pub fn render_eval_html(cmp: &EvalComparison) -> String {
3947 render_eval_html_with_title(cmp, "eval comparison")
3948}
3949
3950#[must_use]
3954pub fn render_eval_html_with_title(cmp: &EvalComparison, title: &str) -> String {
3955 let mut html = String::new();
3956 let title = html_escape(title);
3957
3958 html.push_str(
3959 r#"<!DOCTYPE html>
3960<html>
3961<head>
3962<meta charset="UTF-8">
3963<meta name="color-scheme" content="dark light">
3964"#,
3965 );
3966 html.push_str(&format!("<title>{}</title>", title));
3967 html.push_str(r#"
3968:root{
3969 color-scheme: light dark;
3970 --bg:#0a0a0a;
3971 --panel-bg:#0d0d0d;
3972 --text:#b0b0b0;
3973 --text-strong:#fff;
3974 --muted:#666;
3975 --border:#222;
3976 --border-strong:#333;
3977 --hover:#111;
3978 --input-bg:#080808;
3979 --active:#ddd;
3980 /* Eval entity colors (dark) */
3981 --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3982 --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3983 /* Match row borders */
3984 --m-ok:#4a8a4a;
3985 --m-type:#8a8a4a;
3986 --m-bound:#4a8a8a;
3987 --m-fp:#8a4a4a;
3988 --m-fn:#8a4a8a;
3989}
3990@media (prefers-color-scheme: light){
3991 :root{
3992 --bg:#ffffff;
3993 --panel-bg:#f7f7f7;
3994 --text:#222;
3995 --text-strong:#000;
3996 --muted:#555;
3997 --border:#d6d6d6;
3998 --border-strong:#c6c6c6;
3999 --hover:#f0f0f0;
4000 --input-bg:#ffffff;
4001 --active:#000;
4002 --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
4003 --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
4004 --m-ok:#2f8a2f;
4005 --m-type:#8a7a2f;
4006 --m-bound:#2f7a8a;
4007 --m-fp:#8a2f2f;
4008 --m-fn:#6a2f8a;
4009 }
4010}
4011html[data-theme='dark']{
4012 --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
4013 --muted:#666; --border:#222; --border-strong:#333; --hover:#111; --input-bg:#080808; --active:#ddd;
4014 --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
4015 --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
4016 --m-ok:#4a8a4a; --m-type:#8a8a4a; --m-bound:#4a8a8a; --m-fp:#8a4a4a; --m-fn:#8a4a8a;
4017}
4018html[data-theme='light']{
4019 --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
4020 --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0; --input-bg:#ffffff; --active:#000;
4021 --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
4022 --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
4023 --m-ok:#2f8a2f; --m-type:#8a7a2f; --m-bound:#2f7a8a; --m-fp:#8a2f2f; --m-fn:#6a2f8a;
4024}
4025
4026<style>
4027*{box-sizing:border-box;margin:0;padding:0}
4028body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
4029h1,h2{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
4030h1{font-size:14px}h2{font-size:12px}
4031table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
4032th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
4033th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
4034tr:hover{background:var(--hover)}
4035.grid{display:grid;grid-template-columns:1fr 1fr;gap:8px}
4036.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
4037.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
4038.stats{display:flex;gap:24px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
4039.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
4040/* Entities */
4041.e{padding:1px 2px;border-bottom:2px solid}
4042.seg{cursor:pointer}
4043.e-gold{background:var(--gold-bg);border-color:var(--gold-br);color:var(--gold-tx)}
4044.e-pred{background:var(--pred-bg);border-color:var(--pred-br);color:var(--pred-tx)}
4045.e-active{outline:1px solid var(--active);outline-offset:1px}
4046/* Match types */
4047.correct{background:#1a2e1a;border-color:#4a8a4a}
4048.type-err{background:#2e2e1a;border-color:#8a8a4a}
4049.boundary{background:#1a2e2e;border-color:#4a8a8a}
4050.spurious{background:#2e1a1a;border-color:#8a4a4a}
4051.missed{background:#2e1a2e;border-color:#8a4a8a}
4052.match-row.correct{border-left:3px solid var(--m-ok)}
4053.match-row.type-err{border-left:3px solid var(--m-type)}
4054.match-row.boundary{border-left:3px solid var(--m-bound)}
4055.match-row.spurious{border-left:3px solid var(--m-fp)}
4056.match-row.missed{border-left:3px solid var(--m-fn)}
4057.match-row.active{outline:1px solid var(--muted)}
4058.sel{color:var(--muted);margin:6px 0 12px}
4059.metric{font-size:14px;color:var(--muted)}.metric b{color:var(--text-strong)}
4060</style>
4061</head>
4062<body>
4063"#);
4064
4065 html.push_str(&format!(
4067 "<div class=\"panel-h\" style=\"justify-content:space-between\"><h1>{}</h1><span class=\"toggle\" id=\"theme-toggle\" title=\"toggle theme (auto → dark → light)\">theme: auto</span></div>",
4068 title
4069 ));
4070
4071 html.push_str("<div class=\"stats\">");
4073 html.push_str(&format!(
4074 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">gold</div></div>",
4075 cmp.gold.len()
4076 ));
4077 html.push_str(&format!(
4078 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">predicted</div></div>",
4079 cmp.predicted.len()
4080 ));
4081 html.push_str(&format!(
4082 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">correct</div></div>",
4083 cmp.correct_count()
4084 ));
4085 html.push_str(&format!(
4086 "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">errors</div></div>",
4087 cmp.error_count()
4088 ));
4089 html.push_str(&format!(
4090 "<div class=\"metric\">P=<b>{:.1}%</b> R=<b>{:.1}%</b> F1=<b>{:.1}%</b></div>",
4091 cmp.precision() * 100.0,
4092 cmp.recall() * 100.0,
4093 cmp.f1() * 100.0
4094 ));
4095 html.push_str("</div>");
4096
4097 html.push_str("<div id=\"selection\" class=\"sel\">click a match row to select spans</div>");
4099
4100 html.push_str("<div class=\"grid\">");
4102
4103 html.push_str("<div class=\"panel\"><h2>gold (ground truth)</h2><div class=\"text-box\">");
4105 let gold_spans: Vec<EvalHtmlSpan> = cmp
4106 .gold
4107 .iter()
4108 .map(|s| {
4109 let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
4110 EvalHtmlSpan {
4111 start,
4112 end,
4113 label: s.label.to_string(),
4114 class: "e-gold",
4115 id: format!("G{}", s.id),
4116 }
4117 })
4118 .collect();
4119 html.push_str(&annotate_text_spans(&cmp.text, &gold_spans));
4120 html.push_str("</div></div>");
4121
4122 html.push_str("<div class=\"panel\"><h2>predicted</h2><div class=\"text-box\">");
4124 let pred_spans: Vec<EvalHtmlSpan> = cmp
4125 .predicted
4126 .iter()
4127 .map(|s| {
4128 let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
4129 EvalHtmlSpan {
4130 start,
4131 end,
4132 label: s.label.to_string(),
4133 class: "e-pred",
4134 id: format!("P{}", s.id),
4135 }
4136 })
4137 .collect();
4138 html.push_str(&annotate_text_spans(&cmp.text, &pred_spans));
4139 html.push_str("</div></div>");
4140
4141 html.push_str("</div>");
4142
4143 html.push_str("<h2>matches</h2><table>");
4145 html.push_str("<tr><th>type</th><th>gold</th><th>predicted</th><th>notes</th></tr>");
4146
4147 for (mi, m) in cmp.matches.iter().enumerate() {
4148 let (class, mtype, gold_text, pred_text, notes, gid, pid) = match m {
4149 EvalMatch::Correct { gold_id, pred_id } => {
4150 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4151 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4152 (
4153 "correct",
4154 "✓",
4155 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4156 .unwrap_or_default(),
4157 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4158 .unwrap_or_default(),
4159 String::new(),
4160 Some(format!("G{}", gold_id)),
4161 Some(format!("P{}", pred_id)),
4162 )
4163 }
4164 EvalMatch::TypeMismatch {
4165 gold_id,
4166 pred_id,
4167 gold_label,
4168 pred_label,
4169 } => {
4170 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4171 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4172 (
4173 "type-err",
4174 "type",
4175 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4176 .unwrap_or_default(),
4177 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4178 .unwrap_or_default(),
4179 format!("{} → {}", gold_label, pred_label),
4180 Some(format!("G{}", gold_id)),
4181 Some(format!("P{}", pred_id)),
4182 )
4183 }
4184 EvalMatch::BoundaryError {
4185 gold_id,
4186 pred_id,
4187 iou,
4188 } => {
4189 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4190 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4191 (
4192 "boundary",
4193 "bound",
4194 g.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4195 .unwrap_or_default(),
4196 p.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4197 .unwrap_or_default(),
4198 format!("IoU={:.2}", iou),
4199 Some(format!("G{}", gold_id)),
4200 Some(format!("P{}", pred_id)),
4201 )
4202 }
4203 EvalMatch::Spurious { pred_id } => {
4204 let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4205 (
4206 "spurious",
4207 "FP",
4208 String::new(),
4209 p.map(|s| format!("[{}] {}", s.label, s.surface()))
4210 .unwrap_or_default(),
4211 "false positive".to_string(),
4212 None,
4213 Some(format!("P{}", pred_id)),
4214 )
4215 }
4216 EvalMatch::Missed { gold_id } => {
4217 let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4218 (
4219 "missed",
4220 "FN",
4221 g.map(|s| format!("[{}] {}", s.label, s.surface()))
4222 .unwrap_or_default(),
4223 String::new(),
4224 "false negative".to_string(),
4225 Some(format!("G{}", gold_id)),
4226 None,
4227 )
4228 }
4229 };
4230
4231 let mut data_attrs = String::new();
4232 if let Some(gid) = gid.as_deref() {
4233 data_attrs.push_str(&format!(" data-gid=\"{}\"", html_escape(gid)));
4234 }
4235 if let Some(pid) = pid.as_deref() {
4236 data_attrs.push_str(&format!(" data-pid=\"{}\"", html_escape(pid)));
4237 }
4238
4239 html.push_str(&format!(
4240 "<tr id=\"M{mid}\" class=\"match-row {class}\"{attrs}><td><a class=\"match-link\" href=\"#M{mid}\">{mtype}</a></td><td>{gold}</td><td>{pred}</td><td>{notes}</td></tr>",
4241 mid = mi,
4242 class = class,
4243 attrs = data_attrs,
4244 mtype = html_escape(mtype),
4245 gold = html_escape(&gold_text),
4246 pred = html_escape(&pred_text),
4247 notes = html_escape(¬es)
4248 ));
4249 }
4250 html.push_str("</table>");
4251
4252 html.push_str(
4253 r#"<script>
4254(() => {
4255 // Theme toggle: auto (prefers-color-scheme) → dark → light.
4256 const themeBtn = document.getElementById('theme-toggle');
4257 const themeKey = 'anno-theme';
4258 const applyTheme = (theme) => {
4259 const t = theme || 'auto';
4260 if (t === 'auto') {
4261 delete document.documentElement.dataset.theme;
4262 } else {
4263 document.documentElement.dataset.theme = t;
4264 }
4265 if (themeBtn) themeBtn.textContent = `theme: ${t}`;
4266 };
4267 const readTheme = () => {
4268 try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
4269 };
4270 const writeTheme = (t) => {
4271 try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
4272 };
4273 applyTheme(readTheme());
4274 if (themeBtn) {
4275 themeBtn.addEventListener('click', () => {
4276 const cur = readTheme();
4277 const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
4278 writeTheme(next);
4279 applyTheme(next);
4280 });
4281 }
4282
4283 function clearActive() {
4284 document.querySelectorAll(".e-active").forEach((el) => el.classList.remove("e-active"));
4285 document.querySelectorAll("tr.match-row.active").forEach((el) => el.classList.remove("active"));
4286 }
4287
4288 function findSpanEls(eid) {
4289 if (!eid) return [];
4290 // New segmented renderer: one span can be split across multiple elements.
4291 const els = Array.from(document.querySelectorAll(`span.e[data-eids~='${eid}']`));
4292 if (els.length) return els;
4293 // Back-compat: older HTML used a single element id.
4294 const single = document.getElementById(eid);
4295 return single ? [single] : [];
4296 }
4297
4298 function activate(gid, pid, row) {
4299 clearActive();
4300 const gEls = findSpanEls(gid);
4301 const pEls = findSpanEls(pid);
4302 const sel = document.getElementById("selection");
4303 gEls.forEach((el) => el.classList.add("e-active"));
4304 pEls.forEach((el) => el.classList.add("e-active"));
4305 if (row) row.classList.add("active");
4306 if (sel) {
4307 const parts = [];
4308 if (gEls.length) {
4309 const lbl = gEls[0].dataset && gEls[0].dataset.label ? ` [${gEls[0].dataset.label}]` : "";
4310 parts.push(`gold ${gid}${lbl}`);
4311 }
4312 if (pEls.length) {
4313 const lbl = pEls[0].dataset && pEls[0].dataset.label ? ` [${pEls[0].dataset.label}]` : "";
4314 parts.push(`pred ${pid}${lbl}`);
4315 }
4316 sel.textContent = parts.length ? parts.join(" | ") : "no selection";
4317 }
4318 if (row && row.id) {
4319 // Keep deep links stable without triggering navigation jump.
4320 // NOTE: single quotes avoid the Rust raw-string delimiter issue with quote+hash.
4321 history.replaceState(null, "", '#' + row.id);
4322 }
4323 const target = gEls[0] || pEls[0];
4324 if (target) target.scrollIntoView({ behavior: "smooth", block: "center" });
4325 }
4326
4327 document.querySelectorAll("tr.match-row[data-gid], tr.match-row[data-pid]").forEach((tr) => {
4328 tr.addEventListener("click", () => activate(tr.dataset.gid, tr.dataset.pid, tr));
4329 });
4330
4331 document.querySelectorAll("a.match-link").forEach((a) => {
4332 a.addEventListener("click", (ev) => {
4333 ev.preventDefault();
4334 const tr = a.closest("tr.match-row");
4335 if (!tr) return;
4336 activate(tr.dataset.gid, tr.dataset.pid, tr);
4337 });
4338 });
4339
4340 // Auto-select a match row if the URL has a deep link (e.g. #M12).
4341 const hash = (location.hash || "").slice(1);
4342 if (hash && hash.startsWith("M")) {
4343 const tr = document.getElementById(hash);
4344 if (tr && tr.classList && tr.classList.contains("match-row")) {
4345 activate(tr.dataset.gid, tr.dataset.pid, tr);
4346 }
4347 }
4348})();
4349</script>"#,
4350 );
4351
4352 html.push_str("</body></html>");
4353 html
4354}
4355
/// One annotated character range handed to [`annotate_text_spans`].
#[derive(Debug, Clone)]
struct EvalHtmlSpan {
    /// Start of the range as a character (not byte) offset, inclusive.
    start: usize,
    /// End of the range as a character offset, exclusive.
    end: usize,
    /// Label written to the rendered segment's `data-label` attribute.
    label: String,
    /// CSS class added to the rendered `<span>` element.
    class: &'static str,
    /// Identifier listed in the rendered segment's `data-eids` attribute.
    id: String,
}
4365
4366fn annotate_text_spans(text: &str, spans: &[EvalHtmlSpan]) -> String {
4367 let char_count = text.chars().count();
4368 if char_count == 0 || spans.is_empty() {
4369 return html_escape(text);
4370 }
4371
4372 #[derive(Debug, Clone)]
4373 struct Meta {
4374 id: String,
4375 label: String,
4376 class: &'static str,
4377 len: usize,
4378 }
4379 #[derive(Debug, Clone)]
4380 struct Event {
4381 pos: usize,
4382 meta_idx: usize,
4383 delta: i32,
4384 }
4385
4386 let mut metas: Vec<Meta> = Vec::with_capacity(spans.len());
4387 let mut events: Vec<Event> = Vec::new();
4388 let mut boundaries: Vec<usize> = vec![0, char_count];
4389
4390 for s in spans {
4391 let start = s.start.min(char_count);
4392 let end = s.end.min(char_count);
4393 if start >= end {
4394 continue;
4395 }
4396 let meta_idx = metas.len();
4397 metas.push(Meta {
4398 id: s.id.clone(),
4399 label: s.label.to_string(),
4400 class: s.class,
4401 len: end - start,
4402 });
4403 boundaries.push(start);
4404 boundaries.push(end);
4405 events.push(Event {
4406 pos: start,
4407 meta_idx,
4408 delta: 1,
4409 });
4410 events.push(Event {
4411 pos: end,
4412 meta_idx,
4413 delta: -1,
4414 });
4415 }
4416
4417 if metas.is_empty() {
4418 return html_escape(text);
4419 }
4420
4421 boundaries.sort_unstable();
4422 boundaries.dedup();
4423 events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
4424
4425 let mut active_counts: Vec<u32> = vec![0; metas.len()];
4426 let mut active: Vec<usize> = Vec::new();
4427 let mut ev_idx = 0usize;
4428 let mut result = String::new();
4429
4430 for bi in 0..boundaries.len().saturating_sub(1) {
4431 let pos = boundaries[bi];
4432 while ev_idx < events.len() && events[ev_idx].pos == pos {
4433 let e = &events[ev_idx];
4434 let idx = e.meta_idx;
4435 if e.delta < 0 {
4436 if active_counts[idx] > 0 {
4437 active_counts[idx] -= 1;
4438 if active_counts[idx] == 0 {
4439 active.retain(|&x| x != idx);
4440 }
4441 }
4442 } else {
4443 active_counts[idx] += 1;
4444 if active_counts[idx] == 1 {
4445 active.push(idx);
4446 }
4447 }
4448 ev_idx += 1;
4449 }
4450
4451 let next = boundaries[bi + 1];
4452 if next <= pos {
4453 continue;
4454 }
4455
4456 let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
4457 if active.is_empty() {
4458 result.push_str(&html_escape(&seg_text));
4459 continue;
4460 }
4461
4462 let primary_idx = active
4463 .iter()
4464 .copied()
4465 .min_by_key(|i| metas[*i].len)
4466 .unwrap_or(active[0]);
4467 let primary = &metas[primary_idx];
4468 let mut eids: Vec<&str> = active.iter().map(|i| metas[*i].id.as_str()).collect();
4469 eids.sort_unstable();
4470 let data_eids = eids.join(" ");
4471
4472 let title = format!(
4473 "eids=[{}] primary={} [{}..{})",
4474 data_eids, primary.id, pos, next
4475 );
4476 result.push_str(&format!(
4477 "<span class=\"e seg {class}\" data-eids=\"{eids}\" data-label=\"{label}\" data-start=\"{start}\" data-end=\"{end}\" title=\"{title}\">{text}</span>",
4478 class = primary.class,
4479 eids = html_escape(&data_eids),
4480 label = html_escape(&primary.label),
4481 start = pos,
4482 end = next,
4483 title = html_escape(&title),
4484 text = html_escape(&seg_text)
4485 ));
4486 }
4487
4488 result
4489}
4490
/// Options accepted by document processing.
///
/// The derived `Default` yields an empty `labels` list and a `threshold` of
/// `0.0`.
#[derive(Debug, Clone, Default)]
pub struct ProcessOptions {
    /// Label names for the run.
    // NOTE(review): presumably restricts extraction to these labels — confirm
    // against the code that consumes `ProcessOptions`.
    pub labels: Vec<String>,
    /// Numeric cut-off for the run.
    // NOTE(review): presumably a minimum confidence — confirm against the consumer.
    pub threshold: f32,
}
4503
4504#[derive(Debug)]
4506pub struct ProcessResult {
4507 pub document: GroundedDocument,
4509 pub valid: bool,
4511 pub errors: Vec<SignalValidationError>,
4513}
4514
4515impl ProcessResult {
4516 #[must_use]
4518 pub fn to_html(&self) -> String {
4519 render_document_html(&self.document)
4520 }
4521}
4522
4523#[derive(Debug, Clone)]
4532pub struct Corpus {
4533 documents: std::collections::HashMap<String, GroundedDocument>,
4534 identities: std::collections::HashMap<IdentityId, Identity>,
4535 next_identity_id: IdentityId,
4536}
4537
4538impl Corpus {
4539 #[must_use]
4541 pub fn new() -> Self {
4542 Self {
4543 documents: std::collections::HashMap::new(),
4544 identities: std::collections::HashMap::new(),
4545 next_identity_id: IdentityId::ZERO,
4546 }
4547 }
4548
4549 #[must_use]
4551 pub fn identities(&self) -> &std::collections::HashMap<IdentityId, Identity> {
4552 &self.identities
4553 }
4554
4555 #[must_use]
4557 pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
4558 self.identities.get(&id)
4559 }
4560
4561 pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
4566 let id = self.next_identity_id;
4567 identity.id = id;
4568 self.identities.insert(id, identity);
4569 self.next_identity_id += 1;
4570 id
4571 }
4572
4573 #[must_use]
4577 pub fn next_identity_id(&self) -> IdentityId {
4578 self.next_identity_id
4579 }
4580
4581 pub fn documents(&self) -> impl Iterator<Item = &GroundedDocument> {
4585 self.documents.values()
4586 }
4587
4588 #[must_use]
4592 pub fn get_document(&self, doc_id: &str) -> Option<&GroundedDocument> {
4593 self.documents.get(doc_id)
4594 }
4595
4596 pub fn get_document_mut(&mut self, doc_id: &str) -> Option<&mut GroundedDocument> {
4600 self.documents.get_mut(doc_id)
4601 }
4602
4603 pub fn add_document(&mut self, document: GroundedDocument) -> String {
4608 let doc_id = document.id.clone();
4609 self.documents.insert(doc_id.clone(), document);
4610 doc_id
4611 }
4612
4613 pub fn link_track_to_kb(
4635 &mut self,
4636 track_ref: &TrackRef,
4637 kb_name: impl Into<String>,
4638 kb_id: impl Into<String>,
4639 canonical_name: impl Into<String>,
4640 ) -> super::Result<IdentityId> {
4641 use super::error::Error;
4642
4643 let doc = self.documents.get_mut(&track_ref.doc_id).ok_or_else(|| {
4644 Error::track_ref(format!(
4645 "Document '{}' not found in corpus",
4646 track_ref.doc_id
4647 ))
4648 })?;
4649 let track = doc.get_track(track_ref.track_id).ok_or_else(|| {
4650 Error::track_ref(format!(
4651 "Track {} not found in document '{}'",
4652 track_ref.track_id, track_ref.doc_id
4653 ))
4654 })?;
4655
4656 let kb_name_str = kb_name.into();
4657 let kb_id_str = kb_id.into();
4658 let canonical_name_str = canonical_name.into();
4659
4660 let identity_id = if let Some(existing_id) = track.identity_id {
4662 if let Some(identity) = self.identities.get_mut(&existing_id) {
4664 identity.kb_id = Some(kb_id_str.clone());
4665 identity.kb_name = Some(kb_name_str.clone());
4666 identity.canonical_name = canonical_name_str.clone();
4667
4668 identity.source = Some(match identity.source.take() {
4670 Some(IdentitySource::CrossDocCoref { track_refs }) => IdentitySource::Hybrid {
4671 track_refs,
4672 kb_name: kb_name_str.clone(),
4673 kb_id: kb_id_str.clone(),
4674 },
4675 _ => IdentitySource::KnowledgeBase {
4676 kb_name: kb_name_str.clone(),
4677 kb_id: kb_id_str.clone(),
4678 },
4679 });
4680
4681 existing_id
4682 } else {
4683 let new_id = self.next_identity_id;
4691 self.next_identity_id += 1;
4692
4693 let identity = Identity {
4694 id: new_id,
4695 canonical_name: canonical_name_str,
4696 entity_type: track.entity_type.clone(),
4697 kb_id: Some(kb_id_str.clone()),
4698 kb_name: Some(kb_name_str.clone()),
4699 description: None,
4700 embedding: track.embedding.clone(),
4701 aliases: Vec::new(),
4702 confidence: track.cluster_confidence,
4703 source: Some(IdentitySource::KnowledgeBase {
4704 kb_name: kb_name_str,
4705 kb_id: kb_id_str,
4706 }),
4707 };
4708
4709 self.identities.insert(new_id, identity);
4710 doc.link_track_to_identity(track_ref.track_id, new_id);
4713 new_id
4714 }
4715 } else {
4716 let new_id = self.next_identity_id;
4718 self.next_identity_id += 1;
4719
4720 let identity = Identity {
4721 id: new_id,
4722 canonical_name: canonical_name_str,
4723 entity_type: track.entity_type.clone(),
4724 kb_id: Some(kb_id_str.clone()),
4725 kb_name: Some(kb_name_str.clone()),
4726 description: None,
4727 embedding: track.embedding.clone(),
4728 aliases: Vec::new(),
4729 confidence: track.cluster_confidence,
4730 source: Some(IdentitySource::KnowledgeBase {
4731 kb_name: kb_name_str,
4732 kb_id: kb_id_str,
4733 }),
4734 };
4735
4736 self.identities.insert(new_id, identity);
4737 doc.link_track_to_identity(track_ref.track_id, new_id);
4738 new_id
4739 };
4740
4741 Ok(identity_id)
4742 }
4743}
4744
4745impl Default for Corpus {
4746 fn default() -> Self {
4747 Self::new()
4748 }
4749}
4750
4751#[cfg(test)]
4752mod tests {
4753 #![allow(clippy::unwrap_used)] use super::*;
4755 use crate::EntityCategory;
4756
4757 #[test]
4758 fn test_render_eval_html_has_interactive_hooks_and_is_unicode_safe() {
4759 let text = "習近平在北京會見了普京。";
4761
4762 let gold: Vec<Signal<Location>> = vec![
4763 Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 1.0),
4764 Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "LOC", 1.0),
4765 ];
4766
4767 let predicted: Vec<Signal<Location>> = vec![
4769 Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 0.9),
4770 Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "PER", 0.7),
4771 ];
4772
4773 let cmp = EvalComparison::compare(text, gold, predicted);
4774 let html = render_eval_html_with_title(&cmp, "test");
4775
4776 assert!(html.contains("id=\"selection\""));
4778
4779 assert!(html.contains("data-eids=\"G0\""));
4781 assert!(html.contains("data-eids=\"P0\""));
4782
4783 assert!(html.contains("class=\"match-link\""));
4785 assert!(html.contains("href=\"#M0\""));
4786 assert!(html.contains("data-gid=\"G0\""));
4787 assert!(html.contains("data-pid=\"P0\""));
4788
4789 assert!(html.contains("北京"));
4791 }
4792
4793 fn find_char_span(text: &str, needle: &str) -> Option<(usize, usize)> {
4794 let hay: Vec<char> = text.chars().collect();
4795 let pat: Vec<char> = needle.chars().collect();
4796 if pat.is_empty() || hay.len() < pat.len() {
4797 return None;
4798 }
4799 for i in 0..=(hay.len() - pat.len()) {
4800 if hay[i..(i + pat.len())] == pat[..] {
4801 return Some((i, i + pat.len()));
4802 }
4803 }
4804 None
4805 }
4806
4807 #[test]
4808 fn test_annotate_text_html_supports_overlaps_discontinuous_and_unicode() {
4809 let text = "Marie Curie met Cher in Paris. 習近平在北京會見了普京。 \
4811التقى محمد بن سلمان في الرياض. Путин встретился с Си Цзиньпином в Москве. \
4812प्रधान मंत्री शर्मा दिल्ली में मिले। severe pain ... in abdomen.";
4813
4814 let (m0s, m0e) = find_char_span(text, "Marie Curie").unwrap();
4816 let (m1s, m1e) = find_char_span(text, "Curie").unwrap();
4817
4818 let pain = find_char_span(text, "pain").unwrap();
4820 let abdomen = find_char_span(text, "abdomen").unwrap();
4821
4822 let signals: Vec<Signal<Location>> = vec![
4823 Signal::new(
4824 SignalId::new(0),
4825 Location::text(m0s, m0e),
4826 "Marie Curie",
4827 "PER",
4828 0.9,
4829 ),
4830 Signal::new(
4831 SignalId::new(1),
4832 Location::text(m1s, m1e),
4833 "Curie",
4834 "PER",
4835 0.8,
4836 ),
4837 Signal::new(
4838 SignalId::new(2),
4839 Location::Discontinuous {
4840 segments: vec![pain, abdomen],
4841 },
4842 "pain … abdomen",
4843 "SYMPTOM",
4844 0.7,
4845 ),
4846 ];
4847
4848 let html = annotate_text_html(text, &signals, &std::collections::HashMap::new());
4849
4850 assert!(html.contains("data-sids=\"S0 S1\"") || html.contains("data-sids=\"S1 S0\""));
4852
4853 assert!(html.contains("data-sids=\"S2\""));
4855
4856 assert!(html.contains("北京"));
4858 assert!(html.contains("Москве"));
4859 assert!(html.contains("शर्मा"));
4860 assert!(html.contains("محمد"));
4861 }
4862
4863 #[test]
4864 fn test_location_text_iou() {
4865 let l1 = Location::text(0, 10);
4866 let l2 = Location::text(5, 15);
4867 let iou = l1.iou(&l2).unwrap();
4868 assert!((iou - 0.333).abs() < 0.01);
4872 }
4873
4874 #[test]
4875 fn test_location_bbox_iou() {
4876 let b1 = Location::bbox(0.0, 0.0, 0.5, 0.5);
4877 let b2 = Location::bbox(0.25, 0.25, 0.5, 0.5);
4878 let iou = b1.iou(&b2).unwrap();
4879 assert!((iou - 0.143).abs() < 0.01);
4883 }
4884
4885 #[test]
4886 fn test_location_different_types_no_iou() {
4887 let text = Location::text(0, 10);
4888 let bbox = Location::bbox(0.0, 0.0, 0.5, 0.5);
4889 assert!(text.iou(&bbox).is_none());
4890 }
4891
4892 #[test]
4893 fn test_signal_creation() {
4894 let signal: Signal<Location> =
4895 Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95);
4896 assert_eq!(signal.surface, "Marie Curie");
4897 assert_eq!(signal.label, "Person".into());
4898 assert!((signal.confidence - 0.95).abs() < 0.001);
4899 assert!(!signal.negated);
4900 }
4901
4902 #[test]
4903 fn test_signal_with_linguistic_features() {
4904 let signal: Signal<Location> =
4905 Signal::new(0, Location::text(0, 10), "not a doctor", "Occupation", 0.8)
4906 .negated()
4907 .with_quantifier(Quantifier::Existential)
4908 .with_modality(Modality::Symbolic);
4909
4910 assert!(signal.negated);
4911 assert_eq!(signal.quantifier, Some(Quantifier::Existential));
4912 assert!(signal.modality.supports_linguistic_features());
4913 }
4914
4915 #[test]
4916 fn test_track_formation() {
4917 let mut track = Track::new(0, "Marie Curie");
4918 track.add_signal(0, 0);
4919 track.add_signal(1, 1);
4920 track.add_signal(2, 2);
4921
4922 assert_eq!(track.len(), 3);
4923 assert!(!track.is_singleton());
4924 assert!(!track.is_empty());
4925 }
4926
4927 #[test]
4928 fn test_identity_creation() {
4929 let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186")
4930 .with_type("Person")
4931 .with_embedding(vec![0.1, 0.2, 0.3]);
4932
4933 assert_eq!(identity.canonical_name, "Marie Curie");
4934 assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4935 assert_eq!(identity.kb_name, Some("wikidata".to_string()));
4936 assert!(identity.embedding.is_some());
4937 }
4938
4939 #[test]
4940 fn test_grounded_document_hierarchy() {
4941 let mut doc = GroundedDocument::new(
4942 "doc1",
4943 "Marie Curie won the Nobel Prize. She was a physicist.",
4944 );
4945
4946 let s1 = doc.add_signal(Signal::new(
4948 0,
4949 Location::text(0, 12),
4950 "Marie Curie",
4951 "Person",
4952 0.95,
4953 ));
4954 let s2 = doc.add_signal(Signal::new(
4955 1,
4956 Location::text(38, 41),
4957 "She",
4958 "Person",
4959 0.88,
4960 ));
4961 let s3 = doc.add_signal(Signal::new(
4962 2,
4963 Location::text(17, 29),
4964 "Nobel Prize",
4965 "Award",
4966 0.92,
4967 ));
4968
4969 let mut track1 = Track::new(0, "Marie Curie");
4971 track1.add_signal(s1, 0);
4972 track1.add_signal(s2, 1);
4973 let track1_id = doc.add_track(track1);
4974
4975 let mut track2 = Track::new(1, "Nobel Prize");
4976 track2.add_signal(s3, 0);
4977 doc.add_track(track2);
4978
4979 let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
4981 let identity_id = doc.add_identity(identity);
4982 doc.link_track_to_identity(track1_id, identity_id);
4983
4984 assert_eq!(doc.signals().len(), 3);
4986 assert_eq!(doc.tracks().count(), 2);
4987 assert_eq!(doc.identities().count(), 1);
4988
4989 let track = doc.track_for_signal(s1).unwrap();
4991 assert_eq!(track.canonical_surface, "Marie Curie");
4992 assert_eq!(track.len(), 2);
4993
4994 let identity = doc.identity_for_track(track1_id).unwrap();
4996 assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4997
4998 let identity = doc.identity_for_signal(s1).unwrap();
5000 assert_eq!(identity.canonical_name, "Marie Curie");
5001 }
5002
5003 #[test]
5004 fn test_modality_features() {
5005 assert!(Modality::Symbolic.supports_linguistic_features());
5006 assert!(!Modality::Symbolic.supports_geometric_features());
5007
5008 assert!(!Modality::Iconic.supports_linguistic_features());
5009 assert!(Modality::Iconic.supports_geometric_features());
5010
5011 assert!(Modality::Hybrid.supports_linguistic_features());
5012 assert!(Modality::Hybrid.supports_geometric_features());
5013 }
5014
5015 #[test]
5016 fn test_location_from_span() {
5017 let span = Span::Text { start: 0, end: 10 };
5018 let location = Location::from(&span);
5019 assert_eq!(location.text_offsets(), Some((0, 10)));
5020
5021 let span = Span::BoundingBox {
5022 x: 0.1,
5023 y: 0.2,
5024 width: 0.3,
5025 height: 0.4,
5026 page: Some(1),
5027 };
5028 let location = Location::from(&span);
5029 assert!(matches!(location, Location::BoundingBox { .. }));
5030 }
5031
5032 #[test]
5033 fn test_entity_roundtrip() {
5034 use super::EntityType;
5035
5036 let entities = vec![
5037 Entity::new("Marie Curie", EntityType::Person, 0, 12, 0.95),
5038 Entity::new(
5039 "Nobel Prize",
5040 EntityType::custom("Award", EntityCategory::Creative),
5041 17,
5042 29,
5043 0.92,
5044 ),
5045 ];
5046
5047 let doc =
5048 GroundedDocument::from_entities("doc1", "Marie Curie won the Nobel Prize.", &entities);
5049 let converted = doc.to_entities();
5050
5051 assert_eq!(converted.len(), 2);
5052 assert_eq!(converted[0].text, "Marie Curie");
5053 assert_eq!(converted[1].text, "Nobel Prize");
5054 }
5055
5056 #[test]
5057 fn test_signal_confidence_threshold() {
5058 let signal: Signal<Location> = Signal::new(0, Location::text(0, 10), "test", "Type", 0.75);
5059 assert!(signal.is_confident(0.5));
5060 assert!(signal.is_confident(0.75));
5061 assert!(!signal.is_confident(0.8));
5062 }
5063
5064 #[test]
5065 fn test_document_filtering() {
5066 let mut doc = GroundedDocument::new("doc1", "Test text");
5067
5068 doc.add_signal(Signal::new(0, Location::text(0, 4), "high", "Person", 0.95));
5070 doc.add_signal(Signal::new(1, Location::text(5, 8), "low", "Person", 0.3));
5071 doc.add_signal(Signal::new(
5072 2,
5073 Location::text(9, 12),
5074 "org",
5075 "Organization",
5076 0.8,
5077 ));
5078
5079 let confident = doc.confident_signals(0.5);
5081 assert_eq!(confident.len(), 2);
5082
5083 let persons = doc.signals_with_label("Person");
5085 assert_eq!(persons.len(), 2);
5086
5087 let orgs = doc.signals_with_label("Organization");
5088 assert_eq!(orgs.len(), 1);
5089 }
5090
5091 #[test]
5092 fn test_untracked_signals() {
5093 let mut doc = GroundedDocument::new("doc1", "Test");
5094
5095 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
5096 let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
5097 let _s3 = doc.add_signal(Signal::new(2, Location::text(9, 12), "c", "T", 0.9));
5098
5099 let mut track = Track::new(0, "a");
5101 track.add_signal(s1, 0);
5102 track.add_signal(s2, 1);
5103 doc.add_track(track);
5104
5105 assert_eq!(doc.untracked_signal_count(), 1);
5107 let untracked = doc.untracked_signals();
5108 assert_eq!(untracked.len(), 1);
5109 assert_eq!(untracked[0].surface, "c");
5110 }
5111
5112 #[test]
5113 fn test_linked_unlinked_tracks() {
5114 let mut doc = GroundedDocument::new("doc1", "Test");
5115
5116 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
5117 let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
5118
5119 let mut track1 = Track::new(0, "a");
5120 track1.add_signal(s1, 0);
5121 let track1_id = doc.add_track(track1);
5122
5123 let mut track2 = Track::new(1, "b");
5124 track2.add_signal(s2, 0);
5125 doc.add_track(track2);
5126
5127 let identity = Identity::new(0, "Entity A");
5129 let identity_id = doc.add_identity(identity);
5130 doc.link_track_to_identity(track1_id, identity_id);
5131
5132 assert_eq!(doc.linked_tracks().count(), 1);
5133 assert_eq!(doc.unlinked_tracks().count(), 1);
5134 }
5135
5136 #[test]
5137 fn test_location_overlaps() {
5138 let l1 = Location::text(0, 10);
5139 let l2 = Location::text(5, 15);
5140 let l3 = Location::text(15, 20);
5141
5142 assert!(l1.overlaps(&l2));
5143 assert!(!l1.overlaps(&l3));
5144 assert!(!l2.overlaps(&l3)); let b1 = Location::bbox(0.0, 0.0, 0.5, 0.5);
5148 let b2 = Location::bbox(0.4, 0.4, 0.5, 0.5);
5149 let b3 = Location::bbox(0.6, 0.6, 0.2, 0.2);
5150
5151 assert!(b1.overlaps(&b2));
5152 assert!(!b1.overlaps(&b3));
5153 }
5154
5155 #[test]
5156 fn test_iou_edge_cases() {
5157 let l1 = Location::text(0, 5);
5159 let l2 = Location::text(10, 15);
5160 assert_eq!(l1.iou(&l2), Some(0.0));
5161
5162 let l3 = Location::text(0, 10);
5164 let l4 = Location::text(0, 10);
5165 assert_eq!(l3.iou(&l4), Some(1.0));
5166
5167 let l5 = Location::text(0, 20);
5169 let l6 = Location::text(5, 15);
5170 let iou = l5.iou(&l6).unwrap();
5171 assert!((iou - 0.5).abs() < 0.001);
5173 }
5174
5175 #[test]
5179 fn test_document_stats() {
5180 let mut doc = GroundedDocument::new("doc1", "Test document with entities.");
5181
5182 let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9));
5184 let mut negated = Signal::new(0, Location::text(5, 13), "document", "Type", 0.8);
5185 negated.negated = true;
5186 let s2 = doc.add_signal(negated);
5187 let _s3 = doc.add_signal(Signal::new(
5188 0,
5189 Location::text(19, 27),
5190 "entities",
5191 "Type",
5192 0.7,
5193 ));
5194
5195 let mut track = Track::new(0, "Test");
5197 track.add_signal(s1, 0);
5198 track.add_signal(s2, 1);
5199 doc.add_track(track);
5200
5201 let identity = Identity::new(0, "Test Entity");
5203 let identity_id = doc.add_identity(identity);
5204 doc.link_track_to_identity(0, identity_id);
5205
5206 let stats = doc.stats();
5207
5208 assert_eq!(stats.signal_count, 3);
5209 assert_eq!(stats.track_count, 1);
5210 assert_eq!(stats.identity_count, 1);
5211 assert_eq!(stats.linked_track_count, 1);
5212 assert_eq!(stats.untracked_count, 1); assert_eq!(stats.negated_count, 1);
5214 assert!((stats.avg_confidence - 0.8).abs() < 0.01); assert!((stats.avg_track_size - 2.0).abs() < 0.01);
5216 }
5217
5218 #[test]
5219 fn test_batch_operations() {
5220 let mut doc = GroundedDocument::new("doc1", "Test document.");
5221
5222 let signals = vec![
5224 Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9),
5225 Signal::new(0, Location::text(5, 13), "document", "Type", 0.8),
5226 ];
5227 let ids = doc.add_signals(signals);
5228
5229 assert_eq!(ids.len(), 2);
5230 assert_eq!(doc.signals().len(), 2);
5231
5232 let track_id = doc.create_track_from_signals("Test", &ids);
5234 assert!(track_id.is_some());
5235
5236 let track = doc.get_track(track_id.unwrap()).unwrap();
5237 assert_eq!(track.len(), 2);
5238 assert_eq!(track.canonical_surface, "Test");
5239 }
5240
5241 #[test]
5242 fn test_merge_tracks() {
5243 let mut doc = GroundedDocument::new("doc1", "John Smith works at Acme. He is great.");
5244
5245 let s1 = doc.add_signal(Signal::new(
5247 0,
5248 Location::text(0, 10),
5249 "John Smith",
5250 "Person",
5251 0.9,
5252 ));
5253 let s2 = doc.add_signal(Signal::new(0, Location::text(26, 28), "He", "Person", 0.8));
5254
5255 let mut track1 = Track::new(0, "John Smith");
5257 track1.add_signal(s1, 0);
5258 let track1_id = doc.add_track(track1);
5259
5260 let mut track2 = Track::new(0, "He");
5261 track2.add_signal(s2, 0);
5262 let track2_id = doc.add_track(track2);
5263
5264 assert_eq!(doc.tracks().count(), 2);
5265
5266 let merged_id = doc.merge_tracks(&[track1_id, track2_id]);
5268 assert!(merged_id.is_some());
5269
5270 assert_eq!(doc.tracks().count(), 1);
5272 let merged = doc.get_track(merged_id.unwrap()).unwrap();
5273 assert_eq!(merged.len(), 2);
5274 assert_eq!(merged.canonical_surface, "John Smith"); }
5276
5277 #[test]
5278 fn test_find_overlapping_pairs() {
5279 let mut doc = GroundedDocument::new("doc1", "New York City is great.");
5280
5281 doc.add_signal(Signal::new(
5283 0,
5284 Location::text(0, 13),
5285 "New York City",
5286 "Location",
5287 0.9,
5288 ));
5289 doc.add_signal(Signal::new(
5290 0,
5291 Location::text(0, 8),
5292 "New York",
5293 "Location",
5294 0.85,
5295 ));
5296 doc.add_signal(Signal::new(0, Location::text(17, 22), "great", "Adj", 0.7)); let pairs = doc.find_overlapping_signal_pairs();
5299
5300 assert_eq!(pairs.len(), 1);
5302 }
5303
5304 #[test]
5305 fn test_signals_in_range() {
5306 let mut doc = GroundedDocument::new("doc1", "John went to Paris and Berlin last year.");
5307
5308 doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.9));
5309 doc.add_signal(Signal::new(
5310 0,
5311 Location::text(13, 18),
5312 "Paris",
5313 "Location",
5314 0.9,
5315 ));
5316 doc.add_signal(Signal::new(
5317 0,
5318 Location::text(23, 29),
5319 "Berlin",
5320 "Location",
5321 0.9,
5322 ));
5323 doc.add_signal(Signal::new(
5324 0,
5325 Location::text(30, 39),
5326 "last year",
5327 "Date",
5328 0.8,
5329 ));
5330
5331 let in_range = doc.signals_in_range(10, 30);
5333 assert_eq!(in_range.len(), 2); let surfaces: Vec<_> = in_range.iter().map(|s| &s.surface).collect();
5336 assert!(surfaces.contains(&&"Paris".to_string()));
5337 assert!(surfaces.contains(&&"Berlin".to_string()));
5338 }
5339
5340 #[test]
5341 fn test_modality_filtering() {
5342 let mut doc = GroundedDocument::new("doc1", "Test");
5343
5344 let mut text_signal = Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9);
5346 text_signal.modality = Modality::Symbolic;
5347 doc.add_signal(text_signal);
5348
5349 let mut visual_signal =
5351 Signal::new(0, Location::bbox(0.0, 0.0, 0.5, 0.5), "Box", "Type", 0.8);
5352 visual_signal.modality = Modality::Iconic;
5353 doc.add_signal(visual_signal);
5354
5355 assert_eq!(doc.text_signals().len(), 1);
5356 assert_eq!(doc.visual_signals().len(), 1);
5357 assert_eq!(doc.signals_by_modality(Modality::Hybrid).len(), 0);
5358 }
5359
5360 #[test]
5361 fn test_quantifier_variants() {
5362 let quantifiers = [
5364 Quantifier::Universal,
5365 Quantifier::Existential,
5366 Quantifier::None,
5367 Quantifier::Definite,
5368 Quantifier::Bare,
5369 ];
5370
5371 for q in quantifiers {
5372 let signal: Signal<Location> =
5373 Signal::new(0, Location::text(0, 5), "test", "Type", 0.9).with_quantifier(q);
5374
5375 assert_eq!(signal.quantifier, Some(q));
5376 }
5377 }
5378
5379 #[test]
5380 fn test_location_modality_derivation() {
5381 assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
5382 assert_eq!(
5383 Location::bbox(0.0, 0.0, 0.5, 0.5).modality(),
5384 Modality::Iconic
5385 );
5386
5387 let temporal = Location::Temporal {
5388 start_sec: 0.0,
5389 end_sec: 5.0,
5390 frame: None,
5391 };
5392 assert_eq!(temporal.modality(), Modality::Iconic);
5393
5394 let genomic = Location::Genomic {
5395 contig: "chr1".into(),
5396 start: 0,
5397 end: 1000,
5398 strand: Some('+'),
5399 };
5400 assert_eq!(genomic.modality(), Modality::Symbolic);
5401
5402 let hybrid = Location::TextWithBbox {
5403 start: 0,
5404 end: 10,
5405 bbox: Box::new(Location::bbox(0.0, 0.0, 0.5, 0.5)),
5406 };
5407 assert_eq!(hybrid.modality(), Modality::Hybrid);
5408 }
5409
5410 }
5413
5414#[cfg(test)]
5422mod proptests {
5423 #![allow(clippy::unwrap_used)] use super::*;
5425 use proptest::prelude::*;
5426
5427 fn confidence_strategy() -> impl Strategy<Value = f32> {
5433 0.0f32..=1.0
5434 }
5435
5436 fn label_strategy() -> impl Strategy<Value = String> {
5438 prop_oneof![
5439 Just("Person".to_string()),
5440 Just("Organization".to_string()),
5441 Just("Location".to_string()),
5442 Just("Date".to_string()),
5443 "[A-Z][a-z]{2,10}".prop_map(|s| s),
5444 ]
5445 }
5446
5447 fn surface_strategy() -> impl Strategy<Value = String> {
5449 "[A-Za-z ]{1,50}".prop_map(|s| s.trim().to_string())
5450 }
5451
5452 proptest! {
5457 #[test]
5459 fn iou_symmetric(
5460 start1 in 0usize..1000,
5461 len1 in 1usize..500,
5462 start2 in 0usize..1000,
5463 len2 in 1usize..500,
5464 ) {
5465 let a = Location::text(start1, start1 + len1);
5466 let b = Location::text(start2, start2 + len2);
5467
5468 let iou_ab = a.iou(&b);
5469 let iou_ba = b.iou(&a);
5470
5471 prop_assert_eq!(iou_ab, iou_ba, "IoU must be symmetric");
5472 }
5473
5474 #[test]
5476 fn iou_bounded(
5477 start1 in 0usize..1000,
5478 len1 in 1usize..500,
5479 start2 in 0usize..1000,
5480 len2 in 1usize..500,
5481 ) {
5482 let a = Location::text(start1, start1 + len1);
5483 let b = Location::text(start2, start2 + len2);
5484
5485 if let Some(iou) = a.iou(&b) {
5486 prop_assert!(iou >= 0.0, "IoU must be non-negative: got {}", iou);
5487 prop_assert!(iou <= 1.0, "IoU must be at most 1: got {}", iou);
5488 }
5489 }
5490
5491 #[test]
5493 fn iou_self_identity(start in 0usize..1000, len in 1usize..500) {
5494 let loc = Location::text(start, start + len);
5495 let iou = loc.iou(&loc).unwrap();
5496 prop_assert!(
5497 (iou - 1.0).abs() < 1e-6,
5498 "Self-IoU must be 1.0, got {}",
5499 iou
5500 );
5501 }
5502
5503 #[test]
5505 fn iou_non_overlapping_zero(
5506 start1 in 0usize..500,
5507 len1 in 1usize..100,
5508 ) {
5509 let end1 = start1 + len1;
5510 let start2 = end1 + 100; let len2 = 50;
5512
5513 let a = Location::text(start1, end1);
5514 let b = Location::text(start2, start2 + len2);
5515
5516 let iou = a.iou(&b).expect("bbox iou should be defined");
5517 prop_assert!(
5518 iou.abs() < 1e-6,
5519 "Non-overlapping IoU must be 0, got {}",
5520 iou
5521 );
5522 }
5523
5524 #[test]
5526 fn bbox_iou_symmetric_bounded(
5527 x1 in 0.0f32..0.8,
5528 y1 in 0.0f32..0.8,
5529 w1 in 0.05f32..0.2,
5530 h1 in 0.05f32..0.2,
5531 x2 in 0.0f32..0.8,
5532 y2 in 0.0f32..0.8,
5533 w2 in 0.05f32..0.2,
5534 h2 in 0.05f32..0.2,
5535 ) {
5536 let a = Location::bbox(x1, y1, w1, h1);
5537 let b = Location::bbox(x2, y2, w2, h2);
5538
5539 let iou_ab = a.iou(&b);
5540 let iou_ba = b.iou(&a);
5541
5542 prop_assert_eq!(iou_ab, iou_ba, "BBox IoU must be symmetric");
5544
5545 if let Some(iou) = iou_ab {
5547 prop_assert!(
5548 (0.0..=1.0).contains(&iou),
5549 "BBox IoU out of bounds: {}",
5550 iou
5551 );
5552 }
5553 }
5554 }
5555
5556 proptest! {
5561 #[test]
5563 fn signal_confidence_clamped(raw_conf in -10.0f32..10.0) {
5564 let signal: Signal<Location> = Signal::new(
5565 0,
5566 Location::text(0, 10),
5567 "test",
5568 "Type",
5569 raw_conf,
5570 );
5571
5572 prop_assert!(signal.confidence >= 0.0, "Confidence below 0: {}", signal.confidence);
5573 prop_assert!(signal.confidence <= 1.0, "Confidence above 1: {}", signal.confidence);
5574 }
5575
5576 #[test]
5578 fn signal_preserves_data(
5579 surface in surface_strategy(),
5580 label in label_strategy(),
5581 conf in confidence_strategy(),
5582 start in 0usize..1000,
5583 len in 1usize..100,
5584 ) {
5585 let signal: Signal<Location> = Signal::new(
5586 0,
5587 Location::text(start, start + len),
5588 &surface,
5589 label.as_str(),
5590 conf,
5591 );
5592
5593 prop_assert_eq!(&signal.surface, &surface);
5594 let want = crate::TypeLabel::from(label.as_str());
5595 prop_assert_eq!(signal.label, want);
5596 }
5597
5598 #[test]
5602 fn signal_negation_stable(conf in confidence_strategy()) {
5603 let signal: Signal<Location> = Signal::new(
5604 0,
5605 Location::text(0, 10),
5606 "test",
5607 "Type",
5608 conf,
5609 )
5610 .negated();
5611
5612 prop_assert!(signal.negated, "Signal should be negated after .negated()");
5613 }
5614
5615 #[test]
5617 fn symbolic_supports_linguistic(
5618 start in 0usize..1000,
5619 len in 1usize..100,
5620 ) {
5621 let loc = Location::text(start, start + len);
5622 prop_assert!(
5623 loc.modality().supports_linguistic_features(),
5624 "Text locations must support linguistic features"
5625 );
5626 }
5627
5628 #[test]
5630 fn iconic_supports_geometric(
5631 x in 0.0f32..0.9,
5632 y in 0.0f32..0.9,
5633 w in 0.01f32..0.5,
5634 h in 0.01f32..0.5,
5635 ) {
5636 let loc = Location::bbox(x, y, w, h);
5637 prop_assert!(
5638 loc.modality().supports_geometric_features(),
5639 "BBox locations must support geometric features"
5640 );
5641 }
5642 }
5643
5644 proptest! {
5649 #[test]
5651 fn track_length_monotonic(signal_count in 1usize..20) {
5652 let mut track = Track::new(0, "test");
5653
5654 for i in 0..signal_count {
5655 track.add_signal(i, i as u32);
5656 prop_assert_eq!(
5657 track.len(),
5658 i + 1,
5659 "Track length should be {} after adding {} signals",
5660 i + 1,
5661 i + 1
5662 );
5663 }
5664 }
5665
5666 #[test]
5668 fn track_not_empty_after_add(canonical in surface_strategy()) {
5669 let mut track = Track::new(0, &canonical);
5670 prop_assert!(track.is_empty(), "New track should be empty");
5671
5672 track.add_signal(0, 0);
5673 prop_assert!(!track.is_empty(), "Track should not be empty after add");
5674 }
5675
5676 #[test]
5678 fn track_positions_stored(signal_count in 1usize..10) {
5679 let mut track = Track::new(0, "test");
5680
5681 for i in 0..signal_count {
5682 track.add_signal(i, i as u32);
5683 }
5684
5685 for (idx, signal_ref) in track.signals.iter().enumerate() {
5686 prop_assert_eq!(
5687 signal_ref.position as usize,
5688 idx,
5689 "Signal position mismatch at index {}",
5690 idx
5691 );
5692 }
5693 }
5694 }
5695
5696 proptest! {
5701 #[test]
5703 fn document_signal_ids_monotonic(signal_count in 1usize..20) {
5704 let mut doc = GroundedDocument::new("test", "test text");
5705
5706 let mut prev_id: Option<SignalId> = None;
5707 for i in 0..signal_count {
5708 let id = doc.add_signal(Signal::new(
5709 999, Location::text(i * 10, i * 10 + 5),
5711 format!("entity_{}", i),
5712 "Type",
5713 0.9,
5714 ));
5715
5716 if let Some(prev) = prev_id {
5717 prop_assert!(id > prev, "Signal IDs should be monotonically increasing");
5718 }
5719 prev_id = Some(id);
5720 }
5721 }
5722
5723 #[test]
5725 fn document_track_membership_consistent(signal_count in 1usize..5) {
5726 let mut doc = GroundedDocument::new("test", "test text");
5727
5728 let mut signal_ids = Vec::new();
5730 for i in 0..signal_count {
5731 let id = doc.add_signal(Signal::new(
5732 0,
5733 Location::text(i * 10, i * 10 + 5),
5734 format!("entity_{}", i),
5735 "Type",
5736 0.9,
5737 ));
5738 signal_ids.push(id);
5739 }
5740
5741 let mut track = Track::new(0, "canonical");
5743 for (pos, &id) in signal_ids.iter().enumerate() {
5744 track.add_signal(id, pos as u32);
5745 }
5746 let track_id = doc.add_track(track);
5747
5748 for &signal_id in &signal_ids {
5750 let found_track = doc.track_for_signal(signal_id);
5751 prop_assert!(found_track.is_some(), "Signal should be in a track");
5752 prop_assert_eq!(
5753 found_track.unwrap().id,
5754 track_id,
5755 "Signal should be in the correct track"
5756 );
5757 }
5758 }
5759
5760 #[test]
5762 fn document_identity_transitivity(signal_count in 1usize..3) {
5763 let mut doc = GroundedDocument::new("test", "test text");
5764
5765 let mut signal_ids = Vec::new();
5767 for i in 0..signal_count {
5768 let id = doc.add_signal(Signal::new(
5769 0,
5770 Location::text(i * 10, i * 10 + 5),
5771 format!("entity_{}", i),
5772 "Type",
5773 0.9,
5774 ));
5775 signal_ids.push(id);
5776 }
5777
5778 let mut track = Track::new(0, "canonical");
5780 for (pos, &id) in signal_ids.iter().enumerate() {
5781 track.add_signal(id, pos as u32);
5782 }
5783 let track_id = doc.add_track(track);
5784
5785 let identity = Identity::from_kb(0, "Entity", "wikidata", "Q123");
5786 let identity_id = doc.add_identity(identity);
5787 doc.link_track_to_identity(track_id, identity_id);
5788
5789 for &signal_id in &signal_ids {
5791 let identity = doc.identity_for_signal(signal_id);
5792 prop_assert!(identity.is_some(), "Should find identity through signal");
5793 prop_assert_eq!(
5794 identity.unwrap().id,
5795 identity_id,
5796 "Should find correct identity"
5797 );
5798 }
5799 }
5800
5801 #[test]
5803 fn document_untracked_signals(total in 2usize..10, tracked in 0usize..10) {
5804 let tracked = tracked.min(total - 1); let mut doc = GroundedDocument::new("test", "test text");
5806
5807 let mut signal_ids = Vec::new();
5809 for i in 0..total {
5810 let id = doc.add_signal(Signal::new(
5811 0,
5812 Location::text(i * 10, i * 10 + 5),
5813 format!("entity_{}", i),
5814 "Type",
5815 0.9,
5816 ));
5817 signal_ids.push(id);
5818 }
5819
5820 let mut track = Track::new(0, "canonical");
5822 for (pos, &id) in signal_ids.iter().take(tracked).enumerate() {
5823 track.add_signal(id, pos as u32);
5824 }
5825 if tracked > 0 {
5826 doc.add_track(track);
5827 }
5828
5829 prop_assert_eq!(
5831 doc.untracked_signal_count(),
5832 total - tracked,
5833 "Wrong untracked count"
5834 );
5835 }
5836 }
5837
5838 proptest! {
5843 #[test]
5845 fn entity_roundtrip_preserves_text(
5846 text in surface_strategy(),
5847 start in 0usize..1000,
5848 len in 1usize..100,
5849 conf in 0.0f64..=1.0,
5850 ) {
5851 use super::EntityType;
5852
5853 let end = start + len;
5854 let entity = super::Entity::new(&text, EntityType::Person, start, end, conf);
5855
5856 let doc = GroundedDocument::from_entities("test", "x".repeat(end + 10), &[entity]);
5857 let converted = doc.to_entities();
5858
5859 prop_assert_eq!(converted.len(), 1, "Should have exactly one entity");
5860 prop_assert_eq!(&converted[0].text, &text, "Text should be preserved");
5861 prop_assert_eq!(converted[0].start, start, "Start should be preserved");
5862 prop_assert_eq!(converted[0].end, end, "End should be preserved");
5863 }
5864
5865 }
5868
/// Checks the modality/feature support matrix: `Iconic` is geometric only,
/// `Symbolic` is linguistic only, and `Hybrid` supports both.
///
/// The check takes no generated input, so it is a plain unit test rather than
/// a property test driven by a dummy parameter.
#[test]
fn modality_feature_consistency() {
    assert!(Modality::Iconic.supports_geometric_features());
    assert!(!Modality::Iconic.supports_linguistic_features());

    assert!(Modality::Symbolic.supports_linguistic_features());
    assert!(!Modality::Symbolic.supports_geometric_features());

    assert!(Modality::Hybrid.supports_linguistic_features());
    assert!(Modality::Hybrid.supports_geometric_features());
}
5890
5891 proptest! {
5896 #[test]
5898 fn overlap_symmetric(
5899 start1 in 0usize..1000,
5900 len1 in 1usize..100,
5901 start2 in 0usize..1000,
5902 len2 in 1usize..100,
5903 ) {
5904 let a = Location::text(start1, start1 + len1);
5905 let b = Location::text(start2, start2 + len2);
5906
5907 prop_assert_eq!(
5908 a.overlaps(&b),
5909 b.overlaps(&a),
5910 "Overlap must be symmetric"
5911 );
5912 }
5913
5914 #[test]
5916 fn overlap_reflexive(start in 0usize..1000, len in 1usize..100) {
5917 let loc = Location::text(start, start + len);
5918 prop_assert!(loc.overlaps(&loc), "Location must overlap with itself");
5919 }
5920
5921 #[test]
5923 fn iou_implies_overlap(
5924 start1 in 0usize..500,
5925 len1 in 1usize..100,
5926 start2 in 0usize..500,
5927 len2 in 1usize..100,
5928 ) {
5929 let a = Location::text(start1, start1 + len1);
5930 let b = Location::text(start2, start2 + len2);
5931
5932 if let Some(iou) = a.iou(&b) {
5933 if iou > 0.0 {
5934 prop_assert!(
5935 a.overlaps(&b),
5936 "IoU > 0 should imply overlap"
5937 );
5938 }
5939 }
5940 }
5941 }
5942
5943 proptest! {
5948 #[test]
5950 fn stats_signal_count_accurate(signal_count in 0usize..20) {
5951 let mut doc = GroundedDocument::new("test", "test");
5952 for i in 0..signal_count {
5953 doc.add_signal(Signal::new(
5954 0,
5955 Location::text(i * 10, i * 10 + 5),
5956 "entity",
5957 "Type",
5958 0.9,
5959 ));
5960 }
5961
5962 let stats = doc.stats();
5963 prop_assert_eq!(stats.signal_count, signal_count);
5964 }
5965
5966 #[test]
5968 fn stats_track_count_accurate(track_count in 0usize..10) {
5969 let mut doc = GroundedDocument::new("test", "test");
5970 for i in 0..track_count {
5971 let id = doc.add_signal(Signal::new(
5972 0,
5973 Location::text(i * 10, i * 10 + 5),
5974 "entity",
5975 "Type",
5976 0.9,
5977 ));
5978 let mut track = Track::new(0, format!("track_{}", i));
5979 track.add_signal(id, 0);
5980 doc.add_track(track);
5981 }
5982
5983 let stats = doc.stats();
5984 prop_assert_eq!(stats.track_count, track_count);
5985 }
5986
5987 #[test]
5989 fn stats_avg_confidence_bounded(
5990 confidences in proptest::collection::vec(0.0f32..=1.0, 1..10)
5991 ) {
5992 let mut doc = GroundedDocument::new("test", "test");
5993 for (i, conf) in confidences.iter().enumerate() {
5994 doc.add_signal(Signal::new(
5995 0,
5996 Location::text(i * 10, i * 10 + 5),
5997 "entity",
5998 "Type",
5999 *conf,
6000 ));
6001 }
6002
6003 let stats = doc.stats();
6004 prop_assert!(stats.avg_confidence >= 0.0);
6005 prop_assert!(stats.avg_confidence <= 1.0);
6006 }
6007 }
6008
6009 proptest! {
6014 #[test]
6016 fn batch_add_returns_all_ids(count in 1usize..10) {
6017 let mut doc = GroundedDocument::new("test", "test");
6018 let signals: Vec<Signal<Location>> = (0..count)
6019 .map(|i| Signal::new(0, Location::text(i * 10, i * 10 + 5), "e", "T", 0.9))
6020 .collect();
6021
6022 let ids = doc.add_signals(signals);
6023 prop_assert_eq!(ids.len(), count);
6024 prop_assert_eq!(doc.signals().len(), count);
6025 }
6026
6027 #[test]
6029 fn create_track_valid(signal_count in 1usize..5) {
6030 let mut doc = GroundedDocument::new("test", "test");
6031 let mut signal_ids = Vec::new();
6032 for i in 0..signal_count {
6033 let id = doc.add_signal(Signal::new(
6034 0,
6035 Location::text(i * 10, i * 10 + 5),
6036 "entity",
6037 "Type",
6038 0.9,
6039 ));
6040 signal_ids.push(id);
6041 }
6042
6043 let track_id = doc.create_track_from_signals("canonical", &signal_ids);
6044 prop_assert!(track_id.is_some());
6045
6046 let track = doc.get_track(track_id.unwrap());
6047 prop_assert!(track.is_some());
6048 prop_assert_eq!(track.unwrap().len(), signal_count);
6049 }
6050
6051 #[test]
6053 fn create_track_empty_returns_none(_dummy in 0..1) {
6054 let mut doc = GroundedDocument::new("test", "test");
6055 let track_id = doc.create_track_from_signals("canonical", &[]);
6056 prop_assert!(track_id.is_none());
6057 }
6058 }
6059
6060 proptest! {
6065 #[test]
6067 fn signals_in_range_within_bounds(
6068 range_start in 0usize..100,
6069 range_len in 10usize..50,
6070 ) {
6071 let range_end = range_start + range_len;
6072 let mut doc = GroundedDocument::new("test", "x".repeat(200));
6073
6074 doc.add_signal(Signal::new(0, Location::text(range_start + 2, range_start + 5), "inside", "T", 0.9));
6076 doc.add_signal(Signal::new(0, Location::text(0, 5), "before", "T", 0.9));
6077 doc.add_signal(Signal::new(0, Location::text(190, 195), "after", "T", 0.9));
6078
6079 let in_range = doc.signals_in_range(range_start, range_end);
6080
6081 for signal in &in_range {
6082 if let Some((start, end)) = signal.location.text_offsets() {
6083 prop_assert!(start >= range_start, "Signal start {} < range start {}", start, range_start);
6084 prop_assert!(end <= range_end, "Signal end {} > range end {}", end, range_end);
6085 }
6086 }
6087 }
6088
6089 #[test]
6091 fn overlapping_signals_symmetric(
6092 start1 in 10usize..50,
6093 len1 in 5usize..20,
6094 start2 in 10usize..50,
6095 len2 in 5usize..20,
6096 ) {
6097 let mut doc = GroundedDocument::new("test", "x".repeat(100));
6098
6099 let loc1 = Location::text(start1, start1 + len1);
6100 let loc2 = Location::text(start2, start2 + len2);
6101
6102 doc.add_signal(Signal::new(0, loc1.clone(), "A", "T", 0.9));
6103 doc.add_signal(Signal::new(0, loc2.clone(), "B", "T", 0.9));
6104
6105 let overlaps_loc1 = doc.overlapping_signals(&loc1);
6106 let overlaps_loc2 = doc.overlapping_signals(&loc2);
6107
6108 if loc1.overlaps(&loc2) {
6110 prop_assert!(overlaps_loc1.len() >= 2, "Should find both when overlapping");
6111 prop_assert!(overlaps_loc2.len() >= 2, "Should find both when overlapping");
6112 }
6113 }
6114 }
6115
6116 proptest! {
6121 #[test]
6123 fn modality_counts_sum_to_total(
6124 symbolic_count in 0usize..5,
6125 iconic_count in 0usize..5,
6126 ) {
6127 let mut doc = GroundedDocument::new("test", "test");
6128
6129 for i in 0..symbolic_count {
6131 let mut signal = Signal::new(
6132 0,
6133 Location::text(i * 10, i * 10 + 5),
6134 "entity",
6135 "Type",
6136 0.9,
6137 );
6138 signal.modality = Modality::Symbolic;
6139 doc.add_signal(signal);
6140 }
6141
6142 for i in 0..iconic_count {
6144 let mut signal = Signal::new(
6145 0,
6146 Location::bbox(i as f32 * 0.1, 0.0, 0.05, 0.05),
6147 "entity",
6148 "Type",
6149 0.9,
6150 );
6151 signal.modality = Modality::Iconic;
6152 doc.add_signal(signal);
6153 }
6154
6155 let stats = doc.stats();
6156 prop_assert_eq!(
6157 stats.symbolic_count + stats.iconic_count + stats.hybrid_count,
6158 stats.signal_count,
6159 "Modality counts should sum to total"
6160 );
6161 }
6162 }
6163
6164 proptest! {
6169 #[test]
6171 fn from_text_always_valid(
6172 text in "[a-zA-Z ]{20,100}",
6173 surface_start in 0usize..15,
6174 surface_len in 1usize..8,
6175 ) {
6176 let text_char_len = text.chars().count();
6177 let surface_end = (surface_start + surface_len).min(text_char_len);
6178 let surface_start = surface_start.min(surface_end.saturating_sub(1));
6179
6180 if surface_start < surface_end && surface_end <= text_char_len {
6181 let surface: String = text.chars()
6182 .skip(surface_start)
6183 .take(surface_end - surface_start)
6184 .collect();
6185
6186 if !surface.is_empty() {
6187 if let Some(signal) = Signal::<Location>::from_text(&text, &surface, "Test", 0.9) {
6189 prop_assert!(
6191 signal.validate_against(&text).is_none(),
6192 "Signal created via from_text must be valid"
6193 );
6194 }
6195 }
6196 }
6197 }
6198
6199 #[test]
6201 fn validated_add_rejects_invalid(
6202 text in "[a-z]{10,50}",
6203 wrong_surface in "[A-Z]{3,10}",
6204 ) {
6205 let mut doc = GroundedDocument::new("test", &text);
6206
6207 let signal = Signal::new(
6209 0,
6210 Location::text(0, wrong_surface.chars().count().min(text.chars().count())),
6211 wrong_surface.clone(),
6212 "Test",
6213 0.9,
6214 );
6215
6216 let expected: String = text.chars().take(wrong_surface.chars().count()).collect();
6219 if expected != wrong_surface {
6220 let result = doc.add_signal_validated(signal);
6221 prop_assert!(result.is_err(), "Should reject signal with mismatched surface");
6222 }
6223 }
6224
6225 #[test]
6227 fn round_trip_signal_from_text(
6228 prefix in "[a-z]{5,20}",
6229 entity in "[A-Z][a-z]{3,10}",
6230 suffix in "[a-z]{5,20}",
6231 ) {
6232 let text = format!("{} {} {}", prefix, entity, suffix);
6233 let mut doc = GroundedDocument::new("test", &text);
6234
6235 let id = doc.add_signal_from_text(&entity, "Entity", 0.9);
6236 prop_assert!(id.is_some(), "Should find entity in text");
6237
6238 let signal = doc.signals().iter().find(|s| s.id == id.unwrap());
6239 prop_assert!(signal.is_some(), "Should retrieve added signal");
6240
6241 let signal = signal.unwrap();
6242 prop_assert_eq!(signal.surface(), entity.as_str(), "Surface should match");
6243
6244 prop_assert!(
6246 doc.is_valid(),
6247 "Document should be valid after from_text add"
6248 );
6249 }
6250
6251 #[test]
6253 fn nth_occurrence_finds_correct(
6254 entity in "[A-Z][a-z]{2,5}",
6255 sep in " [a-z]+ ",
6256 ) {
6257 let text = format!("{}{}{}{}{}", entity, sep, entity, sep, entity);
6259 let mut doc = GroundedDocument::new("test", &text);
6260
6261 for n in 0..3 {
6263 let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, n);
6264 prop_assert!(id.is_some(), "Should find occurrence {}", n);
6265 }
6266
6267 let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, 3);
6269 prop_assert!(id.is_none(), "Should NOT find 4th occurrence");
6270
6271 prop_assert!(doc.is_valid(), "All signals should be valid");
6273
6274 let offsets: Vec<_> = doc.signals()
6276 .iter()
6277 .filter_map(|s| s.text_offsets())
6278 .collect();
6279 let unique: std::collections::HashSet<_> = offsets.iter().collect();
6280 prop_assert_eq!(offsets.len(), unique.len(), "Each occurrence should have distinct offset");
6281 }
6282 }
6283
/// Statistics for a two-mention track: chain length, surface variation,
/// spread, and min/max/mean confidence.
#[test]
fn test_track_stats_basic() {
    let text = "John met Mary. He said hello. John left.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    // Two mentions of "John" with confidences 0.95 and 0.90.
    let first_mention = doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.95));
    let second_mention =
        doc.add_signal(Signal::new(0, Location::text(30, 34), "John", "Person", 0.90));

    let track_id = doc.add_track(Track::new(0, String::from("John")));
    doc.add_signal_to_track(first_mention, track_id, 0);
    doc.add_signal_to_track(second_mention, track_id, 1);

    let stats = doc.get_track(track_id).unwrap().compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 2, "Two mentions");
    assert_eq!(stats.variation_count, 1, "One unique surface form");
    assert!(stats.spread > 0, "Spread should be positive");
    assert!(stats.relative_spread > 0.0 && stats.relative_spread < 1.0);
    assert!((stats.min_confidence - 0.90).abs() < 0.01);
    assert!((stats.max_confidence - 0.95).abs() < 0.01);
    assert!((stats.mean_confidence - 0.925).abs() < 0.01);
}
6321
/// Statistics for a single-mention track: zero spread, equal first and last
/// positions, and matching min/max confidence.
#[test]
fn test_track_stats_singleton() {
    let text = "Paris is beautiful.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    let only_mention =
        doc.add_signal(Signal::new(0, Location::text(0, 5), "Paris", "Location", 0.88));
    let track_id = doc.add_track(Track::new(0, String::from("Paris")));
    doc.add_signal_to_track(only_mention, track_id, 0);

    let stats = doc.get_track(track_id).unwrap().compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 1);
    assert_eq!(stats.spread, 0, "Singleton has zero spread");
    assert_eq!(stats.first_position, stats.last_position);
    assert!((stats.min_confidence - stats.max_confidence).abs() < 0.001);
}
6346}