1use crate::offset::TextSpan;
38use serde::{Deserialize, Serialize};
39
/// Category assigned to the text found inside a pair of parentheses.
///
/// A freshly constructed parenthetical starts as [`ParentheticalType::Unknown`],
/// which is also the `Default` value.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum ParentheticalType {
    /// Upper-case short form of the antecedent, e.g. `World Health Organization (WHO)`.
    Abbreviation,
    /// Short upper-case symbol following a company name, e.g. `Apple Inc. (AAPL)`.
    Ticker,
    /// Short, capitalized alternative name for the antecedent.
    Alias,
    /// A year or year range attached to the antecedent, e.g. `(1769-1821)`.
    TemporalBounds,
    /// The antecedent or the content contains non-ASCII text, e.g. `北京 (Beijing)`.
    Translation,
    /// Content that matched no more specific category.
    Clarification,
    /// Presumably a pointer to another part of the document.
    /// NOTE(review): no code visible next to this enum assigns this variant; confirm intended use.
    CrossReference,
    /// Bibliographic-looking content, e.g. containing `et al` or a year.
    Citation,
    /// A job title or office held by the antecedent, e.g. `Tim Cook (CEO of Apple)`.
    Role,
    /// A country, state, or region qualifying the antecedent, e.g. `(UK)` or `(Texas)`.
    LocationQualifier,
    /// Presumably a quantity with a unit.
    /// NOTE(review): no code visible next to this enum assigns this variant; confirm intended use.
    Measurement,
    /// Not yet classified.
    #[default]
    Unknown,
}
69
/// One `antecedent (content)` construct found in a text.
///
/// The offset fields are plain `usize` values supplied by whoever builds the
/// value. The extractor in this file fills them from `TextSpan::char_start` /
/// `char_end`, i.e. as character (not byte) offsets into the source text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Parenthetical {
    /// The phrase immediately preceding the opening parenthesis.
    pub antecedent: String,
    /// The text between the parentheses, without the parentheses themselves.
    pub content: String,
    /// Offset of the start of the whole construct (the first character of the antecedent).
    pub start: usize,
    /// Offset just past the closing parenthesis.
    pub end: usize,
    /// Offset of the first character of `content`.
    pub content_start: usize,
    /// Offset just past the last character of `content`.
    pub content_end: usize,
    /// Category assigned to `content`; `Unknown` until classified.
    pub parenthetical_type: ParentheticalType,
    /// Score attached to the classification; `Parenthetical::new` sets it to `0.5`.
    pub confidence: f64,
    /// Whether `content` may be used as another name for `antecedent`.
    pub is_alias: bool,
}
92
93impl Parenthetical {
94 pub fn new(
96 antecedent: &str,
97 content: &str,
98 start: usize,
99 end: usize,
100 content_start: usize,
101 content_end: usize,
102 ) -> Self {
103 Self {
104 antecedent: antecedent.to_string(),
105 content: content.to_string(),
106 start,
107 end,
108 content_start,
109 content_end,
110 parenthetical_type: ParentheticalType::Unknown,
111 confidence: 0.5,
112 is_alias: false,
113 }
114 }
115
116 pub fn with_type(mut self, ptype: ParentheticalType) -> Self {
118 self.parenthetical_type = ptype;
119 self
120 }
121
122 pub fn is_abbreviation(&self) -> bool {
124 matches!(self.parenthetical_type, ParentheticalType::Abbreviation)
125 }
126
127 pub fn is_ticker(&self) -> bool {
129 matches!(self.parenthetical_type, ParentheticalType::Ticker)
130 }
131
132 pub fn is_temporal(&self) -> bool {
134 matches!(self.parenthetical_type, ParentheticalType::TemporalBounds)
135 }
136
137 pub fn get_alias(&self) -> Option<&str> {
142 if self.is_alias {
143 Some(&self.content)
144 } else {
145 None
146 }
147 }
148}
149
150#[derive(Debug, Clone, Default)]
152pub struct ParentheticalExtractor {
153 min_antecedent_len: usize,
155 max_content_len: usize,
157}
158
159impl ParentheticalExtractor {
160 pub fn new() -> Self {
162 Self {
163 min_antecedent_len: 2,
164 max_content_len: 100,
165 }
166 }
167
168 pub fn with_min_antecedent(mut self, len: usize) -> Self {
170 self.min_antecedent_len = len;
171 self
172 }
173
174 pub fn extract(&self, text: &str) -> Vec<Parenthetical> {
176 let mut results = Vec::new();
177 let chars: Vec<(usize, char)> = text.char_indices().collect();
178 let mut i = 0;
179
180 while i < chars.len() {
181 if chars[i].1 == '(' {
182 let open_idx = chars[i].0;
183
184 let mut depth = 1;
186 let mut j = i + 1;
187 while j < chars.len() && depth > 0 {
188 match chars[j].1 {
189 '(' => depth += 1,
190 ')' => depth -= 1,
191 _ => {}
192 }
193 j += 1;
194 }
195
196 if depth == 0 && j > i + 1 {
197 let close_idx = chars[j - 1].0;
198 let content_start = open_idx + 1;
199 let content_end = close_idx;
200 let content = &text[content_start..content_end];
201
202 if content.chars().count() <= self.max_content_len {
204 let (antecedent, antecedent_start_byte, _antecedent_end_byte) =
206 self.find_antecedent(text, open_idx);
207
208 if antecedent.chars().count() >= self.min_antecedent_len {
209 let start_byte = antecedent_start_byte;
210 let end_byte = close_idx + 1; let span = TextSpan::from_bytes(text, start_byte, end_byte);
212 let content_span =
213 TextSpan::from_bytes(text, content_start, content_end);
214
215 let mut paren = Parenthetical::new(
216 &antecedent,
217 content,
218 span.char_start,
219 span.char_end,
220 content_span.char_start,
221 content_span.char_end,
222 );
223
224 paren = self.classify(paren);
226
227 results.push(paren);
228 }
229 }
230 }
231 i = j;
232 } else {
233 i += 1;
234 }
235 }
236
237 results
238 }
239
240 fn find_antecedent(&self, text: &str, paren_start: usize) -> (String, usize, usize) {
247 if paren_start == 0 {
248 return (String::new(), 0, 0);
249 }
250
251 let before = &text[..paren_start];
253 let trimmed = before.trim_end();
254 let trimmed_end = trimmed.len(); let abbrev_suffixes = [
259 "Inc.", "Corp.", "Ltd.", "LLC.", "Co.", "Ltd", "Dr.", "Mr.", "Mrs.", "Ms.", "Jr.",
260 "Sr.", "Ph.D.", "M.D.", "Prof.", "Rev.", "Gen.", "Col.", "Capt.", "Sgt.", "St.", "Mt.",
261 "Ave.", "Blvd.", "Rd.",
262 ];
263
264 let mut phrase_start = 0;
266 let bytes = trimmed.as_bytes();
267
268 for i in (0..bytes.len()).rev() {
269 let c = bytes[i] as char;
270 if c == '.' || c == ',' || c == ';' || c == ':' || c == '\n' {
271 let suffix = &trimmed[..=i];
273 let is_abbrev = abbrev_suffixes.iter().any(|abbr| suffix.ends_with(abbr));
274
275 if !is_abbrev || c != '.' {
276 phrase_start = i + 1;
277 break;
278 }
279 }
280 }
281
282 let mut antecedent_start = phrase_start;
284 for (rel, c) in trimmed[phrase_start..].char_indices() {
285 if !c.is_whitespace() {
286 antecedent_start = phrase_start + rel;
287 break;
288 }
289 }
290
291 let antecedent = trimmed[antecedent_start..trimmed_end].to_string();
292 (antecedent, antecedent_start, trimmed_end)
293 }
294
295 fn classify(&self, mut paren: Parenthetical) -> Parenthetical {
297 let content = paren.content.trim();
298 let antecedent = paren.antecedent.trim();
299
300 if content.len() <= 5
302 && content.chars().all(|c| c.is_ascii_uppercase())
303 && !content.is_empty()
304 {
305 if antecedent.ends_with("Inc.")
307 || antecedent.ends_with("Corp.")
308 || antecedent.ends_with("Ltd.")
309 || antecedent.ends_with("LLC")
310 || antecedent.ends_with("Company")
311 {
312 paren.parenthetical_type = ParentheticalType::Ticker;
313 paren.is_alias = true;
314 paren.confidence = 0.9;
315 return paren;
316 }
317 }
318
319 if self.is_likely_abbreviation(antecedent, content) {
321 paren.parenthetical_type = ParentheticalType::Abbreviation;
322 paren.is_alias = true;
323 paren.confidence = 0.85;
324 return paren;
325 }
326
327 if self.is_temporal_bounds(content) {
329 paren.parenthetical_type = ParentheticalType::TemporalBounds;
330 paren.confidence = 0.9;
331 return paren;
332 }
333
334 if !content.is_ascii() || !antecedent.is_ascii() {
336 paren.parenthetical_type = ParentheticalType::Translation;
337 paren.is_alias = true;
338 paren.confidence = 0.7;
339 return paren;
340 }
341
342 if self.is_role(content) {
344 paren.parenthetical_type = ParentheticalType::Role;
345 paren.confidence = 0.8;
346 return paren;
347 }
348
349 if self.is_location_qualifier(content) {
351 paren.parenthetical_type = ParentheticalType::LocationQualifier;
352 paren.confidence = 0.75;
353 return paren;
354 }
355
356 if content.starts_with('[')
358 || content.contains("et al")
359 || content.contains("19")
360 || content.contains("20")
361 {
362 paren.parenthetical_type = ParentheticalType::Citation;
363 paren.confidence = 0.7;
364 return paren;
365 }
366
367 if content.split_whitespace().count() <= 3
369 && content
370 .chars()
371 .next()
372 .map(|c| c.is_uppercase())
373 .unwrap_or(false)
374 {
375 paren.parenthetical_type = ParentheticalType::Alias;
376 paren.is_alias = true;
377 paren.confidence = 0.6;
378 return paren;
379 }
380
381 paren.parenthetical_type = ParentheticalType::Clarification;
383 paren.confidence = 0.5;
384 paren
385 }
386
387 fn is_likely_abbreviation(&self, antecedent: &str, content: &str) -> bool {
389 if !content
391 .chars()
392 .all(|c| c.is_uppercase() || c.is_whitespace() || c == '.')
393 {
394 return false;
395 }
396
397 let antecedent_initials: String = antecedent
399 .split_whitespace()
400 .filter_map(|w| w.chars().next())
401 .filter(|c| c.is_uppercase())
402 .collect();
403
404 let content_letters: String = content.chars().filter(|c| c.is_alphabetic()).collect();
405
406 if antecedent_initials == content_letters {
407 return true;
408 }
409
410 content.len() >= 2 && content.len() <= 10
412 }
413
414 fn is_temporal_bounds(&self, content: &str) -> bool {
416 let patterns = [
418 r"^\d{4}\s*[-–—]\s*\d{4}$", r"^\d{4}\s*[-–—]\s*(present|\d{4})?$", r"^b\.\s*\d{4}$", r"^d\.\s*\d{4}$", r"^born\s+\d{4}$", r"^\d{4}s$", ];
425
426 for pattern in &patterns {
427 if let Ok(re) = regex::Regex::new(pattern) {
428 if re.is_match(content) {
429 return true;
430 }
431 }
432 }
433
434 false
435 }
436
437 fn is_role(&self, content: &str) -> bool {
439 let role_indicators = [
440 "CEO",
441 "CFO",
442 "CTO",
443 "COO",
444 "CMO",
445 "President",
446 "Director",
447 "Manager",
448 "Chairman",
449 "Senator",
450 "Governor",
451 "Mayor",
452 "Minister",
453 "Dr.",
454 "Prof.",
455 "Rev.",
456 "founder",
457 "co-founder",
458 "editor",
459 ];
460
461 let lower = content.to_lowercase();
462 role_indicators
463 .iter()
464 .any(|r| lower.contains(&r.to_lowercase()))
465 }
466
467 fn is_location_qualifier(&self, content: &str) -> bool {
469 let qualifiers = [
470 "UK",
471 "US",
472 "USA",
473 "England",
474 "Scotland",
475 "Wales",
476 "Massachusetts",
477 "California",
478 "Texas",
479 "New York",
480 "Ontario",
481 "Quebec",
482 "Bavaria",
483 "Saxony",
484 ];
485
486 qualifiers.iter().any(|q| content.contains(q))
487 }
488}
489
/// A name together with an alternative name for it, as derived from a
/// parenthetical that was flagged as an alias.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AliasPair {
    /// The main name (the parenthetical's antecedent).
    pub primary: String,
    /// The alternative name (the parenthetical's content).
    pub alias: String,
    /// Identifier of the document the pair was found in, if the caller supplied one.
    pub doc_id: Option<String>,
    /// Confidence copied from the parenthetical's classification.
    pub confidence: f64,
    /// The category the parenthetical was classified as.
    pub alias_type: ParentheticalType,
}
506
507impl AliasPair {
508 pub fn from_parenthetical(paren: &Parenthetical, doc_id: Option<&str>) -> Option<Self> {
510 if !paren.is_alias {
511 return None;
512 }
513
514 Some(Self {
515 primary: paren.antecedent.clone(),
516 alias: paren.content.clone(),
517 doc_id: doc_id.map(|s| s.to_string()),
518 confidence: paren.confidence,
519 alias_type: paren.parenthetical_type.clone(),
520 })
521 }
522}
523
524pub fn extract_aliases(text: &str, doc_id: Option<&str>) -> Vec<AliasPair> {
526 let extractor = ParentheticalExtractor::new();
527 let parentheticals = extractor.extract(text);
528
529 parentheticals
530 .iter()
531 .filter_map(|p| AliasPair::from_parenthetical(p, doc_id))
532 .collect()
533}
534
#[cfg(test)]
mod tests {
    use super::*;
    use crate::offset::TextSpan;

    /// Runs a default-configured extractor over `text`.
    fn run(text: &str) -> Vec<Parenthetical> {
        ParentheticalExtractor::new().extract(text)
    }

    /// Runs the extractor and returns the only result, asserting there is exactly one.
    fn single(text: &str) -> Parenthetical {
        let mut found = run(text);
        assert_eq!(found.len(), 1);
        found.remove(0)
    }

    /// An all-caps short form after a long name is an alias-bearing abbreviation.
    #[test]
    fn test_abbreviation_extraction() {
        let paren = single("The World Health Organization (WHO) announced new guidelines.");

        assert_eq!(paren.antecedent, "The World Health Organization");
        assert_eq!(paren.content, "WHO");
        assert_eq!(paren.parenthetical_type, ParentheticalType::Abbreviation);
        assert!(paren.is_alias);
    }

    /// A short all-caps symbol after a company suffix is a ticker.
    #[test]
    fn test_ticker_extraction() {
        let paren = single("Apple Inc. (AAPL) reported strong earnings.");

        assert_eq!(paren.content, "AAPL");
        assert_eq!(paren.parenthetical_type, ParentheticalType::Ticker);
    }

    /// A year range is classified as temporal bounds.
    #[test]
    fn test_temporal_bounds() {
        let paren = single("Napoleon Bonaparte (1769-1821) was Emperor of France.");

        assert_eq!(paren.content, "1769-1821");
        assert_eq!(paren.parenthetical_type, ParentheticalType::TemporalBounds);
    }

    /// A non-ASCII antecedent marks the parenthetical as a translation.
    #[test]
    fn test_translation() {
        let paren = single("北京 (Beijing) is the capital.");

        assert_eq!(paren.parenthetical_type, ParentheticalType::Translation);
    }

    /// Content naming a job title is classified as a role.
    #[test]
    fn test_role_extraction() {
        let paren = single("Tim Cook (CEO of Apple) spoke at the conference.");

        assert_eq!(paren.parenthetical_type, ParentheticalType::Role);
    }

    /// Stored offsets must be character offsets even when the antecedent
    /// contains a multi-byte character.
    #[test]
    fn test_parenthetical_offsets_are_character_offsets_with_unicode_prefix() {
        let text = "Müller (CEO) spoke.";
        let paren = single(text);

        let whole = TextSpan::from_chars(text, paren.start, paren.end).extract(text);
        assert_eq!(whole, "Müller (CEO)");

        let inner =
            TextSpan::from_chars(text, paren.content_start, paren.content_end).extract(text);
        assert_eq!(inner, "CEO");
    }

    /// `extract_aliases` turns an abbreviation into a pair tagged with the document id.
    #[test]
    fn test_alias_pair_extraction() {
        let pairs = extract_aliases("The United Nations (UN) held a meeting.", Some("doc1"));

        assert_eq!(pairs.len(), 1);
        let pair = &pairs[0];
        assert_eq!(pair.primary, "The United Nations");
        assert_eq!(pair.alias, "UN");
        assert_eq!(pair.doc_id, Some("doc1".to_string()));
    }

    /// Two separate parentheticals in one sentence are both reported.
    #[test]
    fn test_multiple_parentheticals() {
        let found = run("Microsoft Corp. (MSFT) and Apple Inc. (AAPL) are tech giants.");

        assert_eq!(found.len(), 2);
    }

    /// A group containing a nested group is reported once, as a whole.
    #[test]
    fn test_nested_parentheses_skipped() {
        let found = run("Complex formula (f(x) = x^2) is quadratic.");

        assert_eq!(found.len(), 1);
    }
}