1use serde::{Deserialize, Serialize};
69
70#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
78pub enum AppositionType {
79 Appositive,
81 AlsoKnownAs,
83 Aka,
85 Nickname,
87 BirthName,
89 FormerlyKnownAs,
91 Renamed,
93 NowKnownAs,
95 ColonExpansion,
97 OrAlternative,
99 RealName,
101 BetterKnownAs,
103 Nee,
105 StyledAs,
107 #[default]
109 Generic,
110}
111
112#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct Apposition {
122 pub primary: String,
124 pub alias: String,
126 pub start: usize,
128 pub end: usize,
130 pub apposition_type: AppositionType,
132 pub confidence: f64,
134 pub primary_is_canonical: bool,
136}
137
138impl Apposition {
139 pub fn new(primary: &str, alias: &str, start: usize, end: usize) -> Self {
143 Self {
144 primary: primary.to_string(),
145 alias: alias.to_string(),
146 start,
147 end,
148 apposition_type: AppositionType::Generic,
149 confidence: 0.7,
150 primary_is_canonical: true,
151 }
152 }
153
154 #[must_use]
156 pub fn with_type(mut self, atype: AppositionType) -> Self {
157 self.apposition_type = atype;
158 self
159 }
160
161 #[must_use]
166 pub fn alias_is_canonical(mut self) -> Self {
167 self.primary_is_canonical = false;
168 self
169 }
170
171 pub fn canonical(&self) -> &str {
178 if self.primary_is_canonical {
179 &self.primary
180 } else {
181 &self.alias
182 }
183 }
184
185 pub fn alternate(&self) -> &str {
187 if self.primary_is_canonical {
188 &self.alias
189 } else {
190 &self.primary
191 }
192 }
193}
194
/// Finds name/alias pairs in text using a fixed set of patterns.
///
/// Each flag switches one family of patterns on or off. Both
/// `AppositionExtractor::new()` and `AppositionExtractor::default()` enable
/// every family.
#[derive(Debug, Clone)]
pub struct AppositionExtractor {
    // Set by the constructors but not consulted by `extract`, hence the allow.
    #[allow(dead_code)]
    extract_appositives: bool,
    /// Enables the "also known as" / "a.k.a." / "better known as" / "real name" patterns.
    extract_aka: bool,
    /// Enables the quoted-nickname patterns.
    extract_nicknames: bool,
    /// Enables the "formerly" / "previously" / "now" / "currently" patterns.
    extract_rename: bool,
    /// Enables the `ABBR: Expansion` pattern.
    extract_colon: bool,
}

impl Default for AppositionExtractor {
    /// Returns an extractor with every pattern family enabled.
    ///
    /// A derived `Default` would set every flag to `false`, producing an
    /// extractor that quietly skips most patterns; this impl keeps
    /// `default()` in agreement with `new()`.
    fn default() -> Self {
        Self {
            extract_appositives: true,
            extract_aka: true,
            extract_nicknames: true,
            extract_rename: true,
            extract_colon: true,
        }
    }
}
221
222impl AppositionExtractor {
223 pub fn new() -> Self {
225 Self {
226 extract_appositives: true,
227 extract_aka: true,
228 extract_nicknames: true,
229 extract_rename: true,
230 extract_colon: true,
231 }
232 }
233
234 pub fn extract(&self, text: &str) -> Vec<Apposition> {
239 let mut results = Vec::new();
240
241 if self.extract_aka {
243 results.extend(self.extract_aka_patterns(text));
244 }
245
246 results.extend(self.extract_born_patterns(text));
248
249 if self.extract_rename {
251 results.extend(self.extract_rename_patterns(text));
252 }
253
254 if self.extract_nicknames {
256 results.extend(self.extract_nickname_patterns(text));
257 }
258
259 if self.extract_colon {
261 results.extend(self.extract_colon_patterns(text));
262 }
263
264 results.extend(self.extract_nee_patterns(text));
266
267 results.sort_by_key(|a| a.start);
269 self.remove_overlaps(results)
270 }
271
272 fn extract_aka_patterns(&self, text: &str) -> Vec<Apposition> {
274 let mut results = Vec::new();
275 let _lower = text.to_lowercase();
276
277 let patterns = [
279 (
280 r"([A-Z][^,]+),\s*also known as\s+([A-Z][^,.]+)",
281 AppositionType::AlsoKnownAs,
282 true,
283 ),
284 (
285 r"([A-Z][^,]+),\s*a\.k\.a\.?\s+([A-Z][^,.]+)",
286 AppositionType::Aka,
287 true,
288 ),
289 (
290 r"([A-Z][^,]+),\s*aka\s+([A-Z][^,.]+)",
291 AppositionType::Aka,
292 true,
293 ),
294 (
295 r"([A-Z][^,]+),\s*better known as\s+([A-Z][^,.]+)",
296 AppositionType::BetterKnownAs,
297 false,
298 ),
299 (
300 r"([A-Z][^,]+),\s*real name\s+([A-Z][^,.]+)",
301 AppositionType::RealName,
302 false,
303 ),
304 ];
305
306 for (pattern, atype, primary_canonical) in &patterns {
307 if let Ok(re) = regex::Regex::new(pattern) {
308 for cap in re.captures_iter(text) {
309 if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
310 let mut appo = Apposition::new(
311 m1.as_str().trim(),
312 m2.as_str().trim(),
313 cap.get(0).expect("regex match should have group 0").start(),
314 cap.get(0).expect("regex match should have group 0").end(),
315 )
316 .with_type(atype.clone());
317
318 if !*primary_canonical {
319 appo = appo.alias_is_canonical();
320 }
321 appo.confidence = 0.9;
322
323 results.push(appo);
324 }
325 }
326 }
327 }
328
329 results
330 }
331
332 fn extract_born_patterns(&self, text: &str) -> Vec<Apposition> {
334 let mut results = Vec::new();
335
336 let patterns = [
338 r"([A-Z][A-Za-z\s]+),\s*born\s+([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
339 r"([A-Z][A-Za-z\s]+)\s*\(born\s+([A-Z][A-Za-z\s]+)\)",
340 ];
341
342 for pattern in &patterns {
343 if let Ok(re) = regex::Regex::new(pattern) {
344 for cap in re.captures_iter(text) {
345 if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
346 let appo = Apposition::new(
347 m1.as_str().trim(),
348 m2.as_str().trim(),
349 cap.get(0).expect("regex match should have group 0").start(),
350 cap.get(0).expect("regex match should have group 0").end(),
351 )
352 .with_type(AppositionType::BirthName)
353 .alias_is_canonical(); results.push(appo);
356 }
357 }
358 }
359 }
360
361 results
362 }
363
364 fn extract_rename_patterns(&self, text: &str) -> Vec<Apposition> {
366 let mut results = Vec::new();
367
368 let patterns = [
369 (
370 r"([A-Z][A-Za-z\s]+),\s*formerly\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
371 AppositionType::FormerlyKnownAs,
372 true,
373 ),
374 (
375 r"([A-Z][A-Za-z\s]+),\s*previously\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
376 AppositionType::FormerlyKnownAs,
377 true,
378 ),
379 (
380 r"([A-Z][A-Za-z\s]+),\s*now\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
381 AppositionType::NowKnownAs,
382 false,
383 ),
384 (
385 r"([A-Z][A-Za-z\s]+),\s*currently\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
386 AppositionType::NowKnownAs,
387 false,
388 ),
389 ];
390
391 for (pattern, atype, primary_canonical) in &patterns {
392 if let Ok(re) = regex::Regex::new(pattern) {
393 for cap in re.captures_iter(text) {
394 if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
395 let mut appo = Apposition::new(
396 m1.as_str().trim(),
397 m2.as_str().trim(),
398 cap.get(0).expect("regex match should have group 0").start(),
399 cap.get(0).expect("regex match should have group 0").end(),
400 )
401 .with_type(atype.clone());
402
403 if !*primary_canonical {
404 appo = appo.alias_is_canonical();
405 }
406 appo.confidence = 0.85;
407
408 results.push(appo);
409 }
410 }
411 }
412 }
413
414 results
415 }
416
417 fn extract_nickname_patterns(&self, text: &str) -> Vec<Apposition> {
419 let mut results = Vec::new();
420
421 let patterns = [
423 r#"([A-Z][a-z]+)\s+'([A-Z][^']+)'\s+([A-Z][a-z]+)"#,
424 r#"([A-Z][a-z]+)\s+"([A-Z][^"]+)"\s+([A-Z][a-z]+)"#,
425 r#"([A-Z][a-z]+)\s+'([A-Z][^']+)'\s+([A-Z][a-z]+)"#, ];
427
428 for pattern in &patterns {
429 if let Ok(re) = regex::Regex::new(pattern) {
430 for cap in re.captures_iter(text) {
431 if let (Some(first), Some(nick), Some(last)) =
432 (cap.get(1), cap.get(2), cap.get(3))
433 {
434 let full_name = format!("{} {}", first.as_str(), last.as_str());
435 let appo = Apposition::new(
436 &full_name,
437 nick.as_str(),
438 cap.get(0).expect("regex match should have group 0").start(),
439 cap.get(0).expect("regex match should have group 0").end(),
440 )
441 .with_type(AppositionType::Nickname);
442
443 results.push(appo);
444 }
445 }
446 }
447 }
448
449 results
450 }
451
452 fn extract_colon_patterns(&self, text: &str) -> Vec<Apposition> {
454 let mut results = Vec::new();
455
456 if let Ok(re) = regex::Regex::new(r"([A-Z]{2,8}):\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)") {
459 for cap in re.captures_iter(text) {
460 if let (Some(abbrev), Some(full)) = (cap.get(1), cap.get(2)) {
461 let full_text = full.as_str().trim();
462 let group_0 = cap.get(0).expect("regex match should have group 0");
463 let appo =
464 Apposition::new(full_text, abbrev.as_str(), group_0.start(), group_0.end())
465 .with_type(AppositionType::ColonExpansion);
466
467 results.push(appo);
468 }
469 }
470 }
471
472 results
473 }
474
475 fn extract_nee_patterns(&self, text: &str) -> Vec<Apposition> {
477 let mut results = Vec::new();
478
479 let patterns = [
480 r"([A-Z][A-Za-z\s]+),\s*née\s+([A-Z][a-z]+)",
481 r"([A-Z][A-Za-z\s]+),\s*nee\s+([A-Z][a-z]+)",
482 r"([A-Z][A-Za-z\s]+)\s*\(née\s+([A-Z][a-z]+)\)",
483 ];
484
485 for pattern in &patterns {
486 if let Ok(re) = regex::Regex::new(pattern) {
487 for cap in re.captures_iter(text) {
488 if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
489 let appo = Apposition::new(
490 m1.as_str().trim(),
491 m2.as_str().trim(),
492 cap.get(0).expect("regex match should have group 0").start(),
493 cap.get(0).expect("regex match should have group 0").end(),
494 )
495 .with_type(AppositionType::Nee);
496
497 results.push(appo);
498 }
499 }
500 }
501 }
502
503 results
504 }
505
506 fn remove_overlaps(&self, mut appos: Vec<Apposition>) -> Vec<Apposition> {
508 appos.sort_by(|a, b| {
509 b.confidence
510 .partial_cmp(&a.confidence)
511 .unwrap_or(std::cmp::Ordering::Equal)
512 });
513
514 let mut result = Vec::new();
515 for appo in appos {
516 let overlaps = result
517 .iter()
518 .any(|a: &Apposition| appo.start < a.end && appo.end > a.start);
519 if !overlaps {
520 result.push(appo);
521 }
522 }
523
524 result.sort_by_key(|a| a.start);
525 result
526 }
527}
528
529pub fn extract_all_aliases(text: &str) -> Vec<(String, String, f64)> {
548 use super::parenthetical::ParentheticalExtractor;
549
550 let mut aliases = Vec::new();
551
552 let paren_ext = ParentheticalExtractor::new();
554 for paren in paren_ext.extract(text) {
555 if paren.is_alias {
556 aliases.push((paren.antecedent, paren.content, paren.confidence));
557 }
558 }
559
560 let appo_ext = AppositionExtractor::new();
562 for appo in appo_ext.extract(text) {
563 aliases.push((
564 appo.canonical().to_string(),
565 appo.alternate().to_string(),
566 appo.confidence,
567 ));
568 }
569
570 aliases
571}
572
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a fully enabled extractor over `text` and returns its only
    /// result, failing the test unless exactly one result was produced.
    fn single_result(text: &str) -> Apposition {
        let mut found = AppositionExtractor::new().extract(text);
        assert_eq!(found.len(), 1);
        found.remove(0)
    }

    #[test]
    fn test_aka_pattern() {
        let hit = single_result("Peter Parker, also known as Spider-Man, saved the city.");

        assert_eq!(hit.primary, "Peter Parker");
        assert_eq!(hit.alias, "Spider-Man");
        assert_eq!(hit.apposition_type, AppositionType::AlsoKnownAs);
    }

    #[test]
    fn test_born_pattern() {
        let hit = single_result("Lady Gaga, born Stefani Germanotta, is a famous singer.");

        assert_eq!(hit.primary, "Lady Gaga");
        assert_eq!(hit.alias, "Stefani Germanotta");
        assert_eq!(hit.apposition_type, AppositionType::BirthName);
        // The birth name is the canonical one for this pattern.
        assert_eq!(hit.canonical(), "Stefani Germanotta");
    }

    #[test]
    fn test_formerly_pattern() {
        let hit = single_result("Mumbai, formerly Bombay, is India's largest city.");

        assert_eq!(hit.primary, "Mumbai");
        assert_eq!(hit.alias, "Bombay");
        assert_eq!(hit.apposition_type, AppositionType::FormerlyKnownAs);
    }

    #[test]
    fn test_nickname_pattern() {
        let hit = single_result("Dwayne 'The Rock' Johnson is an actor.");

        assert_eq!(hit.primary, "Dwayne Johnson");
        assert_eq!(hit.alias, "The Rock");
        assert_eq!(hit.apposition_type, AppositionType::Nickname);
    }

    #[test]
    fn test_colon_pattern() {
        let hit = single_result("AWS: Amazon Web Services provides cloud computing.");

        assert_eq!(hit.alias, "AWS");
        assert_eq!(hit.primary, "Amazon Web Services");
    }

    #[test]
    fn test_nee_pattern() {
        let hit = single_result("Hillary Clinton, née Rodham, was Secretary of State.");

        assert_eq!(hit.alias, "Rodham");
        assert_eq!(hit.apposition_type, AppositionType::Nee);
    }

    #[test]
    fn test_combined_extraction() {
        let aliases =
            extract_all_aliases("Apple Inc. (AAPL), formerly Apple Computer, launched the iPhone.");

        // Only checks that at least one alias is produced.
        assert!(!aliases.is_empty());
    }

    #[test]
    fn test_better_known_as() {
        let hit = single_result("Marshall Mathers, better known as Eminem, is a rapper.");

        assert_eq!(hit.apposition_type, AppositionType::BetterKnownAs);
        // "better known as" makes the alias the canonical name.
        assert_eq!(hit.canonical(), "Eminem");
    }
}