scirs2-text 0.4.1

Text processing module for SciRS2 (scirs2-text)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
//! Advanced NER extractor with static helper methods, RAKE phrase extraction,
//! simple coreference resolution, and SVO relation extraction.
//!
//! This module provides a higher-level API that wraps the lower-level pattern
//! infrastructure in [`super::patterns`] and [`super::entities`].

use super::entities::{Entity, EntityType};
use super::patterns::{
    DATE_PATTERN, EMAIL_PATTERN, MONEY_PATTERN, PERCENTAGE_PATTERN, PHONE_PATTERN, TIME_PATTERN,
    URL_PATTERN,
};
use crate::error::{Result, TextError};
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::{HashMap, HashSet};

// ---------------------------------------------------------------------------
// Number pattern (defined here because patterns.rs has no module-level lazy_static for it)
// ---------------------------------------------------------------------------

lazy_static! {
    // Regex for plain numeric literals. It is written in verbose mode
    // (`(?x)`), so the whitespace and `# ...` comments inside the pattern are
    // ignored by the regex engine and serve as inline documentation.
    //
    // Accepted shape: optional sign, then either a comma-grouped integer
    // (`1,234,567`) or a plain run of digits, then an optional fractional part
    // and an optional exponent. The match must end on a word boundary (`\b`).
    //
    // NOTE(review): there is no boundary assertion at the *start* of the
    // pattern, so digits that directly follow letters (e.g. the `123` in
    // `abc123`) can still match — confirm that this is intended.
    //
    // The `expect` can only fire if the literal pattern below is malformed.
    static ref NUMBER_PATTERN: Regex = Regex::new(
        r"(?x)
        (?:
            [+-]?                       # optional sign
            (?:
                \d{1,3}(?:,\d{3})+      # thousands-separated integer
                | \d+                   # plain integer
            )
            (?:\.\d+)?                  # optional decimal
            (?:[eE][+-]?\d+)?           # optional scientific exponent
        )
        \b"
    )
    .expect("NUMBER_PATTERN is valid");
}

// ---------------------------------------------------------------------------
// CoreferenceCluster (simple span-based)
// ---------------------------------------------------------------------------

/// A cluster of coreferring mentions, anchored to a canonical surface form.
///
/// Clusters compare equal when both the canonical text and the full list of
/// mention spans are equal, which makes them convenient to assert on in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CoreferenceCluster {
    /// The canonical text for this cluster (e.g. the first named mention).
    pub canonical: String,
    /// All `(start_byte, end_byte)` spans that refer to the same entity.
    ///
    /// Offsets are byte offsets into the analysed text; `end_byte` is
    /// exclusive.
    pub mentions: Vec<(usize, usize)>,
}

// ---------------------------------------------------------------------------
// AdvancedNerExtractor
// ---------------------------------------------------------------------------

/// Advanced named-entity extractor backed by regex patterns and an optional
/// custom-pattern registry.
///
/// Unlike [`super::extractors::RuleBasedNER`], this struct provides *static*
/// convenience helpers so callers can extract specific entity types without
/// constructing an instance.
///
/// An instance is only needed when custom patterns should be applied in
/// addition to the built-in ones; the instance stores nothing but those
/// custom patterns.
///
/// # Example
///
/// ```rust
/// use scirs2_text::information_extraction::advanced_ner::AdvancedNerExtractor;
///
/// let emails = AdvancedNerExtractor::extract_emails("Contact: alice@example.com");
/// assert_eq!(emails.len(), 1);
/// assert_eq!(emails[0].text, "alice@example.com");
/// ```
pub struct AdvancedNerExtractor {
    /// Each entry holds an `EntityType` and the compiled regex for it.
    /// Entries are kept in registration order.
    custom_patterns: Vec<(EntityType, Regex)>,
}

impl Default for AdvancedNerExtractor {
    fn default() -> Self {
        Self::new()
    }
}

impl AdvancedNerExtractor {
    // ------------------------------------------------------------------
    // Construction
    // ------------------------------------------------------------------

    /// Create an extractor with the built-in pattern set (email, URL, date,
    /// time, phone, money, percent, number).
    pub fn new() -> Self {
        Self {
            custom_patterns: Vec::new(),
        }
    }

    /// Register an additional custom pattern.  The `pattern` string is
    /// compiled as a [`regex::Regex`] and matched against the input text.
    ///
    /// # Errors
    ///
    /// Returns [`TextError::InvalidInput`] when `pattern` fails to compile.
    pub fn add_pattern(&mut self, entity_type: EntityType, pattern: &str) -> Result<()> {
        let re = Regex::new(pattern)
            .map_err(|e| TextError::InvalidInput(format!("Invalid regex '{}': {}", pattern, e)))?;
        self.custom_patterns.push((entity_type, re));
        Ok(())
    }

    // ------------------------------------------------------------------
    // Instance extraction
    // ------------------------------------------------------------------

    /// Extract all entity types from `text` (built-in + custom patterns).
    pub fn extract(&self, text: &str) -> Vec<Entity> {
        let mut entities = Vec::new();

        // Built-in patterns
        entities.extend(extract_with_pattern(text, &EMAIL_PATTERN, EntityType::Email, 1.0));
        entities.extend(extract_with_pattern(text, &URL_PATTERN, EntityType::Url, 1.0));
        entities.extend(extract_with_pattern(text, &DATE_PATTERN, EntityType::Date, 0.95));
        entities.extend(extract_with_pattern(text, &TIME_PATTERN, EntityType::Time, 0.95));
        entities.extend(extract_with_pattern(text, &PHONE_PATTERN, EntityType::Phone, 0.90));
        entities.extend(extract_with_pattern(text, &MONEY_PATTERN, EntityType::Money, 0.95));
        entities.extend(extract_with_pattern(
            text,
            &PERCENTAGE_PATTERN,
            EntityType::Percentage,
            0.95,
        ));
        entities.extend(extract_with_pattern(text, &NUMBER_PATTERN, EntityType::Custom("number".to_string()), 0.85));

        // Custom patterns
        for (et, re) in &self.custom_patterns {
            entities.extend(extract_with_pattern(text, re, et.clone(), 0.80));
        }

        // Sort by start position and remove overlapping lower-confidence matches
        entities.sort_by_key(|e| e.start);
        dedup_overlapping(entities)
    }

    // ------------------------------------------------------------------
    // Static helpers
    // ------------------------------------------------------------------

    /// Extract all email addresses from `text`.
    pub fn extract_emails(text: &str) -> Vec<Entity> {
        extract_with_pattern(text, &EMAIL_PATTERN, EntityType::Email, 1.0)
    }

    /// Extract all URLs from `text`.
    pub fn extract_urls(text: &str) -> Vec<Entity> {
        extract_with_pattern(text, &URL_PATTERN, EntityType::Url, 1.0)
    }

    /// Extract date expressions (ISO, US, European, spelled-out month) from `text`.
    pub fn extract_dates(text: &str) -> Vec<Entity> {
        extract_with_pattern(text, &DATE_PATTERN, EntityType::Date, 0.95)
    }

    /// Extract number-like tokens (integers, decimals, scientific notation,
    /// currency amounts, percentages) from `text`.
    pub fn extract_numbers(text: &str) -> Vec<Entity> {
        let mut out = Vec::new();
        out.extend(extract_with_pattern(
            text,
            &MONEY_PATTERN,
            EntityType::Money,
            0.95,
        ));
        out.extend(extract_with_pattern(
            text,
            &PERCENTAGE_PATTERN,
            EntityType::Percentage,
            0.95,
        ));
        out.extend(extract_with_pattern(
            text,
            &NUMBER_PATTERN,
            EntityType::Custom("number".to_string()),
            0.85,
        ));
        out.sort_by_key(|e| e.start);
        out
    }
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// Run one regex over `text` and wrap every match in an [`Entity`].
///
/// Each returned entity carries the matched text, the byte offsets of the
/// match, a clone of `entity_type`, and the given `confidence`.  Matches are
/// returned in the order the regex yields them.
fn extract_with_pattern(
    text: &str,
    pattern: &Regex,
    entity_type: EntityType,
    confidence: f64,
) -> Vec<Entity> {
    let mut found = Vec::new();
    for hit in pattern.find_iter(text) {
        found.push(Entity {
            text: hit.as_str().to_string(),
            entity_type: entity_type.clone(),
            start: hit.start(),
            end: hit.end(),
            confidence,
        });
    }
    found
}

/// Remove overlapping entities, keeping the one with higher confidence (or
/// the earlier start when equal; the longer span when the start is equal too).
///
/// Entities are considered in priority order and accepted greedily: a
/// candidate is kept only if its `[start, end)` span does not intersect any
/// span that has already been accepted.  The result is ordered by start
/// offset.
///
/// Two entities that begin at the same offset are always treated as
/// conflicting, even if one of them is an empty match.
fn dedup_overlapping(mut entities: Vec<Entity>) -> Vec<Entity> {
    // Priority: confidence (desc), then start (asc), then end (desc).
    // `total_cmp` gives a genuine total order, which `sort_by` requires.
    entities.sort_by(|a, b| {
        b.confidence
            .total_cmp(&a.confidence)
            .then_with(|| a.start.cmp(&b.start))
            .then_with(|| b.end.cmp(&a.end))
    });

    // Accepted entities keyed by start offset.  Because accepted spans never
    // overlap, only the nearest neighbour on each side has to be inspected.
    let mut accepted: std::collections::BTreeMap<usize, Entity> =
        std::collections::BTreeMap::new();

    for entity in entities {
        let clashes_with_previous = accepted
            .range(..=entity.start)
            .next_back()
            .map_or(false, |(&start, previous)| {
                start == entity.start || previous.end > entity.start
            });
        let clashes_with_next = accepted
            .range(entity.start..)
            .next()
            .map_or(false, |(&start, _)| start < entity.end);

        if !clashes_with_previous && !clashes_with_next {
            accepted.insert(entity.start, entity);
        }
    }

    // BTreeMap iteration order == ascending start offset.
    accepted.into_values().collect()
}

// ---------------------------------------------------------------------------
// RakeExtractor
// ---------------------------------------------------------------------------

/// RAKE (Rapid Automatic Keyword Extraction) phrase-level keyphrase extractor.
///
/// The algorithm breaks each sentence into *candidate phrases* wherever a
/// stop-word occurs, then scores each phrase by summing the
/// `deg(word) / freq(word)` ratio of its words.
///
/// All fields are public, so an extractor can be tuned after construction.
///
/// # Example
///
/// ```rust
/// use scirs2_text::information_extraction::advanced_ner::RakeExtractor;
///
/// let rake = RakeExtractor::new();
/// let keyphrases = rake.extract(
///     "Automatic keyword extraction uses statistical methods. \
///      Keyword extraction is useful for document analysis.",
/// );
/// assert!(!keyphrases.is_empty());
/// ```
pub struct RakeExtractor {
    /// Stop-words used as phrase delimiters.
    ///
    /// Words from the text are lower-cased before being looked up here.
    pub stopwords: HashSet<String>,
    /// Minimum number of words a candidate phrase must contain.
    pub min_phrase_len: usize,
    /// Maximum number of words a candidate phrase may contain.
    pub max_phrase_len: usize,
}

impl Default for RakeExtractor {
    fn default() -> Self {
        Self::new()
    }
}

impl RakeExtractor {
    /// Create a new extractor with the default English stop-word list.
    ///
    /// Phrases of one to four words are reported.
    pub fn new() -> Self {
        Self {
            stopwords: default_stop_words(),
            min_phrase_len: 1,
            max_phrase_len: 4,
        }
    }

    /// Create a new extractor with a custom stop-word list.
    ///
    /// The supplied words are lower-cased, because words from the text are
    /// lower-cased before the stop-word lookup; a capitalised entry could
    /// otherwise never match.
    pub fn with_stopwords(words: Vec<String>) -> Self {
        Self {
            stopwords: words.into_iter().map(|w| w.to_lowercase()).collect(),
            min_phrase_len: 1,
            max_phrase_len: 4,
        }
    }

    /// Extract keyphrases from `text`.
    ///
    /// Returns a list of `(phrase, score)` pairs sorted by score in descending
    /// order (ties are broken alphabetically).  The score equals the sum of
    /// individual word scores where each word score is
    /// `deg(word) / freq(word)` (standard RAKE metric).
    ///
    /// Phrases are returned in normalised form: lower-cased, with the
    /// punctuation surrounding each word removed, and words joined by a
    /// single space.  Each distinct phrase is reported once.
    ///
    /// Phrases whose word count lies outside
    /// `min_phrase_len..=max_phrase_len` are not reported, but their words
    /// still contribute to the frequency and degree statistics.
    pub fn extract(&self, text: &str) -> Vec<(String, f64)> {
        // 1. Split text into candidate phrases and tokenise each one once.
        let candidates: Vec<(String, Vec<String>)> = self
            .extract_candidates(text)
            .into_iter()
            .map(|phrase| {
                let words = tokenize_phrase(&phrase);
                (phrase, words)
            })
            .collect();

        if candidates.is_empty() {
            return Vec::new();
        }

        // 2. Build word frequency and co-degree maps.  The degree of a word
        //    is the summed length of every phrase it occurs in.
        let mut word_freq: HashMap<&str, f64> = HashMap::new();
        let mut word_degree: HashMap<&str, f64> = HashMap::new();

        for (_, words) in &candidates {
            let phrase_len = words.len() as f64;
            for word in words {
                *word_freq.entry(word.as_str()).or_insert(0.0) += 1.0;
                *word_degree.entry(word.as_str()).or_insert(0.0) += phrase_len;
            }
        }

        // 3. Score each word: deg(w) / freq(w).
        let word_score: HashMap<&str, f64> = word_freq
            .iter()
            .map(|(&word, &freq)| {
                let degree = word_degree.get(word).copied().unwrap_or(freq);
                (word, degree / freq)
            })
            .collect();

        // 4. Score each candidate phrase as the sum of its word scores.
        //    Identical phrases always produce identical scores, so the first
        //    occurrence is sufficient.
        let mut phrase_scores: HashMap<&str, f64> = HashMap::new();
        for (phrase, words) in &candidates {
            let len = words.len();
            if len < self.min_phrase_len || len > self.max_phrase_len {
                continue;
            }
            let score: f64 = words
                .iter()
                .map(|w| word_score.get(w.as_str()).copied().unwrap_or(0.0))
                .sum();
            phrase_scores.entry(phrase.as_str()).or_insert(score);
        }

        // 5. Sort descending by score, then alphabetically for determinism.
        let mut result: Vec<(String, f64)> = phrase_scores
            .into_iter()
            .map(|(phrase, score)| (phrase.to_owned(), score))
            .collect();
        result.sort_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

        result
    }

    // ------------------------------------------------------------------
    // Internals
    // ------------------------------------------------------------------

    /// Split `text` into normalised candidate phrases.
    ///
    /// Within each sentence (as produced by `split_sentences`) a phrase ends
    /// at every stop-word, at every token without any alphanumeric character,
    /// and at every token that carries leading or trailing punctuation (for
    /// example the comma in `"fox,"`).  Words are stored lower-cased with the
    /// surrounding punctuation stripped.
    fn extract_candidates(&self, text: &str) -> Vec<String> {
        /// Characters that do not belong to a word.
        fn is_delimiter(c: char) -> bool {
            !c.is_alphanumeric()
        }

        /// Move the pending words (if any) into `out` as one phrase.
        fn flush(pending: &mut Vec<String>, out: &mut Vec<String>) {
            if !pending.is_empty() {
                out.push(pending.join(" "));
                pending.clear();
            }
        }

        let mut candidates = Vec::new();

        for sentence in split_sentences(text) {
            let mut pending: Vec<String> = Vec::new();

            for raw in sentence.split_whitespace() {
                let word = raw.trim_matches(is_delimiter).to_lowercase();

                if word.is_empty() || self.stopwords.contains(&word) {
                    flush(&mut pending, &mut candidates);
                    continue;
                }

                // Punctuation in front of the word closes the previous phrase.
                if raw.starts_with(is_delimiter) {
                    flush(&mut pending, &mut candidates);
                }
                pending.push(word);
                // Punctuation behind the word closes the current phrase.
                if raw.ends_with(is_delimiter) {
                    flush(&mut pending, &mut candidates);
                }
            }

            flush(&mut pending, &mut candidates);
        }

        candidates
    }
}

// ---------------------------------------------------------------------------
// SVO relation extraction
// ---------------------------------------------------------------------------

/// A subject-verb-object triple extracted from text using simple heuristics.
///
/// Triples compare equal when all three text fields and the confidence are
/// equal.
#[derive(Debug, Clone, PartialEq)]
pub struct SvoTriple {
    /// The subject noun phrase.
    pub subject: String,
    /// The main verb or predicate.
    pub predicate: String,
    /// The object noun phrase.
    pub object: String,
    /// Confidence score for this extraction.
    pub confidence: f64,
}

/// Rule-based subject-verb-object extractor using shallow-parse heuristics.
///
/// This extractor does not perform full syntactic parsing.  Instead it
/// identifies simple `<NP> <VP> <NP>` patterns in each sentence using a
/// curated verb list and capitalised-noun heuristic.
pub struct SvoRelationExtractor {
    // Indicative transitive verbs used to anchor candidate SVO triples.
    // Each regex is expected to expose the named groups `subj`, `verb` and
    // `obj`, which `extract` reads.
    verb_patterns: Vec<Regex>,
}

impl Default for SvoRelationExtractor {
    fn default() -> Self {
        Self::new()
    }
}

impl SvoRelationExtractor {
    /// Create a new extractor with the default verb list.
    pub fn new() -> Self {
        // Each pattern tries to capture (subject, verb, object) using named groups.
        // We use simple word-boundary patterns for common sentence structures.
        let verb_strs = [
            // "X <verb> Y"
            r"(?P<subj>[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*)\s+(?P<verb>(?:is|are|was|were|will be|has been|have been)\s+(?:\w+\s+)?(?:the\s+)?(?:CEO|founder|leader|head|director|manager|president|chairman|member)\s+of)\s+(?P<obj>[A-Z][A-Za-z]+(?: [A-Za-z&]+)*)",
            r"(?P<subj>[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*)\s+(?P<verb>(?:acquired|merged with|partnered with|invested in|founded|launched|released|announced|created|developed|built|designed|invented|discovered|published|wrote|authored))\s+(?P<obj>[A-Z][A-Za-z]+(?: [A-Za-z&]+)*)",
            r"(?P<subj>[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*)\s+(?P<verb>(?:works? for|works? at|employed by|joined|left|resigned from))\s+(?P<obj>[A-Z][A-Za-z]+(?: [A-Za-z&]+)*)",
        ];

        let verb_patterns = verb_strs
            .iter()
            .filter_map(|s| Regex::new(s).ok())
            .collect();

        Self { verb_patterns }
    }

    /// Extract SVO triples from `text`.
    pub fn extract(&self, text: &str) -> Vec<SvoTriple> {
        let mut triples = Vec::new();
        let sentences = split_sentences(text);

        for sentence in &sentences {
            for pattern in &self.verb_patterns {
                for caps in pattern.captures_iter(sentence) {
                    let subj = caps.name("subj").map(|m| m.as_str().trim().to_string());
                    let verb = caps.name("verb").map(|m| m.as_str().trim().to_string());
                    let obj = caps.name("obj").map(|m| m.as_str().trim().to_string());

                    if let (Some(subject), Some(predicate), Some(object)) = (subj, verb, obj) {
                        triples.push(SvoTriple {
                            subject,
                            predicate,
                            object,
                            confidence: 0.70,
                        });
                    }
                }
            }
        }

        triples
    }
}

// ---------------------------------------------------------------------------
// Simple coreference
// ---------------------------------------------------------------------------

/// Perform simple pronoun-to-entity coreference resolution.
///
/// Returns a list of [`CoreferenceCluster`]s, each containing a canonical
/// name and all byte-offset spans (including the canonical mentions
/// themselves) that co-refer to the same entity.  Clusters are ordered by the
/// position of their first mention, so the output is deterministic.
///
/// The algorithm is entirely heuristic:
/// 1. Collect all runs of capitalised words as candidate antecedents.  A
///    candidate that is itself just a pronoun (for example a sentence-initial
///    "He") is discarded.  Candidates with identical text share one cluster.
/// 2. For each pronoun (he, she, it, they, …), link it to the most recently
///    seen antecedent that starts before the pronoun.  Personal pronouns
///    (he/him/his/she/her/hers) prefer the most recent *multi-word*
///    antecedent when one exists.
///
/// Pronouns that have no preceding antecedent are left unlinked.
///
/// This is intentionally simple — it handles straightforward single-document
/// cases without a trained model.
pub fn simple_coreference(text: &str) -> Vec<CoreferenceCluster> {
    lazy_static! {
        static ref PRONOUN_RE: Regex =
            Regex::new(r"\b(?i)(he|him|his|she|her|hers|it|its|they|them|their|theirs)\b")
                .expect("PRONOUN_RE is valid");
        static ref CAPITALIZED_NOUN_RE: Regex =
            Regex::new(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b").expect("CAPITALIZED_NOUN_RE is valid");
    }

    // True when `candidate` consists of nothing but a single pronoun.
    let is_pronoun = |candidate: &str| {
        PRONOUN_RE
            .find(candidate)
            .map_or(false, |m| m.start() == 0 && m.end() == candidate.len())
    };

    // Antecedent candidates as (start, end, text), in text order.
    let antecedents: Vec<(usize, usize, &str)> = CAPITALIZED_NOUN_RE
        .find_iter(text)
        .filter(|m| !is_pronoun(m.as_str()))
        .map(|m| (m.start(), m.end(), m.as_str()))
        .collect();

    // Seed one cluster per distinct antecedent text.
    let mut clusters: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
    for &(start, end, name) in &antecedents {
        clusters.entry(name).or_default().push((start, end));
    }

    // Antecedents and pronouns are both in text order, so a single forward
    // sweep is enough to know the latest antecedent before each pronoun.
    let mut upcoming = antecedents.iter().peekable();
    let mut latest: Option<&str> = None;
    let mut latest_multiword: Option<&str> = None;

    for pronoun in PRONOUN_RE.find_iter(text) {
        while let Some(&(_, _, name)) = upcoming.next_if(|a| a.0 < pronoun.start()) {
            latest = Some(name);
            // The name regex joins words with `\s+`, so test for any whitespace.
            if name.contains(char::is_whitespace) {
                latest_multiword = Some(name);
            }
        }

        let prefers_person = matches!(
            pronoun.as_str().to_lowercase().as_str(),
            "he" | "him" | "his" | "she" | "her" | "hers"
        );

        // Person pronouns favour a multi-word name (heuristic for full names).
        let target = if prefers_person {
            latest_multiword.or(latest)
        } else {
            latest
        };

        if let Some(mentions) = target.and_then(|name| clusters.get_mut(name)) {
            mentions.push((pronoun.start(), pronoun.end()));
        }
    }

    let mut result: Vec<CoreferenceCluster> = clusters
        .into_iter()
        .map(|(canonical, mut mentions)| {
            mentions.sort_unstable();
            mentions.dedup();
            CoreferenceCluster {
                canonical: canonical.to_string(),
                mentions,
            }
        })
        .collect();

    // HashMap iteration order is arbitrary; order by first mention instead.
    result.sort_by(|a, b| {
        a.mentions
            .first()
            .cmp(&b.mentions.first())
            .then_with(|| a.canonical.cmp(&b.canonical))
    });

    result
}

// ---------------------------------------------------------------------------
// Free helper functions
// ---------------------------------------------------------------------------

/// Split `text` into sentences at `.`, `!`, `?` boundaries.
///
/// A terminator only ends a sentence when it is followed by whitespace or by
/// the end of the text, so tokens such as `3.14` or `example.com` stay
/// intact.  Consecutive terminators (`...`, `?!`) and closing quotes or
/// brackets that directly follow them are kept with the sentence they close.
///
/// Every returned sentence is trimmed and non-empty; text after the last
/// terminator is returned as a final sentence.
fn split_sentences(text: &str) -> Vec<String> {
    fn is_terminator(c: char) -> bool {
        matches!(c, '.' | '!' | '?')
    }

    fn is_closer(c: char) -> bool {
        matches!(c, '"' | '\'' | ')' | ']' | '\u{201D}' | '\u{2019}')
    }

    let mut sentences = Vec::new();
    let mut current = String::new();
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        current.push(ch);
        if !is_terminator(ch) {
            continue;
        }

        // Keep runs such as "..." or "?!" and trailing closers together.
        while let Some(extra) = chars.next_if(|&c| is_terminator(c)) {
            current.push(extra);
        }
        while let Some(extra) = chars.next_if(|&c| is_closer(c)) {
            current.push(extra);
        }

        // Only a terminator followed by whitespace / end of text is a boundary.
        let at_boundary = chars.peek().map_or(true, |c| c.is_whitespace());
        if at_boundary {
            let sentence = current.trim();
            if !sentence.is_empty() {
                sentences.push(sentence.to_string());
            }
            current.clear();
        }
    }

    let tail = current.trim();
    if !tail.is_empty() {
        sentences.push(tail.to_string());
    }
    sentences
}

/// Break a phrase into lower-cased word tokens.
///
/// Every non-alphanumeric character acts as a separator, so punctuation is
/// dropped and hyphenated words are split into their parts.  Empty pieces
/// are skipped.
fn tokenize_phrase(phrase: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for piece in phrase.split(|c: char| !c.is_alphanumeric()) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.to_lowercase());
    }
    tokens
}

/// Build the small built-in English stop-word set used by RAKE.
///
/// All entries are lower-case.
fn default_stop_words() -> HashSet<String> {
    const STOP_WORDS: &[&str] = &[
        // articles, conjunctions
        "a", "an", "the", "and", "or", "but", "nor", "for", "yet", "so",
        // prepositions / adverbial particles
        "in", "on", "at", "to", "of", "with", "by", "from", "as", "into",
        "through", "during", "before", "after", "above", "below", "between",
        "out", "off", "over", "under", "again", "about", "against", "along",
        "around", "up", "down",
        // pronouns, determiners
        "i", "me", "my", "we", "our", "you", "your", "he", "him", "his",
        "she", "her", "it", "its", "they", "them", "their", "what", "which",
        "who", "this", "that", "these", "those",
        // auxiliaries, modals
        "is", "am", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did",
        "will", "would", "shall", "should", "may", "might", "must",
        "can", "could",
        // miscellaneous function words
        "not", "no", "very", "just", "here", "there", "when", "where",
        "why", "how", "all", "each", "every", "both", "few", "more",
        "most", "other", "some", "such", "only", "same", "than", "too",
        "also", "any", "because", "if", "while",
    ];

    let mut set = HashSet::with_capacity(STOP_WORDS.len());
    for &word in STOP_WORDS {
        set.insert(word.to_string());
    }
    set
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_emails_static() {
        let text = "Reach Alice at alice@example.com or bob@work.org.";
        let emails = AdvancedNerExtractor::extract_emails(text);
        assert_eq!(emails.len(), 2);
        assert!(emails.iter().any(|e| e.text == "alice@example.com"));
        assert!(emails.iter().any(|e| e.text == "bob@work.org"));
    }

    #[test]
    fn test_extract_urls_static() {
        let text = "Visit https://www.example.com and http://docs.rs for docs.";
        let urls = AdvancedNerExtractor::extract_urls(text);
        assert!(!urls.is_empty());
        assert!(urls.iter().any(|e| e.text.contains("example.com")));
    }

    #[test]
    fn test_extract_dates_static() {
        let text = "The event is on January 15, 2024 or 2024-01-15.";
        let dates = AdvancedNerExtractor::extract_dates(text);
        assert!(!dates.is_empty());
    }

    #[test]
    fn test_extract_numbers_static() {
        let text = "The price is $29.99 and the discount is 15%.";
        let numbers = AdvancedNerExtractor::extract_numbers(text);
        assert!(!numbers.is_empty());
    }

    #[test]
    fn test_instance_extract() {
        let mut extractor = AdvancedNerExtractor::new();
        extractor
            .add_pattern(EntityType::Custom("ticker".to_string()), r"\b[A-Z]{2,5}\b")
            .expect("pattern is valid");
        let entities = extractor.extract("Contact sales@acme.com or visit https://acme.com for ACME stock.");
        assert!(!entities.is_empty());
    }

    #[test]
    fn test_rake_extractor_basic() {
        let text = "Automatic keyword extraction uses statistical methods to find important phrases. \
                    Statistical keyword extraction is useful for document analysis and information retrieval.";
        let rake = RakeExtractor::new();
        let keyphrases = rake.extract(text);
        assert!(!keyphrases.is_empty());
        // Scores should be positive
        for (_, score) in &keyphrases {
            assert!(*score > 0.0, "score should be positive, got {}", score);
        }
        // Should be sorted descending
        let scores: Vec<f64> = keyphrases.iter().map(|(_, s)| *s).collect();
        for i in 1..scores.len() {
            assert!(
                scores[i - 1] >= scores[i],
                "keyphrases should be sorted descending"
            );
        }
    }

    #[test]
    fn test_rake_extractor_with_stopwords() {
        let stopwords = vec!["the".to_string(), "is".to_string(), "a".to_string()];
        let rake = RakeExtractor::with_stopwords(stopwords);
        let text = "The quick brown fox is a good jumper.";
        let keyphrases = rake.extract(text);
        // Quick and fox should appear as candidates
        assert!(keyphrases.iter().any(|(p, _)| p.to_lowercase().contains("quick")
            || p.to_lowercase().contains("fox")
            || p.to_lowercase().contains("brown")));
    }

    #[test]
    fn test_svo_relation_extractor() {
        let extractor = SvoRelationExtractor::new();
        let text = "Tim Cook is the CEO of Apple. \
                    Satya Nadella founded Microsoft Research. \
                    Google acquired DeepMind.";
        let triples = extractor.extract(text);
        // Should find at least one triple (the previous assertion here was a
        // tautology and could never fail).
        assert!(!triples.is_empty(), "expected at least one SVO triple");
        // The simplest sentence must be recognised exactly.
        assert!(
            triples.iter().any(|t| t.subject == "Google"
                && t.predicate == "acquired"
                && t.object == "DeepMind"),
            "expected (Google, acquired, DeepMind), got {:?}",
            triples
        );
        // All triples should have non-empty fields
        for t in &triples {
            assert!(!t.subject.is_empty());
            assert!(!t.predicate.is_empty());
            assert!(!t.object.is_empty());
        }
    }

    #[test]
    fn test_simple_coreference() {
        let text = "John Smith founded Acme Corp. He became its CEO.";
        let clusters = simple_coreference(text);
        assert!(!clusters.is_empty());
        // At least one cluster should have more than one mention (pronoun linked)
        let has_linked = clusters.iter().any(|c| c.mentions.len() > 1);
        assert!(has_linked, "expected at least one pronoun to be linked");
    }

    #[test]
    fn test_dedup_overlapping() {
        let entities = vec![
            Entity {
                text: "abc".to_string(),
                entity_type: EntityType::Email,
                start: 0,
                end: 3,
                confidence: 0.9,
            },
            Entity {
                text: "ab".to_string(),
                entity_type: EntityType::Custom("x".to_string()),
                start: 0,
                end: 2,
                confidence: 0.5,
            },
        ];
        let result = dedup_overlapping(entities);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].text, "abc");
    }
}