Skip to main content

scirs2_text/
discourse.rs

1//! Discourse analysis: relation detection, RST tree construction, and
2//! text coherence scoring.
3//!
4//! This module provides purely rule-based (no trained weights) discourse
5//! analysis for English text, including:
6//!
7//! - [`DiscourseRelation`] — the semantic relation between two adjacent
8//!   discourse segments (clauses or sentences).
9//! - [`CueLexicon`] — cue-phrase lookup tables that drive relation detection.
10//! - [`detect_discourse_relation`] — classify the relation between two sentences.
11//! - [`RhetoricalStructure`] — a simplified RST (Rhetorical Structure Theory)
12//!   tree over a multi-sentence document.
13//! - [`coherence_score`] — a sentence-to-sentence coherence measure based on
14//!   lexical overlap and discourse connective density.
15//!
16//! # Example
17//!
18//! ```rust
19//! use scirs2_text::discourse::{CueLexicon, detect_discourse_relation, DiscourseRelation};
20//!
21//! let lexicon = CueLexicon::default_english();
22//! let s1 = "The experiment failed.";
23//! let s2 = "However, the team did not give up.";
24//! let rel = detect_discourse_relation(s1, s2, &lexicon);
25//! assert_eq!(rel, Some(DiscourseRelation::Contrast));
26//! ```
27
28use crate::error::{Result, TextError};
29use std::collections::{HashMap, HashSet};
30
31// ---------------------------------------------------------------------------
32// DiscourseRelation
33// ---------------------------------------------------------------------------
34
/// The rhetorical link that may hold between two neighbouring text segments.
///
/// Every variant is described from the point of view of "segment 2", the
/// later of the two clauses or sentences being compared.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum DiscourseRelation {
    /// Segment 2 gives the cause of (or the reason for) segment 1.
    Cause,
    /// Segment 2 states the effect or result of segment 1.
    Effect,
    /// The two segments carry contrasting or opposing information.
    Contrast,
    /// Segment 2 elaborates on, expands, or adds detail to segment 1.
    Elaboration,
    /// Segment 2 is located in time relative to segment 1 (before or after).
    Temporal,
    /// Segment 2 is conditioned on segment 1 (if–then).
    Conditional,
    /// Segment 2 gives an example of a claim made in segment 1.
    Exemplification,
    /// Segment 2 summarises or concludes the discourse so far.
    Summary,
    /// Explicit "no relation detected" marker.
    ///
    /// This is a variant of the enum itself and is distinct from
    /// `Option::None`.
    None,
}
57
58impl std::fmt::Display for DiscourseRelation {
59    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60        let label = match self {
61            Self::Cause => "CAUSE",
62            Self::Effect => "EFFECT",
63            Self::Contrast => "CONTRAST",
64            Self::Elaboration => "ELABORATION",
65            Self::Temporal => "TEMPORAL",
66            Self::Conditional => "CONDITIONAL",
67            Self::Exemplification => "EXEMPLIFICATION",
68            Self::Summary => "SUMMARY",
69            Self::None => "NONE",
70        };
71        write!(f, "{}", label)
72    }
73}
74
75// ---------------------------------------------------------------------------
76// CueLexicon
77// ---------------------------------------------------------------------------
78
/// Per-relation tables of cue phrases used for discourse relation detection.
///
/// Each field holds the phrases that signal one relation. Phrases are
/// expected to be lower-case, because they are compared against lower-cased
/// text. The derived `Default` yields a lexicon with every table empty.
#[derive(Clone, Debug, Default)]
pub struct CueLexicon {
    /// Phrases marking segment 2 as the cause of segment 1.
    pub cause: Vec<String>,
    /// Phrases marking segment 2 as the effect / result of segment 1.
    pub effect: Vec<String>,
    /// Phrases marking a contrast between the segments.
    pub contrast: Vec<String>,
    /// Phrases marking elaboration (additional detail).
    pub elaboration: Vec<String>,
    /// Phrases marking a temporal relation.
    pub temporal: Vec<String>,
    /// Phrases marking a conditional relation.
    pub conditional: Vec<String>,
    /// Phrases marking exemplification.
    pub exemplification: Vec<String>,
    /// Phrases marking a summary or conclusion.
    pub summary: Vec<String>,
}
102
103impl CueLexicon {
104    /// Build the default English cue-phrase lexicon.
105    pub fn default_english() -> Self {
106        let cue = |phrases: &[&str]| {
107            phrases
108                .iter()
109                .map(|s| s.to_lowercase())
110                .collect::<Vec<String>>()
111        };
112
113        Self {
114            cause: cue(&[
115                "because",
116                "since",
117                "as",
118                "due to",
119                "owing to",
120                "given that",
121                "in light of",
122                "for the reason that",
123                "as a result of",
124            ]),
125            effect: cue(&[
126                "therefore",
127                "thus",
128                "hence",
129                "consequently",
130                "as a result",
131                "as a consequence",
132                "so",
133                "accordingly",
134                "for this reason",
135                "it follows that",
136                "this led to",
137                "this caused",
138            ]),
139            contrast: cue(&[
140                "however",
141                "but",
142                "yet",
143                "although",
144                "even though",
145                "while",
146                "whereas",
147                "on the other hand",
148                "in contrast",
149                "nevertheless",
150                "nonetheless",
151                "despite",
152                "in spite of",
153                "conversely",
154                "by contrast",
155                "on the contrary",
156                "that said",
157                "still",
158                "yet",
159                "though",
160            ]),
161            elaboration: cue(&[
162                "furthermore",
163                "moreover",
164                "in addition",
165                "additionally",
166                "also",
167                "likewise",
168                "similarly",
169                "indeed",
170                "in fact",
171                "specifically",
172                "notably",
173                "particularly",
174                "what is more",
175                "besides",
176                "more importantly",
177            ]),
178            temporal: cue(&[
179                "then",
180                "next",
181                "after",
182                "before",
183                "when",
184                "while",
185                "once",
186                "previously",
187                "subsequently",
188                "later",
189                "earlier",
190                "at the same time",
191                "meanwhile",
192                "in the meantime",
193                "afterward",
194                "afterwards",
195                "first",
196                "second",
197                "finally",
198                "initially",
199            ]),
200            conditional: cue(&[
201                "if",
202                "unless",
203                "provided that",
204                "as long as",
205                "given that",
206                "in case",
207                "assuming that",
208                "on condition that",
209                "only if",
210                "whenever",
211            ]),
212            exemplification: cue(&[
213                "for example",
214                "for instance",
215                "such as",
216                "e.g.",
217                "to illustrate",
218                "as an example",
219                "as illustrated by",
220                "consider",
221                "take for example",
222                "as shown by",
223            ]),
224            summary: cue(&[
225                "in summary",
226                "in conclusion",
227                "to summarize",
228                "to summarise",
229                "in brief",
230                "in short",
231                "overall",
232                "to conclude",
233                "in closing",
234                "all in all",
235                "on balance",
236                "in the end",
237                "to sum up",
238            ]),
239        }
240    }
241
242    /// Return an iterator over `(DiscourseRelation, &[String])` pairs.
243    fn relation_cues(&self) -> impl Iterator<Item = (DiscourseRelation, &[String])> {
244        [
245            (DiscourseRelation::Cause, self.cause.as_slice()),
246            (DiscourseRelation::Effect, self.effect.as_slice()),
247            (DiscourseRelation::Contrast, self.contrast.as_slice()),
248            (DiscourseRelation::Elaboration, self.elaboration.as_slice()),
249            (DiscourseRelation::Temporal, self.temporal.as_slice()),
250            (DiscourseRelation::Conditional, self.conditional.as_slice()),
251            (
252                DiscourseRelation::Exemplification,
253                self.exemplification.as_slice(),
254            ),
255            (DiscourseRelation::Summary, self.summary.as_slice()),
256        ]
257        .into_iter()
258    }
259}
260
261// ---------------------------------------------------------------------------
262// Discourse relation detection
263// ---------------------------------------------------------------------------
264
/// Check whether `text_lower` begins with the cue phrase `cue`.
///
/// Leading whitespace in `text_lower` is ignored. The cue must end on a word
/// boundary: it has to be followed by a non-alphanumeric character or by the
/// end of the text, so `"so"` matches `"so, we left"` but not `"some"`.
///
/// No length limit is applied here; a caller that only wants to look at the
/// start of a sentence must pass an already truncated window. Matching is
/// case-sensitive, so both arguments should already be lower-cased.
fn starts_with_cue(text_lower: &str, cue: &str) -> bool {
    text_lower
        .trim_start()
        .strip_prefix(cue)
        .map_or(false, |rest| {
            // Word boundary: end of text or a non-alphanumeric character.
            rest.chars().next().map_or(true, |c| !c.is_alphanumeric())
        })
}
281
/// Return the first 80 characters of `text`, lower-cased, for cue matching.
///
/// The cut is made on `char` boundaries *before* lower-casing, so it never
/// splits a multi-byte character. Texts shorter than the window are returned
/// whole (lower-cased).
fn leading_window(text: &str) -> String {
    /// Number of leading characters inspected for cue phrases.
    const WINDOW_CHARS: usize = 80;

    text.chars()
        .take(WINDOW_CHARS)
        .collect::<String>()
        .to_lowercase()
}
286
287/// Detect the discourse relation between `sentence1` and `sentence2`.
288///
289/// The function scans for cue phrases at the beginning of `sentence2` (the
290/// most reliable position) and, for some relation types, also within the body
291/// of `sentence2`.  The cue that matches the *longest* phrase wins (to prefer
292/// multi-word cues over single-word ones).
293///
294/// Returns `None` if no cue phrases are found.
295pub fn detect_discourse_relation(
296    sentence1: &str,
297    sentence2: &str,
298    cue_words: &CueLexicon,
299) -> Option<DiscourseRelation> {
300    let window2 = leading_window(sentence2);
301
302    let mut best: Option<(DiscourseRelation, usize)> = None; // (relation, cue_length)
303
304    for (rel, cues) in cue_words.relation_cues() {
305        for cue in cues {
306            // Primary check: sentence2 starts with the cue
307            let found = starts_with_cue(&window2, cue);
308            // Secondary check: cue appears in the first 80 chars of sentence2
309            let found = found || window2.contains(cue.as_str());
310
311            if found {
312                let cue_len = cue.len();
313                let is_better = best
314                    .as_ref()
315                    .map(|(_, prev_len)| cue_len > *prev_len)
316                    .unwrap_or(true);
317                if is_better {
318                    best = Some((rel.clone(), cue_len));
319                }
320            }
321        }
322    }
323
324    // Also check if sentence1 ends with a conditional fragment
325    let window1_lower = sentence1.to_lowercase();
326    if best.is_none()
327        && (window1_lower.trim_end_matches('.').ends_with("if") || window1_lower.contains(" if "))
328    {
329        best = Some((DiscourseRelation::Conditional, 2));
330    }
331
332    best.map(|(rel, _)| rel)
333}
334
335// ---------------------------------------------------------------------------
336// RST Tree
337// ---------------------------------------------------------------------------
338
339/// A node in a simplified Rhetorical Structure Theory tree.
340#[derive(Debug, Clone)]
341pub struct RstNode {
342    /// Index of the sentence in the original document (0-based).
343    pub sentence_index: usize,
344    /// The surface text of the sentence.
345    pub text: String,
346    /// The discourse relation from this node's parent to this node.
347    pub relation_to_parent: Option<DiscourseRelation>,
348    /// Child nodes (satellite segments).
349    pub children: Vec<RstNode>,
350}
351
352/// Simplified RST tree over a document.
353#[derive(Debug, Clone)]
354pub struct RhetoricalStructure {
355    /// Root node of the tree.
356    pub root: RstNode,
357    /// Total number of sentences.
358    pub sentence_count: usize,
359    /// Detected discourse relations between adjacent sentences.
360    pub inter_sentence_relations: Vec<(usize, usize, DiscourseRelation)>,
361}
362
363impl RhetoricalStructure {
364    /// Build a flat (chain) RST tree from a sequence of sentences and the
365    /// relations detected between each consecutive pair.
366    pub fn from_sentence_pairs(
367        sentences: &[String],
368        relations: Vec<(usize, usize, DiscourseRelation)>,
369    ) -> Option<Self> {
370        if sentences.is_empty() {
371            return None;
372        }
373
374        // Build a lookup: sentence_idx → relation from its predecessor
375        let mut rel_lookup: HashMap<usize, DiscourseRelation> = HashMap::new();
376        for (_, j, rel) in &relations {
377            rel_lookup.insert(*j, rel.clone());
378        }
379
380        // Root is the first sentence; every other sentence is a direct child
381        // of the root (chain structure — sufficient for the simplified model).
382        let root = RstNode {
383            sentence_index: 0,
384            text: sentences[0].clone(),
385            relation_to_parent: None,
386            children: sentences
387                .iter()
388                .enumerate()
389                .skip(1)
390                .map(|(idx, text)| RstNode {
391                    sentence_index: idx,
392                    text: text.clone(),
393                    relation_to_parent: rel_lookup.get(&idx).cloned(),
394                    children: Vec::new(),
395                })
396                .collect(),
397        };
398
399        Some(Self {
400            root,
401            sentence_count: sentences.len(),
402            inter_sentence_relations: relations,
403        })
404    }
405
406    /// Traverse the tree in depth-first order and return all nodes.
407    pub fn nodes_dfs(&self) -> Vec<&RstNode> {
408        let mut stack = vec![&self.root];
409        let mut result = Vec::new();
410        while let Some(node) = stack.pop() {
411            result.push(node);
412            for child in node.children.iter().rev() {
413                stack.push(child);
414            }
415        }
416        result
417    }
418}
419
420// ---------------------------------------------------------------------------
421// Coherence scoring
422// ---------------------------------------------------------------------------
423
/// Tokenise a sentence into a set of lower-cased words.
///
/// The sentence is split on every non-alphanumeric character, which strips
/// punctuation and whitespace. Tokens shorter than three *characters* are
/// dropped; the length is measured in `char`s rather than bytes, so
/// two-letter words containing multi-byte characters are dropped just like
/// their ASCII counterparts.
fn word_set(sentence: &str) -> HashSet<String> {
    /// Minimum number of characters a token needs to be kept.
    const MIN_WORD_CHARS: usize = 3;

    sentence
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| word.chars().count() >= MIN_WORD_CHARS)
        .map(str::to_lowercase)
        .collect()
}
432
/// Frequent English function words that carry no topical content and are
/// therefore left out when measuring lexical overlap. All entries are
/// lower-case.
const STOP_WORDS: &[&str] = &[
    "the", "and", "for", "are", "was", "were", "has", "have", "had", "not",
    "but", "that", "this", "with", "from", "they", "will", "been", "its", "their",
    "there", "what", "also", "into", "than", "then", "when", "more", "some", "such",
    "even", "both", "each", "said", "very", "just", "over", "like", "about", "would",
    "could", "should", "which",
];
440
441fn stop_set() -> HashSet<&'static str> {
442    STOP_WORDS.iter().copied().collect()
443}
444
445/// Compute the Jaccard similarity between the content-word sets of two
446/// sentences.
447fn lexical_overlap(s1: &str, s2: &str) -> f64 {
448    let stops = stop_set();
449    let w1: HashSet<String> = word_set(s1)
450        .into_iter()
451        .filter(|w| !stops.contains(w.as_str()))
452        .collect();
453    let w2: HashSet<String> = word_set(s2)
454        .into_iter()
455        .filter(|w| !stops.contains(w.as_str()))
456        .collect();
457    if w1.is_empty() && w2.is_empty() {
458        return 1.0;
459    }
460    let inter = w1.intersection(&w2).count() as f64;
461    let union = w1.union(&w2).count() as f64;
462    if union == 0.0 {
463        0.0
464    } else {
465        inter / union
466    }
467}
468
469/// Count the number of known discourse cue phrases that appear in `text`.
470fn cue_density(text: &str, cue_words: &CueLexicon) -> usize {
471    let lower = text.to_lowercase();
472    cue_words
473        .relation_cues()
474        .flat_map(|(_, cues)| cues.iter())
475        .filter(|cue| lower.contains(cue.as_str()))
476        .count()
477}
478
/// Split `text` into sentences on `.`, `?`, `!`.
///
/// Rules applied on top of the plain split:
///
/// - A run of terminators (`...`, `?!`) stays attached to the sentence it
///   ends instead of producing punctuation-only fragments.
/// - A `.` that sits directly between two alphanumeric characters (as in
///   `3.14`) is not treated as a sentence boundary.
/// - Fragments without any alphanumeric character are discarded.
///
/// Each returned sentence is trimmed and keeps its terminating punctuation.
/// Trailing text without a terminator is returned as a final sentence.
fn split_sentences(text: &str) -> Vec<String> {
    fn is_terminator(c: char) -> bool {
        matches!(c, '.' | '!' | '?')
    }

    // Move the buffered fragment into `out` if it contains any word
    // character, then reset the buffer either way.
    fn flush(buf: &mut String, out: &mut Vec<String>) {
        let candidate = buf.trim();
        if candidate.chars().any(char::is_alphanumeric) {
            out.push(candidate.to_string());
        }
        buf.clear();
    }

    let mut sentences = Vec::new();
    let mut buf = String::new();
    let mut prev: Option<char> = None;
    let mut chars = text.chars().peekable();

    while let Some(c) = chars.next() {
        buf.push(c);
        if is_terminator(c) {
            let next = chars.peek().copied();
            // More terminators follow: the sentence ends after the last one.
            let in_run = next.map_or(false, is_terminator);
            // Decimal numbers and similar tokens contain an inner dot.
            let inside_token = c == '.'
                && prev.map_or(false, char::is_alphanumeric)
                && next.map_or(false, char::is_alphanumeric);
            if !in_run && !inside_token {
                flush(&mut buf, &mut sentences);
            }
        }
        prev = Some(c);
    }

    // Whatever is left had no terminator.
    flush(&mut buf, &mut sentences);
    sentences
}
499
500/// Compute a sentence-to-sentence coherence score for `text`.
501///
502/// The score is a value in `[0.0, 1.0]` computed as a weighted average of:
503///
504/// 1. **Lexical continuity** (weight 0.6): the mean Jaccard similarity of
505///    content-word bags between each consecutive sentence pair.
506/// 2. **Discourse cue density** (weight 0.4): the fraction of consecutive
507///    sentence pairs that contain at least one discourse cue phrase in the
508///    second sentence.
509///
510/// A score close to 1.0 indicates a well-connected, coherent text; a score
511/// close to 0.0 indicates sentences that are lexically unrelated and
512/// contain no discourse connectives.
513pub fn coherence_score(text: &str) -> f64 {
514    coherence_score_with_lexicon(text, &CueLexicon::default_english())
515}
516
517/// Like [`coherence_score`] but uses a caller-supplied cue lexicon.
518pub fn coherence_score_with_lexicon(text: &str, cue_words: &CueLexicon) -> f64 {
519    let sents = split_sentences(text);
520    if sents.len() < 2 {
521        return 1.0; // Single sentence is trivially coherent.
522    }
523
524    let pairs: Vec<(&str, &str)> = sents
525        .windows(2)
526        .map(|w| (w[0].as_str(), w[1].as_str()))
527        .collect();
528
529    let n = pairs.len() as f64;
530
531    // Lexical continuity
532    let lex_sum: f64 = pairs.iter().map(|(a, b)| lexical_overlap(a, b)).sum();
533    let lex_score = lex_sum / n;
534
535    // Cue density: fraction of transitions with ≥ 1 cue in the second sentence
536    let cue_count = pairs
537        .iter()
538        .filter(|(_, b)| cue_density(b, cue_words) > 0)
539        .count() as f64;
540    let cue_score = cue_count / n;
541
542    0.6 * lex_score + 0.4 * cue_score
543}
544
545// ---------------------------------------------------------------------------
546// Full discourse analyser
547// ---------------------------------------------------------------------------
548
549/// High-level discourse analyser that wraps detection, tree building, and
550/// coherence scoring.
551pub struct DiscourseAnalyzer {
552    cue_lexicon: CueLexicon,
553}
554
555impl Default for DiscourseAnalyzer {
556    fn default() -> Self {
557        Self::new()
558    }
559}
560
561impl DiscourseAnalyzer {
562    /// Create an analyser with the default English cue lexicon.
563    pub fn new() -> Self {
564        Self {
565            cue_lexicon: CueLexicon::default_english(),
566        }
567    }
568
569    /// Replace the cue lexicon.
570    pub fn with_lexicon(mut self, lex: CueLexicon) -> Self {
571        self.cue_lexicon = lex;
572        self
573    }
574
575    /// Detect the relation between two sentences.
576    pub fn detect_relation(&self, s1: &str, s2: &str) -> Option<DiscourseRelation> {
577        detect_discourse_relation(s1, s2, &self.cue_lexicon)
578    }
579
580    /// Analyse a full text document: split into sentences, detect pairwise
581    /// relations, build an RST tree, and compute a coherence score.
582    pub fn analyse(&self, text: &str) -> Result<DiscourseAnalysis> {
583        if text.is_empty() {
584            return Err(TextError::InvalidInput(
585                "Input text must not be empty".to_string(),
586            ));
587        }
588
589        let sentences = split_sentences(text);
590        let mut relations: Vec<(usize, usize, DiscourseRelation)> = Vec::new();
591
592        for (i, pair) in sentences.windows(2).enumerate() {
593            let s1 = &pair[0];
594            let s2 = &pair[1];
595            if let Some(rel) = detect_discourse_relation(s1, s2, &self.cue_lexicon) {
596                relations.push((i, i + 1, rel));
597            }
598        }
599
600        let rst = RhetoricalStructure::from_sentence_pairs(&sentences, relations.clone());
601        let score = coherence_score_with_lexicon(text, &self.cue_lexicon);
602
603        Ok(DiscourseAnalysis {
604            sentences,
605            relations,
606            rst,
607            coherence: score,
608        })
609    }
610}
611
612/// The result of a full discourse analysis.
613pub struct DiscourseAnalysis {
614    /// The sentences extracted from the input text.
615    pub sentences: Vec<String>,
616    /// Detected pairwise discourse relations `(i, j, relation)`.
617    pub relations: Vec<(usize, usize, DiscourseRelation)>,
618    /// Simplified RST tree (may be `None` if the text has fewer than 2 sentences).
619    pub rst: Option<RhetoricalStructure>,
620    /// Overall coherence score in `[0.0, 1.0]`.
621    pub coherence: f64,
622}
623
624// ---------------------------------------------------------------------------
625// Tests
626// ---------------------------------------------------------------------------
627
#[cfg(test)]
mod tests {
    use super::*;

    /// Detect the relation between two sentences with the default English
    /// lexicon.
    fn detect_default(first: &str, second: &str) -> Option<DiscourseRelation> {
        detect_discourse_relation(first, second, &CueLexicon::default_english())
    }

    #[test]
    fn test_detect_contrast() {
        let found = detect_default(
            "The experiment was promising.",
            "However, the results were inconclusive.",
        );
        assert_eq!(found, Some(DiscourseRelation::Contrast));
    }

    #[test]
    fn test_detect_effect() {
        let found = detect_default(
            "The team worked very hard.",
            "Therefore, they finished on time.",
        );
        assert_eq!(found, Some(DiscourseRelation::Effect));
    }

    #[test]
    fn test_detect_cause() {
        let found = detect_default(
            "The project was delayed.",
            "Because the supplier did not deliver the parts.",
        );
        assert_eq!(found, Some(DiscourseRelation::Cause));
    }

    #[test]
    fn test_detect_temporal() {
        let found = detect_default("She completed the analysis.", "Then she wrote the report.");
        assert_eq!(found, Some(DiscourseRelation::Temporal));
    }

    #[test]
    fn test_detect_conditional() {
        let found = detect_default("You will succeed.", "If you follow the plan carefully.");
        assert_eq!(found, Some(DiscourseRelation::Conditional));
    }

    #[test]
    fn test_detect_elaboration() {
        let found = detect_default(
            "The new policy was announced.",
            "Furthermore, it will take effect immediately.",
        );
        assert_eq!(found, Some(DiscourseRelation::Elaboration));
    }

    #[test]
    fn test_detect_exemplification() {
        let found = detect_default(
            "Many animals live in the rainforest.",
            "For example, jaguars and toucans are common there.",
        );
        assert_eq!(found, Some(DiscourseRelation::Exemplification));
    }

    #[test]
    fn test_detect_summary() {
        let found = detect_default(
            "We reviewed all the evidence.",
            "In conclusion, the hypothesis is supported.",
        );
        assert_eq!(found, Some(DiscourseRelation::Summary));
    }

    #[test]
    fn test_detect_none() {
        // Smoke test only: with no strong cue the result may be `None` or a
        // weak false positive from a single-word cue. The call must simply
        // not panic.
        let _ = detect_default("The cat sat on the mat.", "The dog ran across the field.");
    }

    #[test]
    fn test_coherence_score_coherent() {
        let score = coherence_score(
            "The researchers conducted an experiment. \
             Therefore, they published their findings. \
             Furthermore, the findings were widely cited.",
        );
        // Connected text must land strictly above zero and inside the range.
        assert!(score > 0.0, "score should be positive: {}", score);
        assert!(score <= 1.0, "score should be <= 1.0: {}", score);
    }

    #[test]
    fn test_coherence_score_incoherent() {
        let score = coherence_score(
            "The price of gold rose sharply. \
             Elephants live in Africa. \
             Quantum mechanics is complex.",
        );
        assert!(score <= 1.0);
    }

    #[test]
    fn test_coherence_score_single_sentence() {
        assert_eq!(coherence_score("This is a single sentence."), 1.0);
    }

    #[test]
    fn test_rst_tree_construction() {
        let sentences: Vec<String> = [
            "Alice studied hard.",
            "Therefore, she passed the exam.",
            "However, she felt tired afterward.",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();
        let relations = vec![
            (0, 1, DiscourseRelation::Effect),
            (1, 2, DiscourseRelation::Contrast),
        ];

        let built = RhetoricalStructure::from_sentence_pairs(&sentences, relations);
        assert!(built.is_some());
        let tree = built.expect("already checked");
        assert_eq!(tree.sentence_count, 3);
        assert_eq!(tree.root.sentence_index, 0);
        assert_eq!(tree.root.children.len(), 2);

        // Both relations must have been attached to some child of the root.
        let attached: Vec<Option<DiscourseRelation>> = tree
            .root
            .children
            .iter()
            .map(|child| child.relation_to_parent.clone())
            .collect();
        assert!(attached.contains(&Some(DiscourseRelation::Effect)));
        assert!(attached.contains(&Some(DiscourseRelation::Contrast)));
    }

    #[test]
    fn test_rst_empty_text_returns_none() {
        assert!(RhetoricalStructure::from_sentence_pairs(&[], Vec::new()).is_none());
    }

    #[test]
    fn test_analyser_full_pipeline() {
        let text = "The company invested heavily in R&D. \
                    Therefore, its products improved significantly. \
                    However, costs also increased.";
        let analysis = DiscourseAnalyzer::new()
            .analyse(text)
            .expect("should succeed");
        assert_eq!(analysis.sentences.len(), 3);
        assert!(!analysis.relations.is_empty());
        assert!(analysis.rst.is_some());
        assert!(analysis.coherence >= 0.0 && analysis.coherence <= 1.0);
    }

    #[test]
    fn test_analyser_empty_input_error() {
        assert!(DiscourseAnalyzer::new().analyse("").is_err());
    }

    #[test]
    fn test_dfs_traversal() {
        let sentences: Vec<String> = ["S1", "S2", "S3"].iter().map(|s| s.to_string()).collect();
        let tree =
            RhetoricalStructure::from_sentence_pairs(&sentences, Vec::new()).expect("should build");
        assert_eq!(tree.nodes_dfs().len(), 3);
    }

    #[test]
    fn test_custom_lexicon() {
        // Start from an empty lexicon so only the custom cue can match.
        let mut lex = CueLexicon::default();
        lex.effect.push("voila".to_string());
        let found = detect_discourse_relation("We mixed the chemicals.", "Voila, it worked.", &lex);
        assert_eq!(found, Some(DiscourseRelation::Effect));
    }
}