1use crate::error::{Result, TextError};
29use std::collections::{HashMap, HashSet};
30
/// Coarse-grained discourse relation that may hold between two sentences.
///
/// The `Display` implementation in this file renders each variant as its
/// upper-case name (for example `CONTRAST`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DiscourseRelation {
    /// Signalled by causal cues such as "because" or "due to".
    Cause,
    /// Signalled by result cues such as "therefore" or "as a result".
    Effect,
    /// Signalled by adversative cues such as "however" or "but".
    Contrast,
    /// Signalled by additive cues such as "furthermore" or "in addition".
    Elaboration,
    /// Signalled by ordering cues such as "then" or "meanwhile".
    Temporal,
    /// Signalled by conditional cues such as "if" or "unless".
    Conditional,
    /// Signalled by example cues such as "for example" or "such as".
    Exemplification,
    /// Signalled by concluding cues such as "in summary" or "overall".
    Summary,
    /// Explicit "no relation" marker. `detect_discourse_relation` never
    /// produces this variant; it reports absence with `Option::None` instead.
    None,
}
57
58impl std::fmt::Display for DiscourseRelation {
59 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60 let label = match self {
61 Self::Cause => "CAUSE",
62 Self::Effect => "EFFECT",
63 Self::Contrast => "CONTRAST",
64 Self::Elaboration => "ELABORATION",
65 Self::Temporal => "TEMPORAL",
66 Self::Conditional => "CONDITIONAL",
67 Self::Exemplification => "EXEMPLIFICATION",
68 Self::Summary => "SUMMARY",
69 Self::None => "NONE",
70 };
71 write!(f, "{}", label)
72 }
73}
74
/// Cue phrases grouped by the discourse relation they signal.
///
/// The derived `Default` yields a lexicon in which every list is empty; use
/// `CueLexicon::default_english` for the built-in English phrases.
///
/// The matching code in this file lower-cases the text but compares each cue
/// as stored, so cues should be supplied in lower case.
#[derive(Debug, Clone, Default)]
pub struct CueLexicon {
    /// Phrases signalling `DiscourseRelation::Cause`.
    pub cause: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Effect`.
    pub effect: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Contrast`.
    pub contrast: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Elaboration`.
    pub elaboration: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Temporal`.
    pub temporal: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Conditional`.
    pub conditional: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Exemplification`.
    pub exemplification: Vec<String>,
    /// Phrases signalling `DiscourseRelation::Summary`.
    pub summary: Vec<String>,
}
102
103impl CueLexicon {
104 pub fn default_english() -> Self {
106 let cue = |phrases: &[&str]| {
107 phrases
108 .iter()
109 .map(|s| s.to_lowercase())
110 .collect::<Vec<String>>()
111 };
112
113 Self {
114 cause: cue(&[
115 "because",
116 "since",
117 "as",
118 "due to",
119 "owing to",
120 "given that",
121 "in light of",
122 "for the reason that",
123 "as a result of",
124 ]),
125 effect: cue(&[
126 "therefore",
127 "thus",
128 "hence",
129 "consequently",
130 "as a result",
131 "as a consequence",
132 "so",
133 "accordingly",
134 "for this reason",
135 "it follows that",
136 "this led to",
137 "this caused",
138 ]),
139 contrast: cue(&[
140 "however",
141 "but",
142 "yet",
143 "although",
144 "even though",
145 "while",
146 "whereas",
147 "on the other hand",
148 "in contrast",
149 "nevertheless",
150 "nonetheless",
151 "despite",
152 "in spite of",
153 "conversely",
154 "by contrast",
155 "on the contrary",
156 "that said",
157 "still",
158 "yet",
159 "though",
160 ]),
161 elaboration: cue(&[
162 "furthermore",
163 "moreover",
164 "in addition",
165 "additionally",
166 "also",
167 "likewise",
168 "similarly",
169 "indeed",
170 "in fact",
171 "specifically",
172 "notably",
173 "particularly",
174 "what is more",
175 "besides",
176 "more importantly",
177 ]),
178 temporal: cue(&[
179 "then",
180 "next",
181 "after",
182 "before",
183 "when",
184 "while",
185 "once",
186 "previously",
187 "subsequently",
188 "later",
189 "earlier",
190 "at the same time",
191 "meanwhile",
192 "in the meantime",
193 "afterward",
194 "afterwards",
195 "first",
196 "second",
197 "finally",
198 "initially",
199 ]),
200 conditional: cue(&[
201 "if",
202 "unless",
203 "provided that",
204 "as long as",
205 "given that",
206 "in case",
207 "assuming that",
208 "on condition that",
209 "only if",
210 "whenever",
211 ]),
212 exemplification: cue(&[
213 "for example",
214 "for instance",
215 "such as",
216 "e.g.",
217 "to illustrate",
218 "as an example",
219 "as illustrated by",
220 "consider",
221 "take for example",
222 "as shown by",
223 ]),
224 summary: cue(&[
225 "in summary",
226 "in conclusion",
227 "to summarize",
228 "to summarise",
229 "in brief",
230 "in short",
231 "overall",
232 "to conclude",
233 "in closing",
234 "all in all",
235 "on balance",
236 "in the end",
237 "to sum up",
238 ]),
239 }
240 }
241
242 fn relation_cues(&self) -> impl Iterator<Item = (DiscourseRelation, &[String])> {
244 [
245 (DiscourseRelation::Cause, self.cause.as_slice()),
246 (DiscourseRelation::Effect, self.effect.as_slice()),
247 (DiscourseRelation::Contrast, self.contrast.as_slice()),
248 (DiscourseRelation::Elaboration, self.elaboration.as_slice()),
249 (DiscourseRelation::Temporal, self.temporal.as_slice()),
250 (DiscourseRelation::Conditional, self.conditional.as_slice()),
251 (
252 DiscourseRelation::Exemplification,
253 self.exemplification.as_slice(),
254 ),
255 (DiscourseRelation::Summary, self.summary.as_slice()),
256 ]
257 .into_iter()
258 }
259}
260
/// Reports whether `text_lower`, after leading whitespace is skipped, begins
/// with `cue` as a complete word or phrase.
///
/// The cue counts as complete when it is followed by the end of the text or
/// by a non-alphanumeric character, so "so" matches "so, we left" but not
/// "sometimes". Both arguments are compared as given; no case folding is
/// performed here.
fn starts_with_cue(text_lower: &str, cue: &str) -> bool {
    match text_lower.trim_start().strip_prefix(cue) {
        // An empty remainder has no first character, so it passes as well.
        Some(rest) => !rest.starts_with(char::is_alphanumeric),
        None => false,
    }
}
281
/// Returns the first 80 characters of `text` (fewer if the text is shorter),
/// converted to lower case.
fn leading_window(text: &str) -> String {
    const WINDOW_CHARS: usize = 80;

    // Byte offset of the character just past the window, or the end of the
    // text when it holds no more than `WINDOW_CHARS` characters.
    let cutoff = text
        .char_indices()
        .nth(WINDOW_CHARS)
        .map_or(text.len(), |(idx, _)| idx);
    text[..cutoff].to_lowercase()
}
286
287pub fn detect_discourse_relation(
296 sentence1: &str,
297 sentence2: &str,
298 cue_words: &CueLexicon,
299) -> Option<DiscourseRelation> {
300 let window2 = leading_window(sentence2);
301
302 let mut best: Option<(DiscourseRelation, usize)> = None; for (rel, cues) in cue_words.relation_cues() {
305 for cue in cues {
306 let found = starts_with_cue(&window2, cue);
308 let found = found || window2.contains(cue.as_str());
310
311 if found {
312 let cue_len = cue.len();
313 let is_better = best
314 .as_ref()
315 .map(|(_, prev_len)| cue_len > *prev_len)
316 .unwrap_or(true);
317 if is_better {
318 best = Some((rel.clone(), cue_len));
319 }
320 }
321 }
322 }
323
324 let window1_lower = sentence1.to_lowercase();
326 if best.is_none()
327 && (window1_lower.trim_end_matches('.').ends_with("if") || window1_lower.contains(" if "))
328 {
329 best = Some((DiscourseRelation::Conditional, 2));
330 }
331
332 best.map(|(rel, _)| rel)
333}
334
/// One sentence in a rhetorical-structure tree.
#[derive(Debug, Clone)]
pub struct RstNode {
    /// Position of the sentence in the slice the tree was built from.
    pub sentence_index: usize,
    /// The sentence text.
    pub text: String,
    /// Relation attached to this node, if one was recorded for it.
    /// `RhetoricalStructure::from_sentence_pairs` leaves this as `None` on
    /// the root.
    pub relation_to_parent: Option<DiscourseRelation>,
    /// Child nodes. `RhetoricalStructure::from_sentence_pairs` only populates
    /// this on the root, producing a flat tree.
    pub children: Vec<RstNode>,
}
351
/// Rhetorical structure of a text: a sentence tree plus the raw relation list.
#[derive(Debug, Clone)]
pub struct RhetoricalStructure {
    /// Root node of the tree (the first sentence when built by
    /// `from_sentence_pairs`).
    pub root: RstNode,
    /// Number of sentences the structure was built from.
    pub sentence_count: usize,
    /// Relation triples as supplied at construction time: two sentence
    /// indices and the relation between them.
    pub inter_sentence_relations: Vec<(usize, usize, DiscourseRelation)>,
}
362
363impl RhetoricalStructure {
364 pub fn from_sentence_pairs(
367 sentences: &[String],
368 relations: Vec<(usize, usize, DiscourseRelation)>,
369 ) -> Option<Self> {
370 if sentences.is_empty() {
371 return None;
372 }
373
374 let mut rel_lookup: HashMap<usize, DiscourseRelation> = HashMap::new();
376 for (_, j, rel) in &relations {
377 rel_lookup.insert(*j, rel.clone());
378 }
379
380 let root = RstNode {
383 sentence_index: 0,
384 text: sentences[0].clone(),
385 relation_to_parent: None,
386 children: sentences
387 .iter()
388 .enumerate()
389 .skip(1)
390 .map(|(idx, text)| RstNode {
391 sentence_index: idx,
392 text: text.clone(),
393 relation_to_parent: rel_lookup.get(&idx).cloned(),
394 children: Vec::new(),
395 })
396 .collect(),
397 };
398
399 Some(Self {
400 root,
401 sentence_count: sentences.len(),
402 inter_sentence_relations: relations,
403 })
404 }
405
406 pub fn nodes_dfs(&self) -> Vec<&RstNode> {
408 let mut stack = vec![&self.root];
409 let mut result = Vec::new();
410 while let Some(node) = stack.pop() {
411 result.push(node);
412 for child in node.children.iter().rev() {
413 stack.push(child);
414 }
415 }
416 result
417 }
418}
419
/// Collects the distinct lower-cased words of `sentence` that are at least
/// three characters long.
///
/// Words are maximal runs of alphanumeric characters. Length is measured in
/// characters rather than bytes, so a short non-ASCII word such as "às" is
/// filtered out like any other two-letter word.
fn word_set(sentence: &str) -> HashSet<String> {
    sentence
        .split(|c: char| !c.is_alphanumeric())
        // A third character exists exactly when the word has >= 3 characters.
        .filter(|w| w.chars().nth(2).is_some())
        .map(str::to_lowercase)
        .collect()
}
432
/// Lower-case function words that are excluded from the word sets compared
/// by `lexical_overlap`. Every entry is at least three characters long.
const STOP_WORDS: &[&str] = &[
    "the", "and", "for", "are", "was", "were", "has", "have", "had", "not", "but", "that", "this",
    "with", "from", "they", "will", "been", "its", "their", "there", "what", "also", "into",
    "than", "then", "when", "more", "some", "such", "even", "both", "each", "said", "very", "just",
    "over", "like", "about", "would", "could", "should", "which",
];
440
441fn stop_set() -> HashSet<&'static str> {
442 STOP_WORDS.iter().copied().collect()
443}
444
445fn lexical_overlap(s1: &str, s2: &str) -> f64 {
448 let stops = stop_set();
449 let w1: HashSet<String> = word_set(s1)
450 .into_iter()
451 .filter(|w| !stops.contains(w.as_str()))
452 .collect();
453 let w2: HashSet<String> = word_set(s2)
454 .into_iter()
455 .filter(|w| !stops.contains(w.as_str()))
456 .collect();
457 if w1.is_empty() && w2.is_empty() {
458 return 1.0;
459 }
460 let inter = w1.intersection(&w2).count() as f64;
461 let union = w1.union(&w2).count() as f64;
462 if union == 0.0 {
463 0.0
464 } else {
465 inter / union
466 }
467}
468
469fn cue_density(text: &str, cue_words: &CueLexicon) -> usize {
471 let lower = text.to_lowercase();
472 cue_words
473 .relation_cues()
474 .flat_map(|(_, cues)| cues.iter())
475 .filter(|cue| lower.contains(cue.as_str()))
476 .count()
477}
478
/// Splits `text` into trimmed sentences at `.`, `!` and `?`.
///
/// Rules applied on top of the plain split:
/// - A run of terminators ("...", "?!") stays attached to the sentence it
///   ends instead of producing punctuation-only fragments.
/// - A `.` between two ASCII digits ("3.14") is a decimal point, not a
///   sentence end.
/// - A sentence is only closed once it contains at least one alphanumeric
///   character, so stray leading punctuation joins the following sentence.
///
/// Text left over after the last terminator is returned as a final sentence.
/// Whitespace-only input yields an empty vector.
fn split_sentences(text: &str) -> Vec<String> {
    fn is_terminator(c: char) -> bool {
        matches!(c, '.' | '!' | '?')
    }

    let mut sentences = Vec::new();
    let mut buf = String::new();
    let mut has_content = false;
    let mut prev: Option<char> = None;
    let mut chars = text.chars().peekable();

    while let Some(c) = chars.next() {
        buf.push(c);
        has_content |= c.is_alphanumeric();
        let next = chars.peek().copied();

        let run_continues = matches!(next, Some(n) if is_terminator(n));
        let decimal_point = c == '.'
            && matches!(prev, Some(p) if p.is_ascii_digit())
            && matches!(next, Some(n) if n.is_ascii_digit());

        if is_terminator(c) && has_content && !run_continues && !decimal_point {
            sentences.push(buf.trim().to_string());
            buf.clear();
            has_content = false;
        }
        prev = Some(c);
    }

    let rem = buf.trim();
    if !rem.is_empty() {
        sentences.push(rem.to_string());
    }
    sentences
}
499
500pub fn coherence_score(text: &str) -> f64 {
514 coherence_score_with_lexicon(text, &CueLexicon::default_english())
515}
516
517pub fn coherence_score_with_lexicon(text: &str, cue_words: &CueLexicon) -> f64 {
519 let sents = split_sentences(text);
520 if sents.len() < 2 {
521 return 1.0; }
523
524 let pairs: Vec<(&str, &str)> = sents
525 .windows(2)
526 .map(|w| (w[0].as_str(), w[1].as_str()))
527 .collect();
528
529 let n = pairs.len() as f64;
530
531 let lex_sum: f64 = pairs.iter().map(|(a, b)| lexical_overlap(a, b)).sum();
533 let lex_score = lex_sum / n;
534
535 let cue_count = pairs
537 .iter()
538 .filter(|(_, b)| cue_density(b, cue_words) > 0)
539 .count() as f64;
540 let cue_score = cue_count / n;
541
542 0.6 * lex_score + 0.4 * cue_score
543}
544
545pub struct DiscourseAnalyzer {
552 cue_lexicon: CueLexicon,
553}
554
555impl Default for DiscourseAnalyzer {
556 fn default() -> Self {
557 Self::new()
558 }
559}
560
561impl DiscourseAnalyzer {
562 pub fn new() -> Self {
564 Self {
565 cue_lexicon: CueLexicon::default_english(),
566 }
567 }
568
569 pub fn with_lexicon(mut self, lex: CueLexicon) -> Self {
571 self.cue_lexicon = lex;
572 self
573 }
574
575 pub fn detect_relation(&self, s1: &str, s2: &str) -> Option<DiscourseRelation> {
577 detect_discourse_relation(s1, s2, &self.cue_lexicon)
578 }
579
580 pub fn analyse(&self, text: &str) -> Result<DiscourseAnalysis> {
583 if text.is_empty() {
584 return Err(TextError::InvalidInput(
585 "Input text must not be empty".to_string(),
586 ));
587 }
588
589 let sentences = split_sentences(text);
590 let mut relations: Vec<(usize, usize, DiscourseRelation)> = Vec::new();
591
592 for (i, pair) in sentences.windows(2).enumerate() {
593 let s1 = &pair[0];
594 let s2 = &pair[1];
595 if let Some(rel) = detect_discourse_relation(s1, s2, &self.cue_lexicon) {
596 relations.push((i, i + 1, rel));
597 }
598 }
599
600 let rst = RhetoricalStructure::from_sentence_pairs(&sentences, relations.clone());
601 let score = coherence_score_with_lexicon(text, &self.cue_lexicon);
602
603 Ok(DiscourseAnalysis {
604 sentences,
605 relations,
606 rst,
607 coherence: score,
608 })
609 }
610}
611
612pub struct DiscourseAnalysis {
614 pub sentences: Vec<String>,
616 pub relations: Vec<(usize, usize, DiscourseRelation)>,
618 pub rst: Option<RhetoricalStructure>,
620 pub coherence: f64,
622}
623
/// Unit tests for relation detection, coherence scoring, tree construction
/// and the analyser pipeline.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_contrast() {
        let lex = CueLexicon::default_english();
        let s1 = "The experiment was promising.";
        let s2 = "However, the results were inconclusive.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Contrast));
    }

    #[test]
    fn test_detect_effect() {
        let lex = CueLexicon::default_english();
        let s1 = "The team worked very hard.";
        let s2 = "Therefore, they finished on time.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Effect));
    }

    #[test]
    fn test_detect_cause() {
        let lex = CueLexicon::default_english();
        let s1 = "The project was delayed.";
        let s2 = "Because the supplier did not deliver the parts.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Cause));
    }

    #[test]
    fn test_detect_temporal() {
        let lex = CueLexicon::default_english();
        let s1 = "She completed the analysis.";
        let s2 = "Then she wrote the report.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Temporal));
    }

    #[test]
    fn test_detect_conditional() {
        let lex = CueLexicon::default_english();
        let s1 = "You will succeed.";
        let s2 = "If you follow the plan carefully.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Conditional));
    }

    #[test]
    fn test_detect_elaboration() {
        let lex = CueLexicon::default_english();
        let s1 = "The new policy was announced.";
        let s2 = "Furthermore, it will take effect immediately.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Elaboration));
    }

    #[test]
    fn test_detect_exemplification() {
        let lex = CueLexicon::default_english();
        let s1 = "Many animals live in the rainforest.";
        let s2 = "For example, jaguars and toucans are common there.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Exemplification));
    }

    #[test]
    fn test_detect_summary() {
        let lex = CueLexicon::default_english();
        let s1 = "We reviewed all the evidence.";
        let s2 = "In conclusion, the hypothesis is supported.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Summary));
    }

    #[test]
    fn test_detect_none() {
        let lex = CueLexicon::default_english();
        let s1 = "The cat sat on the mat.";
        let s2 = "The dog ran across the field.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        // Neither sentence contains any cue phrase, not even as a substring.
        assert_eq!(rel, None);
    }

    #[test]
    fn test_coherence_score_coherent() {
        let text = "The researchers conducted an experiment. \
                    Therefore, they published their findings. \
                    Furthermore, the findings were widely cited.";
        let score = coherence_score(text);
        assert!(score > 0.0, "score should be positive: {}", score);
        assert!(score <= 1.0, "score should be <= 1.0: {}", score);
    }

    #[test]
    fn test_coherence_score_incoherent() {
        let text = "The price of gold rose sharply. \
                    Elephants live in Africa. \
                    Quantum mechanics is complex.";
        let score = coherence_score(text);
        assert!(score >= 0.0, "score should be >= 0.0: {}", score);
        assert!(score <= 1.0, "score should be <= 1.0: {}", score);
    }

    #[test]
    fn test_coherence_score_single_sentence() {
        let score = coherence_score("This is a single sentence.");
        assert_eq!(score, 1.0);
    }

    #[test]
    fn test_rst_tree_construction() {
        let sentences = vec![
            "Alice studied hard.".to_string(),
            "Therefore, she passed the exam.".to_string(),
            "However, she felt tired afterward.".to_string(),
        ];
        let relations = vec![
            (0, 1, DiscourseRelation::Effect),
            (1, 2, DiscourseRelation::Contrast),
        ];
        let tree = RhetoricalStructure::from_sentence_pairs(&sentences, relations);
        assert!(tree.is_some());
        let tree = tree.expect("already checked");
        assert_eq!(tree.sentence_count, 3);
        assert_eq!(tree.root.sentence_index, 0);
        assert_eq!(tree.root.children.len(), 2);

        let child_relations: Vec<Option<DiscourseRelation>> = tree
            .root
            .children
            .iter()
            .map(|c| c.relation_to_parent.clone())
            .collect();
        assert!(child_relations.contains(&Some(DiscourseRelation::Effect)));
        assert!(child_relations.contains(&Some(DiscourseRelation::Contrast)));
    }

    #[test]
    fn test_rst_empty_text_returns_none() {
        let tree = RhetoricalStructure::from_sentence_pairs(&[], Vec::new());
        assert!(tree.is_none());
    }

    #[test]
    fn test_analyser_full_pipeline() {
        let analyser = DiscourseAnalyzer::new();
        let text = "The company invested heavily in R&D. \
                    Therefore, its products improved significantly. \
                    However, costs also increased.";
        let analysis = analyser.analyse(text).expect("should succeed");
        assert_eq!(analysis.sentences.len(), 3);
        assert!(!analysis.relations.is_empty());
        assert!(analysis.rst.is_some());
        assert!(analysis.coherence >= 0.0 && analysis.coherence <= 1.0);
    }

    #[test]
    fn test_analyser_empty_input_error() {
        let analyser = DiscourseAnalyzer::new();
        assert!(analyser.analyse("").is_err());
    }

    #[test]
    fn test_dfs_traversal() {
        let sentences = vec!["S1".to_string(), "S2".to_string(), "S3".to_string()];
        let tree =
            RhetoricalStructure::from_sentence_pairs(&sentences, Vec::new()).expect("should build");
        let nodes = tree.nodes_dfs();
        assert_eq!(nodes.len(), 3);
    }

    #[test]
    fn test_custom_lexicon() {
        let mut lex = CueLexicon::default();
        lex.effect.push("voila".to_string());
        let s1 = "We mixed the chemicals.";
        let s2 = "Voila, it worked.";
        let rel = detect_discourse_relation(s1, s2, &lex);
        assert_eq!(rel, Some(DiscourseRelation::Effect));
    }
}