#[cfg(not(feature = "std"))]
use alloc::string::String;
#[cfg(not(feature = "std"))]
use alloc::vec;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
use crate::discourse::ListStyle;
use crate::refine::{Diagnoser, Diagnostic, RefineConstraint, RenderedDocument};
use crate::rst::RstRelation;
use crate::style::StyleProfile;
/// Groups of connectives that `classify` treats as interchangeable.
///
/// The derived `Ord` follows declaration order; the diagnosers in this file use the
/// family as a `BTreeMap` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
enum ConnectorFamily {
/// Openers such as "Additionally,", "Furthermore," and "It also".
Continuation,
/// Openers such as "Similarly," and "Likewise,".
Similarity,
/// Openers such as "Meanwhile,", "However," and "On the other hand,".
Contrast,
}
/// Maps a rendered connective to its connector family and the RST relation it signals.
///
/// Matching is by prefix: `connective` is classified when it starts with one of the
/// known openers of a family. Connectives outside the three families (for example
/// "Because of this,", "As a result,", "Nevertheless,", "Then,", "If this happens,"
/// or "In summary,") are deliberately left unclassified and yield `None`.
fn classify(connective: &str) -> Option<(ConnectorFamily, RstRelation)> {
    const CONTINUATION: &[&str] = &["Additionally,", "Furthermore,", "It also"];
    const SIMILARITY: &[&str] = &["Similarly,", "Likewise,"];
    const CONTRAST: &[&str] = &["Meanwhile,", "However,", "On the other hand,"];

    let opens_with =
        |prefixes: &[&str]| prefixes.iter().any(|prefix| connective.starts_with(*prefix));

    // The prefix sets are disjoint, so at most one branch can match.
    if opens_with(CONTINUATION) {
        Some((ConnectorFamily::Continuation, RstRelation::Elaboration))
    } else if opens_with(SIMILARITY) {
        Some((ConnectorFamily::Similarity, RstRelation::Sequence))
    } else if opens_with(CONTRAST) {
        Some((ConnectorFamily::Contrast, RstRelation::Contrast))
    } else {
        None
    }
}
/// Diagnoser configuration for spotting one connective that opens too many paragraphs.
///
/// The `Diagnoser` implementation in this file counts, for each paragraph, the first
/// opening connective found among its sentences.
#[derive(Debug, Clone)]
pub struct ParagraphOpenerMonotony {
    /// Number of paragraphs a connective must appear in before it is reported.
    pub threshold: usize,
    /// Documents with fewer paragraphs than this are never diagnosed.
    pub min_paragraphs: usize,
}

impl Default for ParagraphOpenerMonotony {
    /// Reports a connective seen in 3 or more paragraphs, for documents of at least
    /// 4 paragraphs.
    fn default() -> Self {
        ParagraphOpenerMonotony {
            min_paragraphs: 4,
            threshold: 3,
        }
    }
}
/// Flags connectives that are reused as the opener of many paragraphs.
impl Diagnoser for ParagraphOpenerMonotony {
    /// Identifier of this diagnoser; the same string is stamped on its diagnostics.
    fn name(&self) -> &'static str {
        "paragraph_opener_monotony"
    }

    /// Emits one diagnostic per connective found in at least `threshold` paragraphs.
    ///
    /// Each paragraph contributes the first `opening_connective` present among its
    /// sentences, if any. Severity is the fraction of all paragraphs that used the
    /// connective, and the attached constraint blacklists it. Documents with fewer
    /// than `min_paragraphs` paragraphs produce nothing. The profile is ignored.
    fn diagnose(
        &self,
        document: &RenderedDocument,
        _profile: Option<&StyleProfile>,
    ) -> Vec<Diagnostic> {
        let paragraph_total = document.paragraphs.len();
        if paragraph_total < self.min_paragraphs {
            return Vec::new();
        }

        // Ordered map, so diagnostics come out sorted by connective text.
        let mut tally = alloc::collections::BTreeMap::<String, usize>::new();
        let openers = document.paragraphs.iter().filter_map(|paragraph| {
            paragraph
                .sentences
                .iter()
                .find_map(|sentence| sentence.opening_connective.as_ref())
        });
        for opener in openers {
            *tally.entry(opener.clone()).or_insert(0) += 1;
        }

        tally
            .into_iter()
            .filter(|(_, hits)| *hits >= self.threshold)
            .map(|(connective, hits)| Diagnostic {
                diagnoser: self.name(),
                severity: hits as f32 / paragraph_total as f32,
                constraints: vec![RefineConstraint::BlacklistConnective(connective)],
            })
            .collect()
    }
}
/// Diagnoser configuration for spotting a list style that dominates recent output.
#[derive(Debug, Clone)]
pub struct ListStyleFatigue {
    /// Number of uses inside the window at which a list style is reported.
    pub threshold: usize,
    /// How many of the most recent list-style emissions are inspected.
    pub window: usize,
    /// Documents with fewer list-style emissions than this are never diagnosed.
    pub min_emissions: usize,
}

impl Default for ListStyleFatigue {
    /// Reports a style used 3 or more times within the last 4 emissions, once the
    /// document has at least 3 emissions.
    fn default() -> Self {
        ListStyleFatigue {
            min_emissions: 3,
            window: 4,
            threshold: 3,
        }
    }
}
/// Flags list styles that are overused among the most recent list emissions.
impl Diagnoser for ListStyleFatigue {
    /// Identifier of this diagnoser; the same string is stamped on its diagnostics.
    fn name(&self) -> &'static str {
        "list_style_fatigue"
    }

    /// Emits one diagnostic per list style used at least `threshold` times within the
    /// last `window` entries of `document.list_styles_used`.
    ///
    /// Severity is the style's share of the inspected window, and the attached
    /// constraint blacklists the style. Nothing is reported when the document has
    /// fewer than `min_emissions` list-style emissions. The profile is ignored.
    fn diagnose(
        &self,
        document: &RenderedDocument,
        _profile: Option<&StyleProfile>,
    ) -> Vec<Diagnostic> {
        let emitted = &document.list_styles_used;
        if emitted.len() < self.min_emissions {
            return Vec::new();
        }

        // The window holds the newest entries and may be shorter than `self.window`.
        let window_len = emitted.len().min(self.window);
        let mut tally = alloc::collections::BTreeMap::<ListStyle, usize>::new();
        for usage in emitted.iter().rev().take(self.window) {
            *tally.entry(usage.list_style).or_insert(0) += 1;
        }

        tally
            .into_iter()
            .filter(|(_, hits)| *hits >= self.threshold)
            .map(|(style, hits)| Diagnostic {
                diagnoser: self.name(),
                severity: hits as f32 / window_len as f32,
                constraints: vec![RefineConstraint::BlacklistListStyle(style)],
            })
            .collect()
    }
}
/// Diagnoser configuration for spotting one RST relation that dominates the
/// classified connectives of a document.
#[derive(Debug, Clone)]
pub struct RstRelationImbalance {
    /// A relation is reported when its share of classified connectives exceeds this.
    pub max_share: f32,
    /// Minimum number of classified connectives before any diagnosis is made.
    pub min_emissions: usize,
}

impl Default for RstRelationImbalance {
    /// Reports a relation holding more than 60% of the classified connectives, once
    /// there are at least 5 of them.
    fn default() -> Self {
        RstRelationImbalance {
            min_emissions: 5,
            max_share: 0.6,
        }
    }
}
/// Flags an RST relation that accounts for too large a share of the connectives.
impl Diagnoser for RstRelationImbalance {
    /// Identifier of this diagnoser; the same string is stamped on its diagnostics.
    fn name(&self) -> &'static str {
        "rst_relation_imbalance"
    }

    /// Emits one diagnostic per relation whose share of classified connectives is
    /// strictly greater than `max_share`.
    ///
    /// Only connectives that `classify` recognises are counted; when fewer than
    /// `min_emissions` of them exist nothing is reported. Severity is the relation's
    /// share, and the constraint blacklists the most frequent connective text of that
    /// relation (on a tie, the lexicographically greatest text). The profile is ignored.
    fn diagnose(
        &self,
        document: &RenderedDocument,
        _profile: Option<&StyleProfile>,
    ) -> Vec<Diagnostic> {
        use alloc::collections::BTreeMap;

        // relation -> (connective text -> occurrences), borrowing text from the document.
        let mut by_relation = BTreeMap::<RstRelation, BTreeMap<&str, usize>>::new();
        let mut classified_total = 0usize;
        for usage in &document.connectives_used {
            if let Some((_, relation)) = classify(&usage.connective) {
                *by_relation
                    .entry(relation)
                    .or_default()
                    .entry(usage.connective.as_str())
                    .or_insert(0) += 1;
                classified_total += 1;
            }
        }
        if classified_total < self.min_emissions {
            return Vec::new();
        }

        let total = classified_total as f32;
        let mut diagnostics = Vec::new();
        for (_relation, occurrences) in by_relation {
            let relation_hits: usize = occurrences.values().sum();
            let share = relation_hits as f32 / total;
            if share > self.max_share {
                // `max_by_key` keeps the last maximum, i.e. the greatest key on a tie.
                let dominant = occurrences
                    .into_iter()
                    .max_by_key(|&(_, hits)| hits)
                    .map(|(text, _)| String::from(text))
                    .unwrap_or_default();
                diagnostics.push(Diagnostic {
                    diagnoser: self.name(),
                    severity: share,
                    constraints: vec![RefineConstraint::BlacklistConnective(dominant)],
                });
            }
        }
        diagnostics
    }
}
/// Diagnoser configuration for spotting documents whose sentence lengths barely vary.
#[derive(Debug, Clone)]
pub struct DocumentScopeRhythm {
    /// The document is reported when the standard deviation of its sentence word
    /// counts falls below this value.
    pub min_stdev: f32,
    /// Documents with fewer sentences than this are never diagnosed.
    pub min_sentences: usize,
}

impl Default for DocumentScopeRhythm {
    /// Requires a standard deviation of at least 2.0 words, for documents of at least
    /// 6 sentences.
    fn default() -> Self {
        DocumentScopeRhythm {
            min_sentences: 6,
            min_stdev: 2.0,
        }
    }
}
/// Flags documents whose sentence lengths are too uniform.
impl Diagnoser for DocumentScopeRhythm {
    /// Identifier of this diagnoser; the same string is stamped on its diagnostics.
    fn name(&self) -> &'static str {
        "document_scope_rhythm"
    }

    /// Emits a single diagnostic when the population standard deviation of sentence
    /// word counts is below `min_stdev`.
    ///
    /// Severity is the relative shortfall `(min_stdev - stdev) / min_stdev`. The
    /// constraint asks for a fixed 0.4 / 0.3 / 0.3 short/medium/long length
    /// distribution with cut-offs at 8 and 18 words. Documents with fewer than
    /// `min_sentences` sentences produce nothing. The profile is ignored.
    fn diagnose(
        &self,
        document: &RenderedDocument,
        _profile: Option<&StyleProfile>,
    ) -> Vec<Diagnostic> {
        let sentence_total = document.sentences.len();
        if sentence_total < self.min_sentences {
            return Vec::new();
        }

        let n = sentence_total as f32;
        let word_counts = || document.sentences.iter().map(|s| s.word_count as f32);
        let mean = word_counts().sum::<f32>() / n;
        let variance = word_counts()
            .map(|len| {
                let offset = len - mean;
                offset * offset
            })
            .sum::<f32>()
            / n;
        let stdev = approx_sqrt(variance);

        // Keep the `<` form: a NaN deviation must not trigger a diagnostic.
        if stdev < self.min_stdev {
            let shortfall = (self.min_stdev - stdev).max(0.0) / self.min_stdev;
            let varied = crate::style::LengthDistribution {
                short: 0.4,
                medium: 0.3,
                long: 0.3,
                short_max_words: 8,
                medium_max_words: 18,
            };
            vec![Diagnostic {
                diagnoser: self.name(),
                severity: shortfall,
                constraints: vec![RefineConstraint::TightenLengthDistribution(varied)],
            }]
        } else {
            Vec::new()
        }
    }
}
/// Diagnoser configuration for spotting a connector family used beyond its budget.
#[derive(Debug, Clone)]
pub struct ConnectiveFamilySaturation {
    /// Number of connectives a single family may contribute before it is reported;
    /// a family is flagged only when it exceeds this count.
    pub max_per_family: usize,
}

impl Default for ConnectiveFamilySaturation {
    /// Allows up to 4 connectives per family.
    fn default() -> Self {
        let max_per_family = 4;
        ConnectiveFamilySaturation { max_per_family }
    }
}
/// Flags connector families that are used more often than their budget allows.
impl Diagnoser for ConnectiveFamilySaturation {
    /// Identifier of this diagnoser; the same string is stamped on its diagnostics.
    fn name(&self) -> &'static str {
        "connective_family_saturation"
    }

    /// Emits one diagnostic per connector family with more than `max_per_family`
    /// connectives in `document.connectives_used`.
    ///
    /// Only connectives recognised by `classify` are counted. Severity is the ratio of
    /// the family's count to the budget, so it is always greater than 1.0 when a
    /// diagnostic fires. The constraint blacklists the family's most frequent
    /// connective text (on a tie, the lexicographically greatest). The profile is
    /// ignored.
    fn diagnose(
        &self,
        document: &RenderedDocument,
        _profile: Option<&StyleProfile>,
    ) -> Vec<Diagnostic> {
        use alloc::collections::BTreeMap;

        // family -> (connective text -> occurrences), borrowing text from the document.
        let mut by_family = BTreeMap::<ConnectorFamily, BTreeMap<&str, usize>>::new();
        for usage in &document.connectives_used {
            if let Some((family, _)) = classify(&usage.connective) {
                *by_family
                    .entry(family)
                    .or_default()
                    .entry(usage.connective.as_str())
                    .or_insert(0) += 1;
            }
        }

        // A budget of zero would otherwise divide by zero and yield an infinite
        // severity; treat it as a budget of one for scoring purposes only.
        let budget = self.max_per_family.max(1) as f32;

        let mut diagnostics = Vec::new();
        for (_family, occurrences) in by_family {
            let used: usize = occurrences.values().sum();
            if used > self.max_per_family {
                // `max_by_key` keeps the last maximum, i.e. the greatest key on a tie.
                let dominant = occurrences
                    .into_iter()
                    .max_by_key(|&(_, hits)| hits)
                    .map(|(text, _)| String::from(text))
                    .unwrap_or_default();
                diagnostics.push(Diagnostic {
                    diagnoser: self.name(),
                    severity: used as f32 / budget,
                    constraints: vec![RefineConstraint::BlacklistConnective(dominant)],
                });
            }
        }
        diagnostics
    }
}
/// Diagnoser configuration for spotting sentence-length distributions that drift
/// away from the active style profile's target.
#[derive(Debug, Clone)]
pub struct ProfileDistributionDrift {
    /// Largest tolerated absolute difference between an observed bucket share and its
    /// target share; anything strictly greater is reported.
    pub delta: f32,
}

impl Default for ProfileDistributionDrift {
    /// Tolerates a per-bucket difference of up to 0.25.
    fn default() -> Self {
        let delta = 0.25;
        ProfileDistributionDrift { delta }
    }
}
/// Flags documents whose short/medium/long sentence mix misses the profile's target.
impl Diagnoser for ProfileDistributionDrift {
    /// Identifier of this diagnoser; the same string is stamped on its diagnostics.
    fn name(&self) -> &'static str {
        "profile_distribution_drift"
    }

    /// Compares the observed sentence-length buckets with the profile's target.
    ///
    /// Sentences are bucketed as short (`<= short_max_words`), medium
    /// (`<= medium_max_words`) or long. The target weights are normalised to sum to 1.
    /// When the largest absolute per-bucket difference is strictly greater than
    /// `delta`, one diagnostic is emitted with that difference as severity and a
    /// constraint carrying a clone of the profile's distribution.
    ///
    /// Nothing is reported without a profile, for a neutral sentence-length
    /// distribution, for a document without sentences, or when the target weights do
    /// not sum to a positive value.
    fn diagnose(
        &self,
        document: &RenderedDocument,
        profile: Option<&StyleProfile>,
    ) -> Vec<Diagnostic> {
        let dist = match profile {
            Some(active) => &active.sentence_length,
            None => return Vec::new(),
        };
        if dist.is_neutral() || document.sentences.is_empty() {
            return Vec::new();
        }

        let short_cap = dist.short_max_words as usize;
        let medium_cap = dist.medium_max_words as usize;
        let mut tally = [0usize; 3];
        for sentence in &document.sentences {
            let slot = match sentence.word_count {
                words if words <= short_cap => 0,
                words if words <= medium_cap => 1,
                _ => 2,
            };
            tally[slot] += 1;
        }

        let mut diagnostics = Vec::new();
        let weight_sum = dist.short + dist.medium + dist.long;
        if weight_sum > 0.0 {
            let sentence_total = document.sentences.len() as f32;
            let observed = [
                tally[0] as f32 / sentence_total,
                tally[1] as f32 / sentence_total,
                tally[2] as f32 / sentence_total,
            ];
            let expected = [
                dist.short / weight_sum,
                dist.medium / weight_sum,
                dist.long / weight_sum,
            ];
            let worst_gap = observed
                .iter()
                .zip(expected.iter())
                .map(|(seen, wanted)| (seen - wanted).abs())
                .fold(0.0_f32, f32::max);
            if worst_gap > self.delta {
                diagnostics.push(Diagnostic {
                    diagnoser: self.name(),
                    severity: worst_gap,
                    constraints: vec![RefineConstraint::TightenLengthDistribution(
                        dist.clone(),
                    )],
                });
            }
        }
        diagnostics
    }
}
/// Approximates the square root of `x` with Newton's method, using only arithmetic
/// so that it does not depend on `std`'s floating-point intrinsics.
///
/// Returns `0.0` for zero and negative inputs, and passes NaN and positive infinity
/// through unchanged. For every other input the iteration runs until the estimate
/// stops decreasing, so the result is accurate to about one unit in the last place
/// instead of depending on the magnitude of `x`.
fn approx_sqrt(x: f32) -> f32 {
    // Far from the root each step only halves the estimate, so the extremes of the
    // `f32` range need roughly 80 steps; 128 leaves headroom.
    const MAX_STEPS: usize = 128;

    if x <= 0.0 {
        return 0.0;
    }
    if !x.is_finite() {
        return x;
    }

    // Start at or above the true root: `x` when x >= 1, otherwise 1.0. From there the
    // Newton iterates decrease monotonically towards the root.
    let mut guess = if x >= 1.0 { x } else { 1.0 };
    for _ in 0..MAX_STEPS {
        let next = 0.5 * (guess + x / guess);
        if next >= guess {
            // No further progress is possible at f32 precision.
            break;
        }
        guess = next;
    }
    guess
}
/// Builds the standard collection of diagnosers, each with its `Default` settings.
///
/// The returned order is: paragraph opener monotony, list style fatigue, RST relation
/// imbalance, document scope rhythm, connective family saturation, profile
/// distribution drift.
pub fn default_set() -> Vec<alloc::sync::Arc<dyn Diagnoser>> {
    let mut set: Vec<alloc::sync::Arc<dyn Diagnoser>> = Vec::with_capacity(6);
    set.push(alloc::sync::Arc::new(ParagraphOpenerMonotony::default()));
    set.push(alloc::sync::Arc::new(ListStyleFatigue::default()));
    set.push(alloc::sync::Arc::new(RstRelationImbalance::default()));
    set.push(alloc::sync::Arc::new(DocumentScopeRhythm::default()));
    set.push(alloc::sync::Arc::new(ConnectiveFamilySaturation::default()));
    set.push(alloc::sync::Arc::new(ProfileDistributionDrift::default()));
    set
}
// Unit tests for the diagnosers in this file. Each fixture builds a `RenderedDocument`
// through `RenderedDocument::from_paragraphs`, which is defined elsewhere.
// NOTE(review): the tests presumably rely on `from_paragraphs` deriving sentences,
// word counts, connectives and list styles from the paragraph text and events —
// confirm against its implementation.
#[cfg(test)]
mod tests {
use super::*;
use crate::refine::{EventMeta, ParagraphRender, RenderedDocument};
// Builds one paragraph per entry of `openers`. Every paragraph has the text
// "Lead in para {i}." followed by a continuation sentence that starts with the opener
// when one is given, and two events: the first without a connective, the second
// carrying the opener (if any).
fn doc_with_paragraph_openers(openers: &[Option<&str>]) -> RenderedDocument {
let paragraphs: Vec<ParagraphRender> = openers
.iter()
.enumerate()
.map(|(i, opener)| {
let text = match opener {
Some(o) => format!("Lead in para {i}. {o} continuation here."),
None => format!("Lead in para {i}. Continuation here."),
};
ParagraphRender {
text,
events: vec![
EventMeta {
connective: None,
list_style: None,
},
EventMeta {
connective: opener.map(|s| s.to_string()),
list_style: None,
},
],
}
})
.collect();
RenderedDocument::from_paragraphs(paragraphs)
}
// Builds one single-sentence paragraph per list style; its only event records that
// style and no connective.
fn doc_with_list_styles(styles: &[ListStyle]) -> RenderedDocument {
let paragraphs: Vec<ParagraphRender> = styles
.iter()
.enumerate()
.map(|(i, ls)| ParagraphRender {
text: format!("Sentence {i} containing items."),
events: vec![EventMeta {
connective: None,
list_style: Some(*ls),
}],
})
.collect();
RenderedDocument::from_paragraphs(paragraphs)
}
// Builds one single-sentence paragraph per connective; the sentence text starts with
// the connective and its only event records it.
fn doc_with_connectives(connectives: &[&str]) -> RenderedDocument {
let paragraphs: Vec<ParagraphRender> = connectives
.iter()
.enumerate()
.map(|(i, c)| ParagraphRender {
text: format!("{c} sentence number {i}."),
events: vec![EventMeta {
connective: Some((*c).to_string()),
list_style: None,
}],
})
.collect();
RenderedDocument::from_paragraphs(paragraphs)
}
// Builds one paragraph per requested length, each a single sentence made of `n`
// repetitions of "word" and a default event.
fn doc_with_sentence_lengths(lengths: &[usize]) -> RenderedDocument {
let paragraphs: Vec<ParagraphRender> = lengths
.iter()
.map(|&n| {
let words = (0..n).map(|_| "word").collect::<Vec<_>>().join(" ");
ParagraphRender {
text: format!("{words}."),
events: vec![EventMeta::default()],
}
})
.collect();
RenderedDocument::from_paragraphs(paragraphs)
}
// "Additionally," opens three of four paragraphs, which meets the default threshold
// of 3, so exactly that connective is blacklisted.
#[test]
fn paragraph_opener_monotony_fires_at_threshold() {
let doc = doc_with_paragraph_openers(&[
Some("Additionally,"),
Some("Additionally,"),
Some("Additionally,"),
Some("However,"),
]);
let d = ParagraphOpenerMonotony::default().diagnose(&doc, None);
assert_eq!(d.len(), 1);
assert!(matches!(
&d[0].constraints[0],
RefineConstraint::BlacklistConnective(s) if s.starts_with("Additionally,")
));
}
// No connective reaches three paragraphs, so nothing is reported.
#[test]
fn paragraph_opener_monotony_silent_below_threshold() {
let doc = doc_with_paragraph_openers(&[
Some("Additionally,"),
Some("Additionally,"),
Some("Furthermore,"),
Some("However,"),
]);
let d = ParagraphOpenerMonotony::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// Two paragraphs are fewer than the default `min_paragraphs` of 4.
#[test]
fn paragraph_opener_monotony_silent_for_short_docs() {
let doc = doc_with_paragraph_openers(&[Some("Additionally,"), Some("Additionally,")]);
let d = ParagraphOpenerMonotony::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// `Including` is used three times within the four-entry window, meeting the default
// threshold of 3.
#[test]
fn list_style_fatigue_fires_when_one_style_dominates_window() {
let doc = doc_with_list_styles(&[
ListStyle::Including,
ListStyle::Including,
ListStyle::Including,
ListStyle::SuchAs,
]);
let d = ListStyleFatigue::default().diagnose(&doc, None);
assert_eq!(d.len(), 1);
assert!(matches!(
d[0].constraints[0],
RefineConstraint::BlacklistListStyle(ListStyle::Including)
));
}
// Four distinct styles: each appears once, below the threshold.
#[test]
fn list_style_fatigue_silent_when_diverse() {
let doc = doc_with_list_styles(&[
ListStyle::Including,
ListStyle::SuchAs,
ListStyle::Dash,
ListStyle::Bracketed,
]);
let d = ListStyleFatigue::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// Four of the five connectives classify as Elaboration (share 0.8), which exceeds the
// default `max_share` of 0.6.
#[test]
fn rst_imbalance_fires_when_one_relation_dominates() {
let doc = doc_with_connectives(&[
"Additionally,", "Additionally,",
"Furthermore,", "Additionally,",
"However,", ]);
let d = RstRelationImbalance::default().diagnose(&doc, None);
assert_eq!(d.len(), 1);
}
// Shares are 0.4 (Elaboration), 0.4 (Contrast) and 0.2 (Sequence); none exceeds 0.6.
#[test]
fn rst_imbalance_silent_when_balanced() {
let doc = doc_with_connectives(&[
"Additionally,",
"Additionally,",
"However,",
"However,",
"Similarly,",
]);
let d = RstRelationImbalance::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// Six sentences of identical length give a standard deviation of 0, below the default
// minimum of 2.0.
#[test]
fn document_scope_rhythm_fires_when_lengths_are_flat() {
let doc = doc_with_sentence_lengths(&[10, 10, 10, 10, 10, 10]);
let d = DocumentScopeRhythm::default().diagnose(&doc, None);
assert_eq!(d.len(), 1);
assert!(matches!(
d[0].constraints[0],
RefineConstraint::TightenLengthDistribution(_)
));
}
// These lengths have a standard deviation of roughly 5.3 words, above the minimum.
#[test]
fn document_scope_rhythm_silent_when_lengths_vary() {
let doc = doc_with_sentence_lengths(&[3, 12, 5, 18, 7, 14]);
let d = DocumentScopeRhythm::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// Five Continuation connectives exceed the default budget of 4 per family.
#[test]
fn connective_family_saturation_fires_above_budget() {
let doc = doc_with_connectives(&[
"Additionally,",
"Additionally,",
"Additionally,",
"Additionally,",
"Additionally,", ]);
let d = ConnectiveFamilySaturation::default().diagnose(&doc, None);
assert_eq!(d.len(), 1);
}
// Exactly four Continuation connectives: the budget is met but not exceeded.
#[test]
fn connective_family_saturation_silent_at_budget() {
let doc =
doc_with_connectives(&["Additionally,", "Additionally,", "Furthermore,", "It also"]);
let d = ConnectiveFamilySaturation::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// Without a profile the drift diagnoser returns early.
#[test]
fn profile_drift_silent_without_profile() {
let doc = doc_with_sentence_lengths(&[3, 5, 7, 9]);
let d = ProfileDistributionDrift::default().diagnose(&doc, None);
assert!(d.is_empty());
}
// NOTE(review): presumably `StyleProfile::neutral()` yields a sentence-length
// distribution for which `is_neutral()` is true — confirm in `crate::style`.
#[test]
fn profile_drift_silent_with_neutral_profile() {
let doc = doc_with_sentence_lengths(&[3, 5, 7, 9]);
let p = StyleProfile::neutral();
let d = ProfileDistributionDrift::default().diagnose(&doc, Some(&p));
assert!(d.is_empty());
}
// Every sentence has at most 8 words (the short bucket) while the target puts all
// weight on long sentences, so the largest bucket difference is 1.0, above the
// default delta of 0.25.
#[test]
fn profile_drift_fires_when_observed_misses_target() {
let doc = doc_with_sentence_lengths(&[3, 4, 5, 4, 3, 5]);
let target = crate::style::LengthDistribution {
short: 0.0,
medium: 0.0,
long: 1.0,
short_max_words: 8,
medium_max_words: 18,
};
let p = StyleProfile::builder("long-target")
.sentence_length(target)
.build()
.unwrap();
let d = ProfileDistributionDrift::default().diagnose(&doc, Some(&p));
assert_eq!(d.len(), 1);
}
}