use once_cell::sync::Lazy;
use regex::Regex;
use serde::Serialize;
/// Result of scanning a text body for evidence signals, as produced by
/// `extract`.
#[derive(Serialize)]
pub struct InformationGain {
/// Overall score; `extract` clamps it to at most 10.
pub score: u32,
/// Per-signal match counts the score is derived from.
pub counts: SignalCounts,
/// Example matched snippets; `extract` collects up to three per signal kind
/// (numbered citations are counted but not sampled).
pub samples: Vec<Sample>,
}
/// Number of matches found for each signal pattern.
#[derive(Serialize)]
pub struct SignalCounts {
/// Matches of the `NAMED_QUOTE` pattern (a quotation followed by a dash and a name).
pub named_quotes: usize,
/// Matches of the `SAMPLE_SIZE` pattern (`n = …` or "<number> participants"-style phrases).
pub sample_sizes: usize,
/// Matches of the `YOY_DELTA` pattern (a percentage change followed by a comparison word).
pub yoy_deltas: usize,
/// Matches of the `FIRST_PERSON_EVIDENCE` pattern ("we measured …", "our dataset …").
pub first_person_evidence: usize,
/// Matches of the `METHOD_DISCLOSURE` pattern (methodology vocabulary).
pub method_disclosure: usize,
/// Matches of the `NUMBERED_CITATION` pattern (`[1]` … `[999]`). This is the raw
/// count; `extract` lets it contribute at most 5 to the score.
pub numbered_citations: usize,
}
/// One example match backing a signal count.
#[derive(Serialize)]
pub struct Sample {
/// Signal label; `extract` uses "named_quote", "sample_size", "yoy_delta",
/// "first_person_evidence" and "method_disclosure".
pub kind: &'static str,
/// Matched text as shortened by `push_samples`: at most 120 characters,
/// trimmed, with newlines replaced by spaces.
pub snippet: String,
}
/// Matches an attributed quotation: a quoted span of at least 20 characters
/// (straight or curly double quotes, no newline inside), then a dash (em dash,
/// en dash or hyphen), then a capitalised name of two or more words.
///
/// Compiled lazily on first use; the pattern is a fixed literal, so the
/// `unwrap` can only fail if the literal itself is edited into something
/// invalid.
static NAMED_QUOTE: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str =
        r#"["“][^"”\n]{20,}["”]\s*[—–-]\s*[A-Z][a-z]+(?:\s+[A-Z][a-zA-Z\-']+)+"#;
    Regex::new(PATTERN).unwrap()
});
/// Matches a stated sample size, case-insensitively: either `n = <digits>` or
/// a number (optionally with comma-grouped thousands) followed by a population
/// noun such as "patients", "respondents" or "firms".
///
/// Compiled lazily on first use from a fixed literal.
static SAMPLE_SIZE: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"(?i)\b(n\s*=\s*\d+|\d+(?:,\d{3})*\s+(patients?|participants?|respondents?|samples?|subjects?|stores?|customers?|transactions?|users?|companies|firms))\b";
    Regex::new(PATTERN).unwrap()
});
/// Matches a quantified change with a comparison, case-insensitively: a
/// direction word ("up", "fell", "increased", …), an optional "by", a number
/// with optional decimals, a unit (`%`, "percentage point(s)" or "pp"), and
/// then a comparison word ("from", "since", "vs", "year-over-year", "YoY", …).
///
/// Compiled lazily on first use from a fixed literal.
static YOY_DELTA: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"(?i)\b(up|down|rose|fell|grew|declined|increased|decreased)\s+(by\s+)?\d+(\.\d+)?\s*(%|percentage\s+points?|pp)\s+(from|since|vs\.?|versus|compared\s+to|year[-\s]over[-\s]year|YoY)\b";
    Regex::new(PATTERN).unwrap()
});
/// Matches first-person evidence phrasing, case-insensitively: either "we"
/// followed by one of a fixed list of research/engineering verbs ("we
/// measured", "we surveyed", "we built", …) or "our" followed by an evidence
/// noun ("our dataset", "our findings", "our team found", …).
///
/// The "train" alternative is written `train(ed)?`, in the same form as
/// `develop(ed)?` and `ship(ped)?`. The earlier `trained?|train` spelling made
/// only the final `d` optional and so also accepted the non-word "traine".
///
/// Compiled lazily on first use from a fixed literal.
static FIRST_PERSON_EVIDENCE: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"(?i)\b(we\s+(analy[sz]ed|measured|tracked|tested|surveyed|ran|conducted|sampled|interviewed|observed|sequenced|benchmarked|profiled|simulated|modell?ed|replicated|built|build|train(ed)?|develop(ed)?|designed|engineered|prototyped|deployed|ship(ped)?|launched|redirect|repair|treat)|our\s+(dataset|study|analysis|sample|cohort|lab|experiment|results|findings|patients|approach|team\s+(found|measured|observed|built|developed)))\b";
    Regex::new(PATTERN).unwrap()
});
/// Matches methodology vocabulary, case-insensitively: "methodology",
/// "inclusion criteria", "study design", "sample size", "control group",
/// "double-blind", "randomised"/"randomized", "cross-sectional", and similar.
///
/// Compiled lazily on first use from a fixed literal.
static METHOD_DISCLOSURE: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"(?i)\b(methodology|inclusion\s+criteria|exclusion\s+criteria|study\s+design|sample\s+size|control\s+group|double[-\s]blind|randomi[sz]ed|cross[-\s]sectional)\b";
    Regex::new(PATTERN).unwrap()
});
/// Matches a bracketed numeric citation marker of one to three digits, such as
/// `[7]` or `[128]`.
///
/// Compiled lazily on first use from a fixed literal.
static NUMBERED_CITATION: Lazy<Regex> = Lazy::new(|| {
    const PATTERN: &str = r"\[\d{1,3}\]";
    Regex::new(PATTERN).unwrap()
});
/// Scans `body_text` for evidence signals and summarises them.
///
/// Every signal pattern is counted over the whole text. The score is the sum
/// of those counts, with numbered citations contributing at most 5, clamped to
/// a maximum of 10. Up to three example snippets are then collected for each
/// signal except numbered citations, in the order: named quotes, sample sizes,
/// year-over-year deltas, first-person evidence, method disclosure.
///
/// The returned `counts` hold the raw, uncapped match counts.
pub fn extract(body_text: &str) -> InformationGain {
    // Highest score that can be reported.
    const MAX_SCORE: usize = 10;
    // Most points that numbered citations alone may add to the score.
    const CITATION_SCORE_CAP: usize = 5;
    // Number of example snippets kept per sampled signal.
    const SAMPLES_PER_SIGNAL: usize = 3;

    let counts = SignalCounts {
        named_quotes: NAMED_QUOTE.find_iter(body_text).count(),
        sample_sizes: SAMPLE_SIZE.find_iter(body_text).count(),
        yoy_deltas: YOY_DELTA.find_iter(body_text).count(),
        first_person_evidence: FIRST_PERSON_EVIDENCE.find_iter(body_text).count(),
        method_disclosure: METHOD_DISCLOSURE.find_iter(body_text).count(),
        numbered_citations: NUMBERED_CITATION.find_iter(body_text).count(),
    };

    let total = counts.named_quotes
        + counts.sample_sizes
        + counts.yoy_deltas
        + counts.first_person_evidence
        + counts.method_disclosure
        + counts.numbered_citations.min(CITATION_SCORE_CAP);

    // Clamp while still in `usize`, then narrow. Narrowing first would wrap
    // for totals at or above 2^32 and could turn a huge total into a low
    // score; after the clamp the value is at most 10, so the cast is lossless.
    let score = total.min(MAX_SCORE) as u32;

    let mut samples: Vec<Sample> = Vec::new();
    push_samples(
        &mut samples,
        "named_quote",
        &NAMED_QUOTE,
        body_text,
        SAMPLES_PER_SIGNAL,
    );
    push_samples(
        &mut samples,
        "sample_size",
        &SAMPLE_SIZE,
        body_text,
        SAMPLES_PER_SIGNAL,
    );
    push_samples(
        &mut samples,
        "yoy_delta",
        &YOY_DELTA,
        body_text,
        SAMPLES_PER_SIGNAL,
    );
    push_samples(
        &mut samples,
        "first_person_evidence",
        &FIRST_PERSON_EVIDENCE,
        body_text,
        SAMPLES_PER_SIGNAL,
    );
    push_samples(
        &mut samples,
        "method_disclosure",
        &METHOD_DISCLOSURE,
        body_text,
        SAMPLES_PER_SIGNAL,
    );

    InformationGain {
        score,
        counts,
        samples,
    }
}
/// Appends up to `max` example snippets for one signal to `out`.
///
/// Matches of `re` in `text` are taken in order of appearance. Each snippet is
/// the first 120 characters of the matched text, trimmed of surrounding
/// whitespace, with every newline replaced by a space, and is tagged with
/// `kind`.
fn push_samples(
    out: &mut Vec<Sample>,
    kind: &'static str,
    re: &Regex,
    text: &str,
    max: usize,
) {
    let snippets = re.find_iter(text).take(max).map(|found| {
        // Truncate by characters rather than bytes so multi-byte text is
        // never cut in the middle of a code point.
        let clipped: String = found.as_str().chars().take(120).collect();
        Sample {
            kind,
            snippet: clipped.trim().replace('\n', " "),
        }
    });
    out.extend(snippets);
}
/// Builds an improvement hint for content with a low information-gain score.
///
/// Returns `None` when `word_count` is below 300 or when the score is 5 or
/// higher. Scores of 0–1 and 2–4 each produce their own message, prefixed
/// with the score out of 10.
pub fn suggestion(ig: &InformationGain, word_count: usize) -> Option<String> {
    if word_count < 300 {
        return None;
    }

    let score = ig.score;
    let advice = match score {
        0 | 1 => {
            "Rewritten / templated content reads weak. Add named-source quotes, sample sizes (n=…), YoY deltas, first-party evidence."
        }
        2..=4 => {
            "Below the competitive band (5..7). Add first-party data: named quotes, methodology, sample sizes."
        }
        _ => return None,
    };

    Some(format!("Information Gain {}/10. {}", score, advice))
}