use once_cell::sync::Lazy;
use regex::Regex;
use serde::Serialize;
use std::collections::{BTreeMap, HashSet};
/// Keyword statistics for one piece of body text, as built by `extract`.
///
/// Serializable via `serde`; fields are emitted in declaration order.
#[derive(Serialize)]
pub struct Keywords {
/// Highest-ranked terms (at most five), ordered by descending count and then
/// alphabetically.
pub primary: Vec<KeywordHit>,
/// Question-like sentences found in the text (at most ten), each with a
/// trailing `?` appended.
pub questions: Vec<String>,
/// For each term in `primary`: its share of all counted (non-stop) words, as a
/// percentage rounded to one decimal place. Empty when no word was counted.
pub density: BTreeMap<String, f64>,
}
/// A single term together with how often it occurred in the analysed text.
#[derive(Serialize)]
pub struct KeywordHit {
/// The term, ASCII-lowercased by `extract`.
pub term: String,
/// Number of occurrences of `term` among the counted words.
pub count: usize,
}
/// Lowercase English function words that `extract` excludes from keyword counting.
///
/// `extract` lowercases every matched word before the membership test, so entries
/// must stay lowercase. Entries shorter than three letters (e.g. "a", "an", "is")
/// can never be produced by the `WORD` pattern, which requires at least three
/// characters; they are harmless but have no effect with the current pattern.
static STOP_WORDS: &[&str] = &[
"a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
"any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
"between", "both", "but", "by", "can", "did", "do", "does", "doing", "down",
"during", "each", "few", "for", "from", "further", "had", "has", "have",
"having", "he", "her", "here", "hers", "herself", "him", "himself", "his",
"how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me",
"might", "more", "most", "my", "myself", "no", "nor", "not", "now", "of",
"off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out",
"over", "own", "same", "she", "should", "so", "some", "such", "than", "that",
"the", "their", "theirs", "them", "themselves", "then", "there", "these",
"they", "this", "those", "through", "to", "too", "under", "until", "up",
"very", "was", "we", "were", "what", "when", "where", "which", "while", "who",
"whom", "why", "will", "with", "would", "you", "your", "yours", "yourself",
"yourselves",
];
/// Sentence delimiter: one or more consecutive `.`, `!` or `?` characters.
///
/// Compiled lazily on first use. `extract` uses it with `split`, so the matched
/// punctuation itself is not part of the resulting sentence pieces.
static SENTENCE_END: Lazy<Regex> = Lazy::new(|| Regex::new(r"[.!?]+").unwrap());
/// Word matcher: an ASCII letter followed by at least two more characters drawn
/// from ASCII letters, `-` and `'`, delimited by word boundaries on both sides.
///
/// Matches are therefore at least three characters long and may contain interior
/// hyphens or apostrophes (e.g. "well-known", "don't"). Compiled lazily on first use.
static WORD: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b[a-zA-Z][a-zA-Z\-']{2,}\b").unwrap());
pub fn extract(body_text: &str) -> Keywords {
let stop: HashSet<&str> = STOP_WORDS.iter().copied().collect();
let mut counts: BTreeMap<String, usize> = BTreeMap::new();
let mut total: usize = 0;
for m in WORD.find_iter(body_text) {
let w = m.as_str().to_ascii_lowercase();
if stop.contains(w.as_str()) {
continue;
}
total += 1;
*counts.entry(w).or_insert(0) += 1;
}
let mut sorted: Vec<(String, usize)> = counts.into_iter().collect();
sorted.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
let primary: Vec<KeywordHit> = sorted
.iter()
.take(5)
.map(|(t, c)| KeywordHit {
term: t.clone(),
count: *c,
})
.collect();
let mut density: BTreeMap<String, f64> = BTreeMap::new();
if total > 0 {
for (t, c) in sorted.iter().take(5) {
density.insert(t.clone(), ((*c as f64 / total as f64) * 1000.0).round() / 10.0);
}
}
let questions: Vec<String> = SENTENCE_END
.split(body_text)
.map(str::trim)
.filter(|s| !s.is_empty())
.filter_map(|s| {
None.or_else(|| {
let lower = s.to_ascii_lowercase();
let starts_with_question = [
"what ", "why ", "how ", "when ", "where ", "who ", "which ", "is ", "are ",
"do ", "does ", "can ", "should ", "will ",
]
.iter()
.any(|w| lower.starts_with(w));
if starts_with_question {
Some(format!("{s}?"))
} else {
None
}
})
})
.take(10)
.collect();
Keywords {
primary,
questions,
density,
}
}