use serde::{Deserialize, Serialize};
/// Surface-level linguistic measurements of a single piece of text.
///
/// Values are produced by `LinguisticExtractor::extract`. The `Default`
/// value (all zeros) is what `extract` returns for empty input.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct LinguisticFeatures {
/// Number of `char`s in the input text.
pub char_count: usize,
/// Number of word tokens found in the text.
pub word_count: usize,
/// Estimated number of sentences; at least 1 whenever any word was found.
pub sentence_count: usize,
/// Mean length of a word token.
pub avg_word_length: f32,
/// Mean number of words per sentence (`word_count / sentence_count`).
pub avg_sentence_length: f32,
/// Fraction of words longer than `ExtractorConfig::long_word_threshold`.
pub long_word_ratio: f32,
/// Flesch-Kincaid grade estimate, clamped to `0.0..=20.0`.
pub reading_grade_level: f32,
/// `!` characters per sentence; may exceed 1.0.
pub exclamation_ratio: f32,
/// `?` characters per sentence; may exceed 1.0.
pub question_ratio: f32,
/// Words of at least `ExtractorConfig::min_caps_word_length` that contain
/// an alphabetic character and whose alphabetic characters are all uppercase.
pub caps_word_count: usize,
/// `caps_word_count / word_count`.
pub caps_ratio: f32,
/// Number of hedging words ("maybe", "perhaps", ...).
pub hedge_count: usize,
/// Hedging words per sentence.
pub hedge_density: f32,
/// Number of certainty markers ("definitely", "always", ...).
pub certainty_count: usize,
/// Number of negative-emotion words ("worried", "stressed", ...).
pub negative_emotion_count: usize,
/// Negative-emotion words per sentence.
pub negative_emotion_density: f32,
/// Number of absolutist words ("always", "never", "nothing", ...).
pub absolutist_count: usize,
/// Absolutist words per sentence.
pub absolutist_density: f32,
/// Heuristic ratio of contractions to expandable forms, in `0.0..=1.0`;
/// 0.5 when the text offers no expandable forms to compare against.
pub contraction_ratio: f32,
/// Number of politeness markers ("please", "thanks", ...).
pub politeness_count: usize,
/// Fraction of words that are first-person pronouns.
pub first_person_ratio: f32,
/// Number of urgency words ("urgent", "asap", ...).
pub urgency_word_count: usize,
/// Heuristic count of sentence openers that look like imperatives.
pub imperative_count: usize,
/// Not derived from the text: `LinguisticExtractor::extract` always stores 0.0.
pub words_per_minute: f32,
/// Fraction of words that are filler words ("just", "really", ...).
pub filler_ratio: f32,
}
/// Tunable thresholds used by `LinguisticExtractor`.
#[derive(Clone, Debug)]
pub struct ExtractorConfig {
/// Words strictly longer than this count toward `long_word_ratio`.
pub long_word_threshold: usize,
/// Minimum length a word must have to be considered for `caps_word_count`.
pub min_caps_word_length: usize,
}
impl Default for ExtractorConfig {
fn default() -> Self {
Self {
long_word_threshold: 6,
min_caps_word_length: 3,
}
}
}
/// Derives `LinguisticFeatures` from raw text.
///
/// The extractor holds only its configuration; the derived `Default` uses
/// `ExtractorConfig::default()`.
#[derive(Clone, Debug, Default)]
pub struct LinguisticExtractor {
/// Thresholds applied while extracting features.
config: ExtractorConfig,
}
impl LinguisticExtractor {
    /// Creates an extractor configured with `ExtractorConfig::default()`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates an extractor that uses the supplied `config`.
    pub fn with_config(config: ExtractorConfig) -> Self {
        Self { config }
    }

    /// Computes every feature of `text` in a single pass over its tokens.
    ///
    /// Returns the all-zero default for empty input, and a value with only
    /// `char_count` set when the text contains no word tokens. All `*_density`
    /// fields and the exclamation/question ratios are per sentence; the other
    /// ratios are per word. `words_per_minute` is always `0.0` because it
    /// cannot be derived from text alone.
    pub fn extract(&self, text: &str) -> LinguisticFeatures {
        let char_count = text.chars().count();
        if char_count == 0 {
            return LinguisticFeatures::default();
        }

        let words = self.tokenize_words(text);
        let word_count = words.len();
        if word_count == 0 {
            return LinguisticFeatures {
                char_count,
                ..Default::default()
            };
        }

        // `max(1)` keeps every per-sentence division below well defined.
        let sentence_count = self.count_sentences(text).max(1);

        // Word lengths are measured in characters, not bytes, so accented or
        // non-Latin words are not reported as longer than they are.
        let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
        let avg_word_length = total_word_chars as f32 / word_count as f32;
        let avg_sentence_length = word_count as f32 / sentence_count as f32;

        let long_words = words
            .iter()
            .filter(|w| w.chars().count() > self.config.long_word_threshold)
            .count();
        let long_word_ratio = long_words as f32 / word_count as f32;

        let syllables = self.estimate_syllables(&words);
        let reading_grade_level = self.flesch_kincaid_grade(word_count, sentence_count, syllables);

        let exclamation_count = text.chars().filter(|&c| c == '!').count();
        let question_count = text.chars().filter(|&c| c == '?').count();
        let exclamation_ratio = exclamation_count as f32 / sentence_count as f32;
        let question_ratio = question_count as f32 / sentence_count as f32;

        // An all-caps word is long enough, has at least one letter, and has no
        // lowercase letter; digits and apostrophes are tolerated.
        let caps_word_count = words
            .iter()
            .filter(|w| {
                w.chars().count() >= self.config.min_caps_word_length
                    && w.chars().all(|c| c.is_uppercase() || !c.is_alphabetic())
                    && w.chars().any(|c| c.is_alphabetic())
            })
            .count();
        let caps_ratio = caps_word_count as f32 / word_count as f32;

        let hedge_count = self.count_hedge_words(&words);
        let hedge_density = hedge_count as f32 / sentence_count as f32;
        let certainty_count = self.count_certainty_markers(&words);
        let contraction_ratio = self.estimate_contraction_ratio(text);
        let politeness_count = self.count_politeness_markers(&words);
        let first_person_ratio = self.first_person_ratio(&words);
        let urgency_word_count = self.count_urgency_words(&words);
        let imperative_count = self.count_imperatives(text);
        let filler_count = self.count_filler_words(&words);
        let filler_ratio = filler_count as f32 / word_count as f32;
        let negative_emotion_count = self.count_negative_emotion_words(&words);
        let negative_emotion_density = negative_emotion_count as f32 / sentence_count as f32;
        let absolutist_count = self.count_absolutist_words(&words);
        let absolutist_density = absolutist_count as f32 / sentence_count as f32;

        LinguisticFeatures {
            char_count,
            word_count,
            sentence_count,
            avg_word_length,
            avg_sentence_length,
            long_word_ratio,
            reading_grade_level,
            exclamation_ratio,
            question_ratio,
            caps_word_count,
            caps_ratio,
            hedge_count,
            hedge_density,
            certainty_count,
            negative_emotion_count,
            negative_emotion_density,
            absolutist_count,
            absolutist_density,
            contraction_ratio,
            politeness_count,
            first_person_ratio,
            urgency_word_count,
            imperative_count,
            // Typing/speaking speed is not observable from the text itself.
            words_per_minute: 0.0,
            filler_ratio,
        }
    }

    /// Splits `text` on whitespace, `,`, `;` and `:` and strips surrounding
    /// punctuation from each token. Apostrophes are kept so contractions such
    /// as "don't" stay intact. Empty tokens are dropped.
    fn tokenize_words<'a>(&self, text: &'a str) -> Vec<&'a str> {
        text.split(|c: char| c.is_whitespace() || c == ',' || c == ';' || c == ':')
            .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric() && c != '\''))
            .filter(|w| !w.is_empty())
            .collect()
    }

    /// Counts how many entries of `words` appear in `lexicon`, comparing
    /// case-insensitively. Lexicon entries must be lowercase.
    fn count_lexicon_hits(words: &[&str], lexicon: &[&str]) -> usize {
        words
            .iter()
            .filter(|w| lexicon.contains(&w.to_lowercase().as_str()))
            .count()
    }

    /// Estimates the number of sentences in `text`.
    ///
    /// Every run of consecutive terminators (`.`, `!`, `?`) counts as one
    /// sentence end, so "...", "!!!" and "?!" each close a single sentence.
    /// Non-blank text without any terminator counts as one sentence. Periods
    /// inside abbreviations or decimal numbers are still counted.
    fn count_sentences(&self, text: &str) -> usize {
        let mut count = 0;
        let mut prev_was_terminator = false;
        for c in text.chars() {
            let is_terminator = matches!(c, '.' | '!' | '?');
            if is_terminator && !prev_was_terminator {
                count += 1;
            }
            prev_was_terminator = is_terminator;
        }
        if count == 0 && !text.trim().is_empty() {
            count = 1;
        }
        count
    }

    /// Sums the estimated syllable counts of all `words`.
    fn estimate_syllables(&self, words: &[&str]) -> usize {
        words.iter().map(|w| self.syllables_in_word(w)).sum()
    }

    /// Estimates syllables as the number of vowel groups (`a e i o u y`),
    /// discounting a trailing `e` when the word has more than one group.
    /// Every word counts as at least one syllable.
    fn syllables_in_word(&self, word: &str) -> usize {
        let word = word.to_lowercase();
        let vowels = ['a', 'e', 'i', 'o', 'u', 'y'];
        let mut count = 0;
        let mut prev_was_vowel = false;
        for c in word.chars() {
            let is_vowel = vowels.contains(&c);
            if is_vowel && !prev_was_vowel {
                count += 1;
            }
            prev_was_vowel = is_vowel;
        }
        if word.ends_with('e') && count > 1 {
            count -= 1;
        }
        count.max(1)
    }

    /// Flesch-Kincaid grade level, `0.39 * words/sentence + 11.8 *
    /// syllables/word - 15.59`, clamped to `0.0..=20.0`. Returns `0.0` when
    /// there are no words or no sentences.
    fn flesch_kincaid_grade(&self, words: usize, sentences: usize, syllables: usize) -> f32 {
        if words == 0 || sentences == 0 {
            return 0.0;
        }
        let asl = words as f32 / sentences as f32;
        let asw = syllables as f32 / words as f32;
        (0.39 * asl + 11.8 * asw - 15.59).clamp(0.0, 20.0)
    }

    /// Counts words that soften or qualify a statement.
    fn count_hedge_words(&self, words: &[&str]) -> usize {
        const HEDGE_WORDS: &[&str] = &[
            "maybe",
            "perhaps",
            "possibly",
            "probably",
            "might",
            "could",
            "seem",
            "seems",
            "seemed",
            "appear",
            "appears",
            "appeared",
            "think",
            "believe",
            "guess",
            "suppose",
            "assume",
            "somewhat",
            "fairly",
            "rather",
            "quite",
            "sort",
            "kind",
            "mostly",
            "generally",
            "usually",
            "often",
            "uncertain",
            "unsure",
            "unclear",
        ];
        Self::count_lexicon_hits(words, HEDGE_WORDS)
    }

    /// Counts words that signal strong confidence.
    fn count_certainty_markers(&self, words: &[&str]) -> usize {
        const CERTAINTY_WORDS: &[&str] = &[
            "definitely",
            "absolutely",
            "certainly",
            "clearly",
            "obviously",
            "surely",
            "undoubtedly",
            "always",
            "never",
            "must",
            "will",
            "proven",
            "fact",
            "guarantee",
            "positive",
            "confident",
        ];
        Self::count_lexicon_hits(words, CERTAINTY_WORDS)
    }

    /// Estimates how often the writer contracts where they could have.
    ///
    /// Contraction suffixes are counted as substrings of the lowercased text
    /// and divided by the number of space-delimited expandable words
    /// (" not ", " is ", ...). The result is clamped to `0.0..=1.0`; when no
    /// expandable word is present the neutral value `0.5` is returned.
    fn estimate_contraction_ratio(&self, text: &str) -> f32 {
        const CONTRACTIONS: &[&str] = &["n't", "'re", "'ve", "'ll", "'m", "'d", "'s"];
        const EXPANDED_FORMS: &[&str] = &[
            " i ", " you ", " we ", " they ", " he ", " she ", " it ", " not ", " will ",
            " would ", " have ", " has ", " is ", " are ",
        ];
        let text_lower = text.to_lowercase();
        let contraction_count: usize = CONTRACTIONS
            .iter()
            .map(|c| text_lower.matches(*c).count())
            .sum();
        let opportunities: usize = EXPANDED_FORMS
            .iter()
            .map(|form| text_lower.matches(*form).count())
            .sum();
        if opportunities == 0 {
            return 0.5;
        }
        (contraction_count as f32 / opportunities as f32).clamp(0.0, 1.0)
    }

    /// Counts courtesy words.
    fn count_politeness_markers(&self, words: &[&str]) -> usize {
        const POLITE_WORDS: &[&str] = &[
            "please",
            "thanks",
            "thank",
            "appreciate",
            "grateful",
            "sorry",
            "apologies",
            "apologize",
            "excuse",
            "pardon",
            "kindly",
            "welcome",
            "regards",
        ];
        Self::count_lexicon_hits(words, POLITE_WORDS)
    }

    /// Fraction of `words` that are first-person pronouns; `0.0` for an empty
    /// slice. Contracted forms such as "I'm" are separate tokens and do not
    /// match.
    fn first_person_ratio(&self, words: &[&str]) -> f32 {
        const FIRST_PERSON: &[&str] =
            &["i", "me", "my", "mine", "myself", "we", "us", "our", "ours"];
        if words.is_empty() {
            return 0.0;
        }
        Self::count_lexicon_hits(words, FIRST_PERSON) as f32 / words.len() as f32
    }

    /// Counts words that convey time pressure or importance.
    fn count_urgency_words(&self, words: &[&str]) -> usize {
        const URGENCY_WORDS: &[&str] = &[
            "urgent",
            "urgently",
            "asap",
            "immediately",
            "emergency",
            "critical",
            "crucial",
            "vital",
            "essential",
            "pressing",
            "now",
            "today",
            "deadline",
            "hurry",
            "quick",
            "quickly",
            "fast",
            "rush",
            "priority",
            "important",
        ];
        Self::count_lexicon_hits(words, URGENCY_WORDS)
    }

    /// Counts sentences that open with a common imperative verb.
    ///
    /// A sentence opener is the start of the text or whatever follows a
    /// terminator (`.`, `!`, `?`) and whitespace. Each sentence is examined
    /// exactly once, so it contributes at most 1 to the result. Starters keep
    /// their trailing space, so a one-word sentence such as "Stop!" does not
    /// match.
    fn count_imperatives(&self, text: &str) -> usize {
        const IMPERATIVE_STARTERS: &[&str] = &[
            "do ", "don't ", "please ", "make ", "let ", "get ", "take ", "give ", "tell ",
            "show ", "help ", "send ", "check ", "read ", "write ", "call ", "stop ", "start ",
            "go ", "come ",
        ];
        let text_lower = text.to_lowercase();
        text_lower
            .split(|c: char| matches!(c, '.' | '!' | '?'))
            .enumerate()
            // After the first segment, require whitespace right after the
            // terminator so "3.5" or "example.com" do not start a sentence.
            .filter(|(idx, segment)| *idx == 0 || segment.starts_with(char::is_whitespace))
            .filter(|(_, segment)| {
                let sentence = segment.trim();
                IMPERATIVE_STARTERS
                    .iter()
                    .any(|starter| sentence.starts_with(*starter))
            })
            .count()
    }

    /// Counts conversational filler words and intensifiers.
    fn count_filler_words(&self, words: &[&str]) -> usize {
        const FILLER_WORDS: &[&str] = &[
            "just",
            "actually",
            "basically",
            "really",
            "very",
            "literally",
            "honestly",
            "like",
            "so",
            "well",
            "anyway",
            "anyways",
            "totally",
            "completely",
            "definitely",
            "absolutely",
        ];
        Self::count_lexicon_hits(words, FILLER_WORDS)
    }

    /// Counts words that express worry, fear, anger, sadness or distress.
    fn count_negative_emotion_words(&self, words: &[&str]) -> usize {
        const NEGATIVE_EMOTION_WORDS: &[&str] = &[
            "worried",
            "worry",
            "worries",
            "worrying",
            "anxious",
            "anxiety",
            "nervous",
            "nervously",
            "afraid",
            "fear",
            "fears",
            "feared",
            "fearful",
            "scared",
            "scary",
            "panic",
            "panicked",
            "panicking",
            "stressed",
            "stress",
            "stressful",
            "tense",
            "tension",
            "uneasy",
            "dread",
            "dreading",
            "dreaded",
            "upset",
            "upsetting",
            "frustrated",
            "frustrating",
            "frustration",
            "annoyed",
            "annoying",
            "annoyance",
            "angry",
            "anger",
            "mad",
            "sad",
            "sadness",
            "depressed",
            "depressing",
            "depression",
            "hopeless",
            "hopelessness",
            "miserable",
            "terrible",
            "terribly",
            "awful",
            "horrible",
            "horribly",
            "worst",
            "struggling",
            "struggle",
            "struggles",
            "suffering",
            "suffer",
            "suffers",
            "overwhelmed",
            "overwhelming",
            "exhausted",
            "exhausting",
            "exhaustion",
            "desperate",
            "desperately",
            "desperation",
            "helpless",
            "helplessness",
            "stuck",
            "lost",
        ];
        Self::count_lexicon_hits(words, NEGATIVE_EMOTION_WORDS)
    }

    /// Counts all-or-nothing words.
    fn count_absolutist_words(&self, words: &[&str]) -> usize {
        const ABSOLUTIST_WORDS: &[&str] = &[
            "always",
            "never",
            "nothing",
            "everything",
            "completely",
            "totally",
            "absolutely",
            "entirely",
            "impossible",
            "perfectly",
            "forever",
            "everyone",
            "nobody",
            "nowhere",
            "anywhere",
            "constant",
            "constantly",
        ];
        Self::count_lexicon_hits(words, ABSOLUTIST_WORDS)
    }
}
impl LinguisticFeatures {
pub fn complexity_score(&self) -> f32 {
let grade_component = (self.reading_grade_level / 16.0).clamp(0.0, 1.0);
let length_component = (self.avg_sentence_length / 30.0).clamp(0.0, 1.0);
let word_component = (self.avg_word_length / 8.0).clamp(0.0, 1.0);
(0.4 * grade_component + 0.3 * length_component + 0.3 * word_component).clamp(0.0, 1.0)
}
pub fn emotional_intensity(&self) -> f32 {
let exclaim = (self.exclamation_ratio * 2.0).clamp(0.0, 1.0);
let caps = (self.caps_ratio * 10.0).clamp(0.0, 1.0);
(0.6 * exclaim + 0.4 * caps).clamp(0.0, 1.0)
}
pub fn uncertainty_score(&self) -> f32 {
let hedge = (self.hedge_density / 2.0).clamp(0.0, 1.0);
let question = (self.question_ratio).clamp(0.0, 1.0);
let certainty_inverse = 1.0 - (self.certainty_count as f32 / 3.0).clamp(0.0, 1.0);
(0.5 * hedge + 0.3 * question + 0.2 * certainty_inverse).clamp(0.0, 1.0)
}
pub fn anxiety_score(&self) -> f32 {
let neg_emotion = (self.negative_emotion_density / 2.0).clamp(0.0, 1.0);
let first_person = (self.first_person_ratio * 5.0).clamp(0.0, 1.0); let uncertainty = self.uncertainty_score();
let absolutist = (self.absolutist_density / 2.0).clamp(0.0, 1.0);
(0.35 * neg_emotion + 0.35 * first_person + 0.20 * uncertainty + 0.10 * absolutist)
.clamp(0.0, 1.0)
}
pub fn urgency_score(&self) -> f32 {
let words = (self.urgency_word_count as f32 / 3.0).clamp(0.0, 1.0);
let imperatives = (self.imperative_count as f32 / 2.0).clamp(0.0, 1.0);
let exclaim = (self.exclamation_ratio).clamp(0.0, 1.0);
(0.5 * words + 0.3 * imperatives + 0.2 * exclaim).clamp(0.0, 1.0)
}
pub fn formality_score(&self) -> f32 {
let contraction_inverse = 1.0 - self.contraction_ratio;
let complexity = self.complexity_score();
let emotional_inverse = 1.0 - self.emotional_intensity();
(0.4 * contraction_inverse + 0.3 * complexity + 0.3 * emotional_inverse).clamp(0.0, 1.0)
}
}
// Unit tests for the extractor heuristics and the derived scores.
#[cfg(test)]
mod tests {
use super::*;
// Punctuation is stripped from tokens and a single period ends one sentence.
#[test]
fn test_basic_extraction() {
let extractor = LinguisticExtractor::new();
let features = extractor.extract("Hello, this is a simple test.");
assert_eq!(features.word_count, 6);
assert_eq!(features.sentence_count, 1);
assert!(features.avg_word_length > 0.0);
}
// Empty input yields the all-zero default features.
#[test]
fn test_empty_text() {
let extractor = LinguisticExtractor::new();
let features = extractor.extract("");
assert_eq!(features.word_count, 0);
assert_eq!(features.char_count, 0);
}
// "urgent", "immediately" and "critical" are urgency words; the text also
// carries at least one exclamation mark per sentence.
#[test]
fn test_urgency_detection() {
let extractor = LinguisticExtractor::new();
let features = extractor.extract("URGENT! I need help immediately! This is critical!!!");
assert!(features.urgency_word_count >= 3);
assert!(features.urgency_score() > 0.5);
assert!(features.exclamation_ratio >= 1.0);
}
// Hedging words are matched case-insensitively ("Perhaps" counts).
#[test]
fn test_hedge_detection() {
let extractor = LinguisticExtractor::new();
let features = extractor.extract(
"I think maybe this might possibly work, but I'm not sure. Perhaps we should try.",
);
assert!(features.hedge_count >= 4);
assert!(features.uncertainty_score() > 0.3);
}
// A contraction-free, exclamation-free message must score as more formal
// than a casual one.
#[test]
fn test_formality_contrast() {
let extractor = LinguisticExtractor::new();
let casual = extractor.extract("hey what's up! can't wait to see ya there lol");
let formal = extractor.extract(
"I am writing to inquire about the status of my application. Thank you for your consideration.",
);
assert!(formal.formality_score() > casual.formality_score());
}
// Only fully uppercase words of at least the configured minimum length count.
#[test]
fn test_caps_detection() {
let extractor = LinguisticExtractor::new();
let features = extractor.extract("This is VERY IMPORTANT and you MUST read it NOW!");
assert!(features.caps_word_count >= 3);
assert!(features.caps_ratio > 0.1);
}
// Long words in one long sentence must outrank short words in short sentences.
#[test]
fn test_complexity_score() {
let extractor = LinguisticExtractor::new();
let simple = extractor.extract("I like cats. They are cute.");
let complex = extractor.extract(
"The epistemological implications of quantum mechanical phenomena necessitate \
a fundamental reconsideration of our ontological presuppositions regarding \
the nature of observable reality.",
);
assert!(complex.complexity_score() > simple.complexity_score());
}
// A sentence of one-syllable words sits at the bottom of the grade scale.
#[test]
fn test_flesch_kincaid() {
let extractor = LinguisticExtractor::new();
let features = extractor.extract("The cat sat on the mat.");
assert!(features.reading_grade_level < 5.0);
}
// Negative-emotion vocabulary drives both the raw count and the anxiety score.
#[test]
fn test_negative_emotion_detection() {
let extractor = LinguisticExtractor::new();
let anxious = extractor.extract(
"I'm so worried and stressed about this. I feel anxious and scared. \
The situation is terrible and I'm struggling to cope.",
);
let calm = extractor.extract(
"The project is going well. We have made good progress and the team \
is confident about the outcome.",
);
assert!(anxious.negative_emotion_count >= 5);
assert!(anxious.negative_emotion_density > 0.5);
assert!(calm.negative_emotion_count <= 1);
assert!(anxious.anxiety_score() > calm.anxiety_score());
}
// All-or-nothing words are counted; balanced wording scores (almost) none.
#[test]
fn test_absolutist_detection() {
let extractor = LinguisticExtractor::new();
let absolutist = extractor.extract(
"Everything is always terrible. Nothing ever works. I can never do anything right.",
);
let balanced =
extractor.extract("Sometimes things work out. Other times they don't. It varies.");
assert!(absolutist.absolutist_count >= 4);
assert!(balanced.absolutist_count <= 1);
}
// Anxiety and uncertainty are distinct signals: distress vocabulary raises
// the former, hedging raises the latter.
#[test]
fn test_anxiety_score_vs_uncertainty() {
let extractor = LinguisticExtractor::new();
let stressed = extractor.extract(
"I am so stressed and worried. I feel overwhelmed and exhausted. \
This is terrible and I don't know what to do.",
);
let hedging = extractor.extract(
"I think maybe we could possibly try this approach. \
Perhaps it might work, but I'm not entirely sure.",
);
assert!(stressed.anxiety_score() > hedging.anxiety_score());
assert!(hedging.uncertainty_score() > stressed.uncertainty_score());
}
}