use regex::Regex;
use std::collections::HashSet;
use std::sync::LazyLock;
/// A set of regex patterns that share a single scoring weight.
///
/// Consumed by [`score_patterns`], which adds `weight` to the score at most
/// once per group, no matter how many of the group's patterns match.
pub struct PatternGroup {
/// Patterns belonging to this group; one matching pattern is enough for the
/// group to count as matched.
pub patterns: Vec<Regex>,
/// Amount added to the score when the group matches.
pub weight: f64,
}
pub fn score_patterns(input: &str, groups: &[PatternGroup]) -> f64 {
let mut score = 0.0;
for group in groups {
for pattern in &group.patterns {
if pattern.is_match(input) {
score += group.weight;
break; }
}
}
score.min(1.0)
}
/// Lowercase English filler words that `extract_meaningful_words` discards.
///
/// Built lazily on first access. Lookups are exact string matches, so callers
/// must lowercase a word before testing it against this set.
static STOP_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // Three-letter words.
        "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
        "her", "was", "one", "our", "out", "has", "his", "how", "its", "let",
        "may", "new", "now", "old", "see", "way", "who", "did", "got", "get",
        "him", "yet", "say", "she", "too", "use", "own", "why", "try", "ran",
        "run", "set", "put", "add", "big", "end", "far", "few", "saw", "men",
        "two", "ask", "ago", "per", "any",
        // Longer words.
        "this", "that", "with", "from", "have", "been", "will", "would", "could",
        "should", "about", "their", "there", "these", "those", "which", "where",
        "when", "what", "into", "also", "more", "most", "some", "such", "than",
        "then", "them", "they", "very", "just", "only", "does", "each", "other",
        "being", "were", "here", "both", "between", "through", "during", "before",
        "after", "above", "below", "under", "over", "again", "further", "once",
    ])
});
/// Short lowercase technical abbreviations that `extract_meaningful_words`
/// keeps even when they are shorter than the requested minimum length.
///
/// Built lazily on first access; lookups are exact string matches.
static TECH_TERMS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    const TERMS: [&str; 19] = [
        "ai", "ml", "go", "ci", "cd", "k8s", "s3", "ec2", "ui", "ux",
        "db", "os", "ip", "vm", "io", "rx", "dl", "lr", "qa",
    ];
    TERMS.iter().copied().collect()
});
/// Separator used to tokenize text in `extract_meaningful_words`.
///
/// Matches one or more consecutive characters that are whitespace or one of
/// `, ; : ! ? . ( ) [ ] { } / \`. Compiled lazily on first use.
static WORD_SPLITTER: LazyLock<Regex> = LazyLock::new(|| {
    let separators = r"[\s,;:!?\.\(\)\[\]\{\}/\\]+";
    Regex::new(separators).expect("valid regex")
});
/// Extracts the distinct, lowercased "meaningful" words from `text`.
///
/// Processing steps:
/// 1. `text` is lowercased and split on `WORD_SPLITTER`.
/// 2. Non-alphanumeric characters are trimmed from both ends of each token.
/// 3. Empty tokens and entries of `STOP_WORDS` are dropped.
/// 4. A word is kept when it has at least `min_length` characters, or when it
///    is listed in `TECH_TERMS` (short abbreviations such as `"ai"`).
///
/// Length is measured in Unicode scalar values (`char`s), not UTF-8 bytes, so
/// the threshold is applied the same way to ASCII and non-ASCII words.
///
/// Returns a set, so each word appears once and the order is unspecified.
pub fn extract_meaningful_words(text: &str, min_length: usize) -> HashSet<String> {
    let lower = text.to_lowercase();
    WORD_SPLITTER
        .split(&lower)
        .map(|token| token.trim_matches(|c: char| !c.is_alphanumeric()))
        .filter(|word| !word.is_empty() && !STOP_WORDS.contains(*word))
        // `str::len` would count bytes and let short multi-byte words slip
        // past the threshold; count characters instead.
        .filter(|word| word.chars().count() >= min_length || TECH_TERMS.contains(*word))
        .map(str::to_owned)
        .collect()
}
/// Returns up to ten topics found in `text`.
///
/// Topics are the words produced by `extract_meaningful_words` with a minimum
/// length of 3, sorted in ascending `String` order; only the first ten after
/// sorting are returned.
pub fn extract_topics(text: &str) -> Vec<String> {
    let mut ordered = extract_meaningful_words(text, 3)
        .into_iter()
        .collect::<Vec<String>>();
    // The words come from a set and are therefore unique, so stability is irrelevant.
    ordered.sort_unstable();
    ordered.truncate(10);
    ordered
}
/// Builds a set containing the lowercased form of every entry in `topics`.
///
/// Entries that differ only by case collapse into a single element.
pub fn to_topic_set(topics: &[String]) -> HashSet<String> {
    let mut lowered = HashSet::with_capacity(topics.len());
    for topic in topics {
        lowered.insert(topic.to_lowercase());
    }
    lowered
}
/// Counts how many entries of `topics` are present in `topic_set`.
///
/// Each entry is lowercased before the lookup; `topic_set` itself is used as
/// given. Every entry of `topics` is counted individually, so repeated
/// entries count more than once.
pub fn count_topic_overlap(topics: &[String], topic_set: &HashSet<String>) -> usize {
    let mut hits = 0;
    for topic in topics {
        if topic_set.contains(topic.to_lowercase().as_str()) {
            hits += 1;
        }
    }
    hits
}
/// Computes the case-insensitive overlap between two topic lists as a ratio
/// in `[0.0, 1.0]`.
///
/// Both lists are lowercased and de-duplicated; the result is the number of
/// topics common to both divided by the size of the smaller de-duplicated
/// list. Returns `0.0` when either list is empty.
///
/// De-duplicating both sides keeps the ratio bounded by `1.0` even when an
/// input repeats a topic (possibly with different casing). For inputs without
/// such repeats the result equals `matches / min(a.len(), b.len())`.
pub fn topic_overlap_ratio(a: &[String], b: &[String]) -> f64 {
    let set_a: HashSet<String> = a.iter().map(|t| t.to_lowercase()).collect();
    let set_b: HashSet<String> = b.iter().map(|t| t.to_lowercase()).collect();

    let smaller = set_a.len().min(set_b.len());
    if smaller == 0 {
        // At least one side is empty: nothing can overlap.
        return 0.0;
    }

    let shared = set_a.intersection(&set_b).count();
    shared as f64 / smaller as f64
}
/// Builds a cluster key from the first two distinct topics in sorted order.
///
/// Topics shorter than three characters are ignored. The remaining topics are
/// lowercased, sorted ascending, de-duplicated, and the first two are joined
/// with `+` (for example `"async+rust"`). Returns an empty string when no
/// topic qualifies, and a single topic without `+` when only one qualifies.
///
/// Length is measured in characters rather than UTF-8 bytes, and duplicates
/// are removed after case-folding so that inputs such as `["Rust", "rust"]`
/// cannot produce a key like `"rust+rust"`.
pub fn build_topic_cluster(topics: &[String]) -> String {
    let mut normalized: Vec<String> = topics
        .iter()
        .filter(|t| t.chars().count() >= 3)
        .map(|t| t.to_lowercase())
        .collect();
    normalized.sort_unstable();
    // Sorting places equal strings next to each other, so `dedup` removes all repeats.
    normalized.dedup();
    normalized.truncate(2);
    normalized.join("+")
}
/// Returns the share of sentence terminators in `content` that are question
/// marks.
///
/// The characters `.`, `!`, `?` and newline each count as one terminator. The
/// terminator count is floored at 1, so text without any terminator yields
/// `0.0` instead of dividing by zero.
pub fn compute_question_ratio(content: &str) -> f64 {
    let mut question_marks = 0usize;
    let mut terminators = 0usize;
    for ch in content.chars() {
        match ch {
            '?' => {
                question_marks += 1;
                terminators += 1;
            }
            '.' | '!' | '\n' => terminators += 1,
            _ => {}
        }
    }
    question_marks as f64 / terminators.max(1) as f64
}
/// Classifies `content` into a response strategy, always returning a value.
///
/// Delegates to [`detect_response_strategy_safe`] and substitutes
/// `ResponseStrategy::DirectAnswer` when that function returns `None`.
pub fn detect_response_strategy(content: &str) -> crate::types::world::ResponseStrategy {
    match detect_response_strategy_safe(content) {
        Some(strategy) => strategy,
        None => crate::types::world::ResponseStrategy::DirectAnswer,
    }
}
/// Classifies `content` into a response strategy, or returns `None` when no
/// rule matches.
///
/// Rules are evaluated in this order and the first match wins (`len` is the
/// byte length of `content`):
///
/// 1. `ClarifyFirst` — at least two `?` characters and `len < 800`.
/// 2. `StepByStep` — at least two lines starting with a numbered marker
///    (`1.` / `1)`) and at least two sequence words (`first`, `then`, `next`,
///    `finally`, `step N`, case-insensitive).
/// 3. `StructuredAnalysis` — at least three markdown headings (`#` to
///    `####`), or a `|` character anywhere in text longer than three lines.
/// 4. `ExecuteTask` — fenced code blocks make up at least 40% of the bytes, or
///    there are at least three fenced code blocks.
/// 5. `DirectAnswer` — non-empty text of at most 200 bytes with no question
///    marks, numbered lines, headings or code blocks.
///
/// Anything else, including the empty string, yields `None`.
pub fn detect_response_strategy_safe(
    content: &str,
) -> Option<crate::types::world::ResponseStrategy> {
    use crate::types::world::ResponseStrategy;

    // A numbered list item at the start of a line, e.g. "1. " or "  2) ".
    static STEP_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?m)^\s*\d+[\.\)]\s").expect("valid regex")
    });
    // Sequence words. `\d+` (rather than a single `\d`) lets multi-digit
    // markers such as "step 10" satisfy the trailing word boundary.
    static SEQUENCE_MARKERS: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?i)\b(first|then|next|finally|step\s*\d+)\b").expect("valid regex")
    });
    // Markdown heading of level 1-4 at the start of a line.
    static HEADING_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?m)^#{1,4}\s").expect("valid regex")
    });
    // Fenced code block, matched non-greedily so adjacent blocks stay separate.
    static CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"```[\s\S]*?```").expect("valid regex")
    });
    const DIRECT_ANSWER_MAX_LEN: usize = 200;

    let len = content.len();
    let questions = content.matches('?').count();
    if questions >= 2 && len < 800 {
        return Some(ResponseStrategy::ClarifyFirst);
    }

    let step_count = STEP_PATTERN.find_iter(content).count();
    let seq_count = SEQUENCE_MARKERS.find_iter(content).count();
    if step_count >= 2 && seq_count >= 2 {
        return Some(ResponseStrategy::StepByStep);
    }

    let heading_count = HEADING_PATTERN.find_iter(content).count();
    if heading_count >= 3 || (content.contains('|') && content.lines().count() > 3) {
        return Some(ResponseStrategy::StructuredAnalysis);
    }

    // One scan gathers both the number of code blocks and their total byte length.
    let (code_count, code_len) = CODE_BLOCK
        .find_iter(content)
        .fold((0usize, 0usize), |(count, bytes), m| {
            (count + 1, bytes + m.as_str().len())
        });
    if (len > 0 && code_len as f64 / len as f64 >= 0.4) || code_count >= 3 {
        return Some(ResponseStrategy::ExecuteTask);
    }

    let is_short_plain_text = len > 0
        && len <= DIRECT_ANSWER_MAX_LEN
        && questions == 0
        && step_count == 0
        && heading_count == 0
        && code_count == 0;
    if is_short_plain_text {
        return Some(ResponseStrategy::DirectAnswer);
    }

    None
}
/// Unit tests for the topic-extraction, scoring and strategy-detection helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::world::ResponseStrategy;

    #[test]
    fn extract_topics_filters_stop_words() {
        let topics = extract_topics("the quick brown fox jumps over the lazy dog");
        assert!(!topics.contains(&"the".to_string()));
        assert!(topics.contains(&"quick".to_string()));
        assert!(topics.contains(&"brown".to_string()));
        assert!(topics.contains(&"jumps".to_string()));
    }

    #[test]
    fn extract_topics_keeps_tech_terms() {
        let text = "use ai and ml for QA";
        // "use", "and" and "for" are stop words; the rest are whitelisted
        // abbreviations that survive despite being shorter than three chars.
        let topics = extract_topics(text);
        assert_eq!(topics, vec!["ai", "ml", "qa"]);
        let words = extract_meaningful_words(text, 3);
        assert!(words.contains("ai"));
        assert!(words.contains("ml"));
        assert!(words.contains("qa"));
    }

    #[test]
    fn to_topic_set_lowercases_and_deduplicates() {
        let topics: Vec<String> = vec!["Rust".into(), "RUST".into(), "Async".into()];
        let set = to_topic_set(&topics);
        assert_eq!(set.len(), 2);
        assert!(set.contains("rust"));
        assert!(set.contains("async"));
    }

    #[test]
    fn count_topic_overlap_is_case_insensitive() {
        let set = to_topic_set(&["rust".to_string(), "async".to_string()]);
        let topics: Vec<String> = vec!["Rust".into(), "python".into(), "ASYNC".into()];
        assert_eq!(count_topic_overlap(&topics, &set), 2);
    }

    #[test]
    fn build_topic_cluster_sorts_and_limits() {
        let topics: Vec<String> = vec!["tokio".into(), "Rust".into(), "async".into()];
        assert_eq!(build_topic_cluster(&topics), "async+rust");
        let short: Vec<String> = vec!["ai".into(), "db".into()];
        assert_eq!(build_topic_cluster(&short), "");
    }

    #[test]
    fn topic_overlap_identical() {
        let a = vec!["rust".into(), "async".into()];
        let b = vec!["rust".into(), "async".into()];
        assert!((topic_overlap_ratio(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn topic_overlap_no_match() {
        let a = vec!["rust".into()];
        let b = vec!["python".into()];
        assert!((topic_overlap_ratio(&a, &b)).abs() < f64::EPSILON);
    }

    #[test]
    fn topic_overlap_partial() {
        let a = vec!["rust".into(), "async".into(), "tokio".into()];
        let b = vec!["rust".into(), "python".into()];
        let ratio = topic_overlap_ratio(&a, &b);
        assert!((ratio - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn topic_overlap_empty() {
        let a: Vec<String> = vec![];
        let b = vec!["rust".into()];
        assert_eq!(topic_overlap_ratio(&a, &b), 0.0);
    }

    #[test]
    fn score_patterns_basic() {
        let groups = vec![PatternGroup {
            patterns: vec![Regex::new(r"\berror\b").unwrap()],
            weight: 0.5,
        }];
        assert!((score_patterns("got an error", &groups) - 0.5).abs() < f64::EPSILON);
        assert!((score_patterns("all good", &groups)).abs() < f64::EPSILON);
    }

    #[test]
    fn score_patterns_caps_at_one() {
        let groups = vec![
            PatternGroup {
                patterns: vec![Regex::new(r"a").unwrap()],
                weight: 0.7,
            },
            PatternGroup {
                patterns: vec![Regex::new(r"b").unwrap()],
                weight: 0.7,
            },
        ];
        assert!((score_patterns("a b", &groups) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn question_ratio() {
        assert!((compute_question_ratio("What? How?") - 1.0).abs() < f64::EPSILON);
        assert!((compute_question_ratio("Hello world.")).abs() < f64::EPSILON);
    }

    #[test]
    fn detect_strategy_clarify() {
        let content = "What version are you using? What error do you see?";
        assert_eq!(detect_response_strategy(content), ResponseStrategy::ClarifyFirst);
    }

    #[test]
    fn detect_strategy_step_by_step_capitalized() {
        let content =
            "1. First, identify the issue.\n\
             2. Then, check the logs.\n\
             3. Next, apply the fix.\n\
             4. Finally, verify it works.";
        assert_eq!(
            detect_response_strategy(content),
            ResponseStrategy::StepByStep,
            "Capitalized sequence markers must be detected (case-insensitive)"
        );
    }

    #[test]
    fn detect_strategy_step_by_step_lowercase() {
        let content =
            "1. first identify the issue\n\
             2. then check the logs\n\
             3. next apply the fix";
        assert_eq!(
            detect_response_strategy(content),
            ResponseStrategy::StepByStep,
        );
    }

    #[test]
    fn safe_positive_matches_clarify() {
        let content = "What version? What error?";
        assert_eq!(
            detect_response_strategy_safe(content),
            Some(ResponseStrategy::ClarifyFirst)
        );
    }

    #[test]
    fn safe_positive_matches_step_by_step() {
        let content =
            "1. First, identify.\n\
             2. Then, check.\n\
             3. Finally, verify.";
        assert_eq!(
            detect_response_strategy_safe(content),
            Some(ResponseStrategy::StepByStep)
        );
    }

    #[test]
    fn safe_positive_matches_short_direct_answer() {
        let content = "The default port for Postgres is 5432.";
        assert_eq!(
            detect_response_strategy_safe(content),
            Some(ResponseStrategy::DirectAnswer)
        );
    }

    #[test]
    fn safe_ambiguous_single_line_numbered_returns_none() {
        let content = "1. First. 2. Then. 3. Finally.";
        assert_eq!(
            detect_response_strategy_safe(content),
            None,
            "Single-line numbered list is ambiguous — must return None"
        );
    }

    #[test]
    fn safe_ambiguous_single_question_clarifying_returns_none() {
        let content = "What do you mean exactly?";
        assert_eq!(
            detect_response_strategy_safe(content),
            None,
            "Single-question clarifying response is ambiguous — must return None"
        );
    }

    #[test]
    fn safe_ambiguous_medium_prose_returns_none() {
        let content = "The system architecture evolved over several iterations. \
                       Initial prototypes focused on correctness rather than performance, \
                       and subsequent versions refined the data pipeline while preserving \
                       the original semantic guarantees. Feedback from early adopters \
                       informed the subsequent redesign.";
        assert_eq!(
            detect_response_strategy_safe(content),
            None,
            "Medium narrative prose must not be misclassified as DirectAnswer"
        );
    }

    #[test]
    fn safe_empty_returns_none() {
        assert_eq!(detect_response_strategy_safe(""), None);
    }

    #[test]
    fn backcompat_wrapper_defaults_to_direct_answer() {
        let content = "1. First. 2. Then. 3. Finally.";
        assert_eq!(detect_response_strategy_safe(content), None);
        assert_eq!(
            detect_response_strategy(content),
            ResponseStrategy::DirectAnswer,
            "Back-compat wrapper defaults to DirectAnswer (unchanged behavior for callers)"
        );
    }
}