use std::collections::HashSet;
use std::sync::OnceLock;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::manifest::Tool;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationOutcome {
pub tool: String,
pub original_tokens: usize,
pub rewritten_tokens: usize,
pub reduction_pct: f32,
pub semantic_similarity: f32,
pub original: String,
pub rewritten: String,
pub applied_rules: Vec<String>,
}
pub trait DescriptionOptimizer {
fn rewrite(&self, tool: &Tool) -> OptimizationOutcome;
}
#[derive(Debug, Default)]
pub struct HeuristicOptimizer {
pub min_jaccard: Option<f32>,
}
impl HeuristicOptimizer {
pub fn new() -> Self {
Self::default()
}
}
impl DescriptionOptimizer for HeuristicOptimizer {
fn rewrite(&self, tool: &Tool) -> OptimizationOutcome {
let original = tool.description.clone();
let original_tokens = token_count(&original);
let mut applied: Vec<String> = Vec::new();
let floor = self.min_jaccard.unwrap_or(0.7);
let orig_set = stem_set(&original);
let stages: Vec<(&'static str, fn(&str) -> String)> = vec![
("squash_whitespace", rule_squash_whitespace),
("strip_markdown_emphasis", rule_strip_markdown_emphasis),
("strip_politeness", rule_strip_politeness),
("collapse_self_reference", rule_collapse_self_reference),
("drop_use_when_clauses", rule_drop_use_when),
("drop_generic_filler", rule_drop_generic_filler),
("dedupe_sentences", rule_dedupe_sentences),
];
let mut current = original.clone();
for (name, f) in stages {
let candidate = f(¤t);
let cand_set = stem_set(&candidate);
let j = jaccard(&orig_set, &cand_set);
if j >= floor {
if candidate != current {
applied.push(name.to_string());
current = candidate;
}
} }
let rewritten_tokens = token_count(¤t);
let reduction_pct = if original_tokens == 0 {
0.0
} else {
(original_tokens as f32 - rewritten_tokens as f32) / original_tokens as f32 * 100.0
};
let similarity = tf_cosine(&original, ¤t);
OptimizationOutcome {
tool: tool.name.clone(),
original_tokens,
rewritten_tokens,
reduction_pct,
semantic_similarity: similarity,
original,
rewritten: current,
applied_rules: applied,
}
}
}
fn rule_squash_whitespace(s: &str) -> String {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"\s+").unwrap());
re.replace_all(s, " ").trim().to_string()
}
fn rule_strip_markdown_emphasis(s: &str) -> String {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"\*+([^*]+)\*+|_+([^_]+)_+").unwrap());
re.replace_all(s, |c: ®ex::Captures<'_>| {
c.get(1).or_else(|| c.get(2)).map(|m| m.as_str()).unwrap_or("").to_string()
})
.to_string()
}
fn rule_strip_politeness(s: &str) -> String {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(
r"(?ix)\b(?:please\s+|kindly\s+|note\s+that\s+|be\s+sure\s+to\s+|make\s+sure\s+to\s+|in\s+order\s+to\s+|simply\s+|just\s+)",
)
.unwrap()
});
re.replace_all(s, "").to_string()
}
fn rule_collapse_self_reference(s: &str) -> String {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(
r"(?ix)\bthis\s+(?:tool|function|endpoint|api)\s+(?:is\s+(?:a|an)\s+(?:tool|function)\s+that\s+|allows\s+you\s+to\s+|can\s+be\s+used\s+to\s+|will\s+|is\s+used\s+to\s+|is\s+designed\s+to\s+)",
)
.unwrap()
});
re.replace_all(s, "").to_string()
}
fn rule_dedupe_sentences(s: &str) -> String {
if !s.contains('.') {
return s.to_string();
}
let trailing_dot = s.trim_end().ends_with('.');
let mut seen: HashSet<String> = HashSet::new();
let mut out: Vec<String> = Vec::new();
for chunk in s.split('.') {
let trimmed = chunk.trim();
if trimmed.is_empty() {
continue;
}
let mut key_vec: Vec<String> = stem_set_of_str(trimmed).into_iter().collect();
key_vec.sort();
let key = key_vec.join(" ");
if seen.insert(key) {
out.push(trimmed.to_string());
}
}
if out.is_empty() {
String::new()
} else if trailing_dot {
out.join(". ") + "."
} else {
out.join(". ")
}
}
fn rule_drop_use_when(s: &str) -> String {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(
r"(?ix)(^|\.)\s*use\s+this\s+(?:tool|function)\s+(?:when\s+you\s+(?:need|want)\s+to|to)[^.]*\.",
)
.unwrap()
});
re.replace_all(s, "$1").trim_start_matches('.').trim().to_string()
}
fn rule_drop_generic_filler(s: &str) -> String {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(
r"(?ix)\b(?:
appropriately\s*,?\s*properly\s*,?\s*and\s+correctly
| properly\s*,?\s*and\s+correctly
| in\s+the\s+system
| for\s+downstream\s+use
| (?:it\s+handles\s+various\s+\w+\s+things(?:\s+and\s+returns\s+relevant\s+(?:stuff|results))?)
| (?:returns\s+relevant\s+(?:stuff|results))
| stuff\s+like\s+that(?:\s*,?\s*really)?
| this\s+or\s+that(?:\s+context)?
| (?:simply\s+)?just\s+by\s+passing\s+the\s+id
| the\s+resulting\s+\w+
)\b",
)
.unwrap()
});
re.replace_all(s, "").to_string()
}
fn token_count(s: &str) -> usize {
s.split_whitespace().filter(|w| !w.is_empty()).count()
}
fn stem_set(s: &str) -> HashSet<String> {
stem_set_of_str(s)
}
fn stem_set_of_str(s: &str) -> HashSet<String> {
s.split(|c: char| !c.is_alphanumeric())
.filter(|w| w.len() > 2)
.map(|w| w.to_ascii_lowercase())
.filter(|w| !is_stopword(w))
.map(|w| {
for suf in ["ing", "ed", "es", "s", "ly"] {
if w.ends_with(suf) && w.len() > suf.len() + 2 {
return w[..w.len() - suf.len()].to_string();
}
}
w
})
.collect()
}
fn is_stopword(w: &str) -> bool {
matches!(
w,
"the" | "and" | "for" | "with" | "that" | "this" | "from" | "into"
| "you" | "your" | "yours" | "use" | "uses" | "using" | "used"
| "are" | "was" | "were" | "will" | "would" | "could" | "should"
| "can" | "may" | "might" | "have" | "has" | "had" | "been" | "being"
| "its" | "their" | "them" | "they" | "our" | "out" | "any" | "all"
| "such" | "also" | "than" | "then" | "but" | "not" | "via"
| "onto" | "upon" | "either" | "both" | "where" | "when"
| "need" | "needs" | "needed" | "want" | "wants" | "wanted"
| "please" | "kindly" | "note" | "noted" | "simply" | "just" | "sure"
| "tool" | "tools" | "function" | "endpoint" | "api"
| "appropriately" | "properly" | "correctly"
| "various" | "things" | "thing" | "stuff" | "relevant"
| "system" | "downstream" | "context" | "back"
| "passing" | "pass" | "passed"
| "resulting" | "result" | "results" | "returned"
| "allows" | "allow" | "allowed" | "designed"
| "operation" | "operations"
| "really" | "actually" | "essentially" | "basically"
)
}
fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f32 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
let inter = a.intersection(b).count() as f32;
let union = a.union(b).count() as f32;
if union == 0.0 {
0.0
} else {
inter / union
}
}
fn tf_cosine(a: &str, b: &str) -> f32 {
use std::collections::HashMap;
fn tf(s: &str) -> HashMap<String, f32> {
let mut m: HashMap<String, f32> = HashMap::new();
for w in s.split(|c: char| !c.is_alphanumeric()) {
if w.len() <= 2 {
continue;
}
let lower = w.to_ascii_lowercase();
if is_stopword(&lower) {
continue;
}
let stemmed = {
let mut out = lower.clone();
for suf in ["ing", "ed", "es", "s", "ly"] {
if out.ends_with(suf) && out.len() > suf.len() + 2 {
out.truncate(out.len() - suf.len());
break;
}
}
out
};
*m.entry(stemmed).or_insert(0.0) += 1.0;
}
m
}
let ta = tf(a);
let tb = tf(b);
if ta.is_empty() && tb.is_empty() {
return 1.0;
}
let mut dot = 0.0_f32;
let mut na2 = 0.0_f32;
let mut nb2 = 0.0_f32;
for v in ta.values() {
na2 += v * v;
}
for v in tb.values() {
nb2 += v * v;
}
for (k, va) in ta.iter() {
if let Some(vb) = tb.get(k) {
dot += va * vb;
}
}
let denom = na2.sqrt() * nb2.sqrt();
if denom == 0.0 {
0.0
} else {
(dot / denom).clamp(0.0, 1.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
fn t(desc: &str) -> Tool {
Tool {
name: "x".into(),
description: desc.into(),
input_schema: json!({}),
}
}
#[test]
fn shrinks_a_bloated_description() {
let bloated = "Please note that this tool is a tool that allows you to compute the sum \
of two numbers. Use this tool when you need to add numbers. Simply pass \
two numbers and you will get the sum back.";
let opt = HeuristicOptimizer::new();
let out = opt.rewrite(&t(bloated));
assert!(out.reduction_pct >= 25.0, "got {}%", out.reduction_pct);
assert!(out.semantic_similarity >= 0.7, "jaccard {}", out.semantic_similarity);
}
#[test]
fn preserves_short_descriptions() {
let opt = HeuristicOptimizer::new();
let out = opt.rewrite(&t("Adds two numbers."));
assert!(out.reduction_pct >= 0.0);
assert!(out.semantic_similarity >= 0.9);
}
#[test]
fn squashes_whitespace() {
let opt = HeuristicOptimizer::new();
let out = opt.rewrite(&t("hello world"));
assert_eq!(out.rewritten, "hello world");
}
#[test]
fn dedupes_duplicate_sentences() {
let opt = HeuristicOptimizer::new();
let out = opt.rewrite(&t("Returns the user. Returns the user. Returns the user."));
assert!(out.rewritten_tokens < out.original_tokens);
}
#[test]
fn strips_markdown_emphasis() {
let opt = HeuristicOptimizer::new();
let out = opt.rewrite(&t("**Adds** _two_ numbers"));
assert!(!out.rewritten.contains('*'));
assert!(!out.rewritten.contains('_'));
}
#[test]
fn jaccard_guardrail_holds() {
let opt = HeuristicOptimizer::new();
let big = "Search the customer database for matching contact records by full name, email \
address, phone number, mailing address, or any combination thereof. Please \
note that this tool is a tool that allows you to perform such a search.";
let out = opt.rewrite(&t(big));
assert!(
out.semantic_similarity >= 0.7,
"jaccard {} dropped too low",
out.semantic_similarity
);
}
}