// ironcontext_core/optimizer.rs

use std::collections::HashSet;
use std::sync::OnceLock;

use regex::Regex;
use serde::{Deserialize, Serialize};

use crate::manifest::Tool;
23
/// Result of rewriting one tool description, including before/after text
/// and the metrics used to judge the rewrite.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationOutcome {
    /// Name of the tool whose description was rewritten.
    pub tool: String,
    /// Whitespace-delimited token count of the original description.
    pub original_tokens: usize,
    /// Token count after all accepted rewrite stages.
    pub rewritten_tokens: usize,
    /// Percent token reduction; 0.0 when the original had no tokens.
    pub reduction_pct: f32,
    /// Term-frequency cosine similarity between original and rewritten text.
    pub semantic_similarity: f32,
    /// The untouched input description.
    pub original: String,
    /// The final rewritten description.
    pub rewritten: String,
    /// Names of the rewrite stages that actually changed the text.
    pub applied_rules: Vec<String>,
}
38
/// Strategy for rewriting a tool's description into a more compact form.
pub trait DescriptionOptimizer {
    /// Produce a rewritten description plus metrics for `tool`.
    fn rewrite(&self, tool: &Tool) -> OptimizationOutcome;
}
43
/// Rule-based optimizer: applies a fixed pipeline of rewrite rules, each
/// guarded by a stemmed-token Jaccard similarity floor against the original
/// text.
#[derive(Debug, Default)]
pub struct HeuristicOptimizer {
    /// Minimum Jaccard similarity a stage's output must keep relative to the
    /// original description; `None` falls back to the 0.7 default.
    pub min_jaccard: Option<f32>,
}
53
54impl HeuristicOptimizer {
55 pub fn new() -> Self {
56 Self::default()
57 }
58}
59
60impl DescriptionOptimizer for HeuristicOptimizer {
61 fn rewrite(&self, tool: &Tool) -> OptimizationOutcome {
62 let original = tool.description.clone();
63 let original_tokens = token_count(&original);
64 let mut applied: Vec<String> = Vec::new();
65
66 let floor = self.min_jaccard.unwrap_or(0.7);
71 let orig_set = stem_set(&original);
72
73 let stages: Vec<(&'static str, fn(&str) -> String)> = vec![
74 ("squash_whitespace", rule_squash_whitespace),
75 ("strip_markdown_emphasis", rule_strip_markdown_emphasis),
76 ("strip_politeness", rule_strip_politeness),
77 ("collapse_self_reference", rule_collapse_self_reference),
78 ("drop_use_when_clauses", rule_drop_use_when),
79 ("drop_generic_filler", rule_drop_generic_filler),
80 ("dedupe_sentences", rule_dedupe_sentences),
81 ];
82
83 let mut current = original.clone();
84 for (name, f) in stages {
85 let candidate = f(¤t);
86 let cand_set = stem_set(&candidate);
87 let j = jaccard(&orig_set, &cand_set);
88 if j >= floor {
89 if candidate != current {
90 applied.push(name.to_string());
91 current = candidate;
92 }
93 } }
95
96 let rewritten_tokens = token_count(¤t);
97 let reduction_pct = if original_tokens == 0 {
98 0.0
99 } else {
100 (original_tokens as f32 - rewritten_tokens as f32) / original_tokens as f32 * 100.0
101 };
102 let similarity = tf_cosine(&original, ¤t);
109
110 OptimizationOutcome {
111 tool: tool.name.clone(),
112 original_tokens,
113 rewritten_tokens,
114 reduction_pct,
115 semantic_similarity: similarity,
116 original,
117 rewritten: current,
118 applied_rules: applied,
119 }
120 }
121}
122
/// Collapse every run of Unicode whitespace to a single ASCII space and trim
/// both ends. Behaviorally identical to replacing /\s+/ with " " and
/// trimming, since `split_whitespace` uses the same whitespace definition.
fn rule_squash_whitespace(s: &str) -> String {
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
130
131fn rule_strip_markdown_emphasis(s: &str) -> String {
132 static RE: OnceLock<Regex> = OnceLock::new();
133 let re = RE.get_or_init(|| Regex::new(r"\*+([^*]+)\*+|_+([^_]+)_+").unwrap());
134 re.replace_all(s, |c: ®ex::Captures<'_>| {
135 c.get(1).or_else(|| c.get(2)).map(|m| m.as_str()).unwrap_or("").to_string()
136 })
137 .to_string()
138}
139
140fn rule_strip_politeness(s: &str) -> String {
141 static RE: OnceLock<Regex> = OnceLock::new();
142 let re = RE.get_or_init(|| {
143 Regex::new(
144 r"(?ix)\b(?:please\s+|kindly\s+|note\s+that\s+|be\s+sure\s+to\s+|make\s+sure\s+to\s+|in\s+order\s+to\s+|simply\s+|just\s+)",
145 )
146 .unwrap()
147 });
148 re.replace_all(s, "").to_string()
149}
150
151fn rule_collapse_self_reference(s: &str) -> String {
152 static RE: OnceLock<Regex> = OnceLock::new();
153 let re = RE.get_or_init(|| {
154 Regex::new(
155 r"(?ix)\bthis\s+(?:tool|function|endpoint|api)\s+(?:is\s+(?:a|an)\s+(?:tool|function)\s+that\s+|allows\s+you\s+to\s+|can\s+be\s+used\s+to\s+|will\s+|is\s+used\s+to\s+|is\s+designed\s+to\s+)",
156 )
157 .unwrap()
158 });
159 re.replace_all(s, "").to_string()
160}
161
162fn rule_dedupe_sentences(s: &str) -> String {
163 if !s.contains('.') {
166 return s.to_string();
167 }
168 let trailing_dot = s.trim_end().ends_with('.');
169 let mut seen: HashSet<String> = HashSet::new();
170 let mut out: Vec<String> = Vec::new();
171 for chunk in s.split('.') {
172 let trimmed = chunk.trim();
173 if trimmed.is_empty() {
174 continue;
175 }
176 let mut key_vec: Vec<String> = stem_set_of_str(trimmed).into_iter().collect();
177 key_vec.sort();
178 let key = key_vec.join(" ");
179 if seen.insert(key) {
180 out.push(trimmed.to_string());
181 }
182 }
183 if out.is_empty() {
184 String::new()
185 } else if trailing_dot {
186 out.join(". ") + "."
187 } else {
188 out.join(". ")
189 }
190}
191
192fn rule_drop_use_when(s: &str) -> String {
193 static RE: OnceLock<Regex> = OnceLock::new();
194 let re = RE.get_or_init(|| {
197 Regex::new(
198 r"(?ix)(^|\.)\s*use\s+this\s+(?:tool|function)\s+(?:when\s+you\s+(?:need|want)\s+to|to)[^.]*\.",
199 )
200 .unwrap()
201 });
202 re.replace_all(s, "$1").trim_start_matches('.').trim().to_string()
203}
204
205fn rule_drop_generic_filler(s: &str) -> String {
209 static RE: OnceLock<Regex> = OnceLock::new();
210 let re = RE.get_or_init(|| {
211 Regex::new(
212 r"(?ix)\b(?:
213 appropriately\s*,?\s*properly\s*,?\s*and\s+correctly
214 | properly\s*,?\s*and\s+correctly
215 | in\s+the\s+system
216 | for\s+downstream\s+use
217 | (?:it\s+handles\s+various\s+\w+\s+things(?:\s+and\s+returns\s+relevant\s+(?:stuff|results))?)
218 | (?:returns\s+relevant\s+(?:stuff|results))
219 | stuff\s+like\s+that(?:\s*,?\s*really)?
220 | this\s+or\s+that(?:\s+context)?
221 | (?:simply\s+)?just\s+by\s+passing\s+the\s+id
222 | the\s+resulting\s+\w+
223 )\b",
224 )
225 .unwrap()
226 });
227 re.replace_all(s, "").to_string()
228}
229
/// Number of whitespace-delimited tokens in `s`.
///
/// `split_whitespace` never yields empty items, so the previous
/// `.filter(|w| !w.is_empty())` was dead code and has been removed.
fn token_count(s: &str) -> usize {
    s.split_whitespace().count()
}
235
/// Stemmed content-word set of `s`; thin alias for `stem_set_of_str`.
fn stem_set(s: &str) -> HashSet<String> {
    stem_set_of_str(s)
}
239
240fn stem_set_of_str(s: &str) -> HashSet<String> {
241 s.split(|c: char| !c.is_alphanumeric())
242 .filter(|w| w.len() > 2)
243 .map(|w| w.to_ascii_lowercase())
244 .filter(|w| !is_stopword(w))
245 .map(|w| {
246 for suf in ["ing", "ed", "es", "s", "ly"] {
248 if w.ends_with(suf) && w.len() > suf.len() + 2 {
249 return w[..w.len() - suf.len()].to_string();
250 }
251 }
252 w
253 })
254 .collect()
255}
256
/// English function words plus domain filler ("tool", "stuff", "properly",
/// ...) that carry no distinguishing meaning for similarity comparisons.
const STOPWORDS: &[&str] = &[
    "the", "and", "for", "with", "that", "this", "from", "into",
    "you", "your", "yours", "use", "uses", "using", "used",
    "are", "was", "were", "will", "would", "could", "should",
    "can", "may", "might", "have", "has", "had", "been", "being",
    "its", "their", "them", "they", "our", "out", "any", "all",
    "such", "also", "than", "then", "but", "not", "via",
    "onto", "upon", "either", "both", "where", "when",
    "need", "needs", "needed", "want", "wants", "wanted",
    "please", "kindly", "note", "noted", "simply", "just", "sure",
    "tool", "tools", "function", "endpoint", "api",
    "appropriately", "properly", "correctly",
    "various", "things", "thing", "stuff", "relevant",
    "system", "downstream", "context", "back",
    "passing", "pass", "passed",
    "resulting", "result", "results", "returned",
    "allows", "allow", "allowed", "designed",
    "operation", "operations",
    "really", "actually", "essentially", "basically",
];

/// True when `w` (already lowercased) should be ignored for stemming and
/// similarity purposes.
fn is_stopword(w: &str) -> bool {
    STOPWORDS.contains(&w)
}
289
/// Jaccard similarity |a ∩ b| / |a ∪ b|. Two empty sets are defined as
/// identical (1.0).
fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f32 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    let shared = a.intersection(b).count() as f32;
    let total = a.union(b).count() as f32;
    // `total` can only be zero when both sets are empty (handled above);
    // keep the guard anyway so a 0/0 NaN can never escape.
    if total > 0.0 { shared / total } else { 0.0 }
}
302
303fn tf_cosine(a: &str, b: &str) -> f32 {
307 use std::collections::HashMap;
308 fn tf(s: &str) -> HashMap<String, f32> {
309 let mut m: HashMap<String, f32> = HashMap::new();
310 for w in s.split(|c: char| !c.is_alphanumeric()) {
311 if w.len() <= 2 {
312 continue;
313 }
314 let lower = w.to_ascii_lowercase();
315 if is_stopword(&lower) {
316 continue;
317 }
318 let stemmed = {
321 let mut out = lower.clone();
322 for suf in ["ing", "ed", "es", "s", "ly"] {
323 if out.ends_with(suf) && out.len() > suf.len() + 2 {
324 out.truncate(out.len() - suf.len());
325 break;
326 }
327 }
328 out
329 };
330 *m.entry(stemmed).or_insert(0.0) += 1.0;
331 }
332 m
333 }
334 let ta = tf(a);
335 let tb = tf(b);
336 if ta.is_empty() && tb.is_empty() {
337 return 1.0;
338 }
339 let mut dot = 0.0_f32;
340 let mut na2 = 0.0_f32;
341 let mut nb2 = 0.0_f32;
342 for v in ta.values() {
343 na2 += v * v;
344 }
345 for v in tb.values() {
346 nb2 += v * v;
347 }
348 for (k, va) in ta.iter() {
349 if let Some(vb) = tb.get(k) {
350 dot += va * vb;
351 }
352 }
353 let denom = na2.sqrt() * nb2.sqrt();
354 if denom == 0.0 {
355 0.0
356 } else {
357 (dot / denom).clamp(0.0, 1.0)
358 }
359}
360
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Convenience constructor: a `Tool` named "x" with the given description.
    fn t(desc: &str) -> Tool {
        Tool {
            name: "x".into(),
            description: desc.into(),
            input_schema: json!({}),
        }
    }

    // A heavily padded description should lose at least a quarter of its
    // tokens while keeping enough of the original vocabulary.
    #[test]
    fn shrinks_a_bloated_description() {
        let bloated = "Please note that this tool is a tool that allows you to compute the sum \
                       of two numbers. Use this tool when you need to add numbers. Simply pass \
                       two numbers and you will get the sum back.";
        let opt = HeuristicOptimizer::new();
        let out = opt.rewrite(&t(bloated));
        assert!(out.reduction_pct >= 25.0, "got {}%", out.reduction_pct);
        assert!(out.semantic_similarity >= 0.7, "jaccard {}", out.semantic_similarity);
    }

    // Already-terse input should pass through essentially untouched.
    #[test]
    fn preserves_short_descriptions() {
        let opt = HeuristicOptimizer::new();
        let out = opt.rewrite(&t("Adds two numbers."));
        assert!(out.reduction_pct >= 0.0);
        assert!(out.semantic_similarity >= 0.9);
    }

    // Internal whitespace runs collapse to a single space.
    #[test]
    fn squashes_whitespace() {
        let opt = HeuristicOptimizer::new();
        let out = opt.rewrite(&t("hello world"));
        assert_eq!(out.rewritten, "hello world");
    }

    // Repeated sentences with identical stem sets are removed.
    #[test]
    fn dedupes_duplicate_sentences() {
        let opt = HeuristicOptimizer::new();
        let out = opt.rewrite(&t("Returns the user. Returns the user. Returns the user."));
        assert!(out.rewritten_tokens < out.original_tokens);
    }

    // Emphasis markers are stripped while the emphasized text survives.
    #[test]
    fn strips_markdown_emphasis() {
        let opt = HeuristicOptimizer::new();
        let out = opt.rewrite(&t("**Adds** _two_ numbers"));
        assert!(!out.rewritten.contains('*'));
        assert!(!out.rewritten.contains('_'));
    }

    // The per-stage Jaccard floor keeps the rewrite's vocabulary close
    // to the original even for aggressive deletions.
    #[test]
    fn jaccard_guardrail_holds() {
        let opt = HeuristicOptimizer::new();
        let big = "Search the customer database for matching contact records by full name, email \
                   address, phone number, mailing address, or any combination thereof. Please \
                   note that this tool is a tool that allows you to perform such a search.";
        let out = opt.rewrite(&t(big));
        assert!(
            out.semantic_similarity >= 0.7,
            "jaccard {} dropped too low",
            out.semantic_similarity
        );
    }
}