// ironcontext_core/ris.rs
//! Reasoning-Impact Score (RIS).
//!
//! `RIS ∈ [0, 100]`, higher = *more harmful* to agent reasoning.
//!
//! ```text
//! RIS = clamp(0, 100,
//!         30·imperative_density
//!       + 35·instruction_leakage
//!       + 15·ambiguity
//!       + 10·length_bloat
//!       +  5·overlap_penalty
//!       +  5·schema_mismatch
//! )
//! ```
//!
//! Each component is normalized to [0, 1]. All components are deterministic
//! (no LLM, no randomness) so scores are stable across runs and across
//! platforms — critical for using RIS as a CI gate.
use std::hash::{Hash, Hasher};
use std::sync::OnceLock;

use regex::Regex;
use serde::{Deserialize, Serialize};

use crate::manifest::{Manifest, Tool};
27
/// Final score for a single tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RisScore {
    /// Tool name, copied from the manifest entry.
    pub tool: String,
    /// Weighted, clamped total; higher = more harmful (see module docs).
    pub score: u8, // 0..=100
    /// The per-component values (each in [0, 1]) behind `score`.
    pub breakdown: RisBreakdown,
    /// Reporting band derived from `score` via `RisBand::from_score`.
    pub band: RisBand,
}
36
/// Coarse reporting band for a RIS score.
///
/// Serialized in kebab-case (`"low"`, `"medium"`, `"high"`, `"severe"`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
pub enum RisBand {
    Low,    // 0..30
    Medium, // 30..60
    High,   // 60..80
    Severe, // 80..=100
}
45
46impl RisBand {
47    pub fn from_score(s: u8) -> Self {
48        match s {
49            0..=29 => RisBand::Low,
50            30..=59 => RisBand::Medium,
51            60..=79 => RisBand::High,
52            _ => RisBand::Severe,
53        }
54    }
55}
56
/// Per-component breakdown of a [`RisScore`]. Each component is in [0, 1]
/// *before* weighting (see the module-level formula for the weights).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RisBreakdown {
    pub imperative_density: f32,
    pub instruction_leakage: f32,
    pub ambiguity: f32,
    pub length_bloat: f32,
    pub overlap_penalty: f32,
    pub schema_mismatch: f32,
    /// Which component dominated the score (for the report's "why").
    pub dominant: String,
}
68
// Component weights. They sum to 100, so each component's [0, 1] value maps
// directly onto RIS points.
const W_IMPERATIVE: f32 = 30.0;
const W_INSTRUCTION: f32 = 35.0;
const W_AMBIGUITY: f32 = 15.0;
const W_LENGTH: f32 = 10.0;
const W_OVERLAP: f32 = 5.0;
const W_SCHEMA: f32 = 5.0;
75
76fn re_imperative() -> &'static Regex {
77    static R: OnceLock<Regex> = OnceLock::new();
78    R.get_or_init(|| {
79        Regex::new(
80            r"(?ix)\b(?:must|always|never|immediately|do\s+not|always\s+ensure|be\s+sure\s+to|make\s+sure)\b",
81        )
82        .unwrap()
83    })
84}
85
/// Lazily-compiled, case-insensitive matcher for prompt-style reasoning
/// instructions ("think step by step", "before answering", …) — text that
/// tries to steer the agent rather than describe the tool.
fn re_instruction_leak() -> &'static Regex {
    static R: OnceLock<Regex> = OnceLock::new();
    R.get_or_init(|| {
        Regex::new(
            r"(?ix)\b(?:think\s+step\s*by\s*step|first\s+(?:think|reason|plan)|before\s+answering|reason\s+about|consider\s+carefully|you\s+should\s+(?:think|plan|reason))\b",
        )
        .unwrap()
    })
}
95
/// Lazily-compiled, case-insensitive matcher for vague filler words
/// ("something", "appropriate", "various", …). Some density is normal in
/// prose; `ambiguity()` only saturates well above that baseline.
fn re_ambiguity() -> &'static Regex {
    static R: OnceLock<Regex> = OnceLock::new();
    R.get_or_init(|| {
        Regex::new(
            r"(?ix)\b(?:it|this|that|something|stuff|things|appropriate|relevant|suitable|properly|correctly|various)\b",
        )
        .unwrap()
    })
}
105
/// Tokenize naively into lowercase word tokens.
///
/// Splits on every non-alphanumeric character, drops empty fragments, and
/// ASCII-lowercases each surviving word (matching `token_hashes`).
fn tokens(s: &str) -> Vec<String> {
    let mut words = Vec::new();
    for piece in s.split(|c: char| !c.is_alphanumeric()) {
        if !piece.is_empty() {
            words.push(piece.to_ascii_lowercase());
        }
    }
    words
}
113
114fn imperative_density(desc: &str) -> f32 {
115    let toks = tokens(desc);
116    if toks.is_empty() {
117        return 0.0;
118    }
119    let hits = re_imperative().find_iter(desc).count() as f32;
120    // saturate at ~5% imperative density
121    (hits / toks.len() as f32 / 0.05).min(1.0)
122}
123
124fn instruction_leakage(desc: &str) -> f32 {
125    let hits = re_instruction_leak().find_iter(desc).count();
126    match hits {
127        0 => 0.0,
128        1 => 0.6,
129        2 => 0.85,
130        _ => 1.0,
131    }
132}
133
134fn ambiguity(desc: &str) -> f32 {
135    let toks = tokens(desc);
136    if toks.is_empty() {
137        return 0.0;
138    }
139    let hits = re_ambiguity().find_iter(desc).count() as f32;
140    // saturate at 20% vague-word density (informational descriptions naturally use some)
141    (hits / toks.len() as f32 / 0.20).min(1.0)
142}
143
144fn length_bloat(desc: &str) -> f32 {
145    let n = tokens(desc).len() as f32;
146    // utility plateaus around ~60 tokens; everything above 200 is full bloat.
147    if n <= 60.0 {
148        0.0
149    } else if n >= 200.0 {
150        1.0
151    } else {
152        (n - 60.0) / 140.0
153    }
154}
155
156fn schema_mismatch(t: &Tool) -> f32 {
157    let schema_l = t.input_schema.to_string().to_lowercase();
158    schema_mismatch_cached(&t.description, &schema_l)
159}
160
/// Fraction of "capability verbs" in the description that leave no trace in
/// the (already-lowercased) schema text. Returns 0.0 when no tracked verb
/// appears at all.
///
/// `schema_l` must already be lowercase; the description is lowercased here.
fn schema_mismatch_cached(description: &str, schema_l: &str) -> f32 {
    let desc_l = description.to_lowercase();

    // Verbs that *should* leave a fingerprint in the schema if real.
    let pairs: [(&str, &[&str]); 4] = [
        ("delete", &["delete", "remove", "destroy"]),
        ("upload", &["upload", "file", "content"]),
        ("send email", &["to", "subject", "body", "recipient"]),
        ("schedule", &["when", "time", "cron", "schedule"]),
    ];

    let mut checked = 0u32;
    let mut mismatches = 0u32;
    for (verb, fingerprints) in pairs {
        if !desc_l.contains(verb) {
            continue;
        }
        checked += 1;
        let reflected = fingerprints.iter().any(|k| schema_l.contains(k));
        if !reflected {
            mismatches += 1;
        }
    }

    match checked {
        0 => 0.0,
        n => mismatches as f32 / n as f32,
    }
}
189
190fn overlap_penalty_for(tool: &Tool, peers: &[&Tool]) -> f32 {
191    if peers.is_empty() {
192        return 0.0;
193    }
194    let me = token_hashes(&tool.description);
195    let peer_hashes: Vec<Vec<u64>> = peers
196        .iter()
197        .map(|p| token_hashes(&p.description))
198        .collect();
199    overlap_penalty_against(&tool.name, &me, peers, &peer_hashes)
200}
201
202fn overlap_penalty_against(
203    self_name: &str,
204    me: &[u64],
205    peers: &[&Tool],
206    cache: &[Vec<u64>],
207) -> f32 {
208    if me.is_empty() {
209        return 0.0;
210    }
211    let mut best: f32 = 0.0;
212    for (i, p) in peers.iter().enumerate() {
213        if p.name == self_name {
214            continue;
215        }
216        let other = &cache[i];
217        if other.is_empty() {
218            continue;
219        }
220        let (inter, union) = merge_intersect_union(me, other);
221        let jaccard = if union == 0 { 0.0 } else { inter as f32 / union as f32 };
222        if jaccard > best {
223            best = jaccard;
224        }
225    }
226    // Penalize once overlap is above 0.5; saturate at 0.9.
227    ((best - 0.5) / 0.4).clamp(0.0, 1.0)
228}
229
/// Sorted-deduped u64 hashes of the description's tokens. Two of these can be
/// intersected with a simple merge walk — much cheaper than `HashSet<String>`
/// operations on the per-pair hot path.
///
/// Hashes are only ever compared within one process run, so hasher stability
/// across Rust versions is irrelevant here.
fn token_hashes(desc: &str) -> Vec<u64> {
    let mut hashes = Vec::new();
    for word in desc.split(|c: char| !c.is_alphanumeric()) {
        if word.is_empty() {
            continue;
        }
        // Hash char-by-char with ASCII lowercasing so "Foo" and "foo" collide.
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        for ch in word.chars() {
            ch.to_ascii_lowercase().hash(&mut hasher);
        }
        hashes.push(hasher.finish());
    }
    hashes.sort_unstable();
    hashes.dedup();
    hashes
}
249
/// Intersection and union sizes of two sorted-deduped slices via a single
/// merge walk — no allocation, O(|a| + |b|).
fn merge_intersect_union(a: &[u64], b: &[u64]) -> (usize, usize) {
    use std::cmp::Ordering;

    let (mut i, mut j) = (0usize, 0usize);
    let (mut inter, mut union) = (0usize, 0usize);
    while i < a.len() && j < b.len() {
        // Every comparison consumes at least one distinct element.
        union += 1;
        match a[i].cmp(&b[j]) {
            Ordering::Less => i += 1,
            Ordering::Greater => j += 1,
            Ordering::Equal => {
                inter += 1;
                i += 1;
                j += 1;
            }
        }
    }
    // Whatever remains on either side is unique to that side.
    union += (a.len() - i) + (b.len() - j);
    (inter, union)
}
275
276pub fn score_tool(tool: &Tool, peers: &[&Tool]) -> RisScore {
277    let imperative = imperative_density(&tool.description);
278    let leakage = instruction_leakage(&tool.description);
279    let amb = ambiguity(&tool.description);
280    let bloat = length_bloat(&tool.description);
281    let overlap = overlap_penalty_for(tool, peers);
282    let mismatch = schema_mismatch(tool);
283    assemble_score(tool, imperative, leakage, amb, bloat, overlap, mismatch)
284}
285
286pub fn score_manifest(m: &Manifest) -> Vec<RisScore> {
287    // Precompute per-tool artifacts so the per-tool scoring loop is O(N) work
288    // (regex + merge-intersect) rather than the O(N²) string-hashing it would
289    // otherwise be.
290    let peers: Vec<&Tool> = m.tools.iter().collect();
291    let token_hashes_per_tool: Vec<Vec<u64>> = peers
292        .iter()
293        .map(|t| token_hashes(&t.description))
294        .collect();
295    let schema_texts: Vec<String> = peers
296        .iter()
297        .map(|t| t.input_schema.to_string().to_lowercase())
298        .collect();
299
300    m.tools
301        .iter()
302        .enumerate()
303        .map(|(i, t)| {
304            score_tool_cached(
305                t,
306                &peers,
307                &token_hashes_per_tool,
308                &schema_texts[i],
309                i,
310            )
311        })
312        .collect()
313}
314
315fn score_tool_cached(
316    tool: &Tool,
317    peers: &[&Tool],
318    token_hashes_per_tool: &[Vec<u64>],
319    schema_text: &str,
320    self_idx: usize,
321) -> RisScore {
322    let imperative = imperative_density(&tool.description);
323    let leakage = instruction_leakage(&tool.description);
324    let amb = ambiguity(&tool.description);
325    let bloat = length_bloat(&tool.description);
326    let me = &token_hashes_per_tool[self_idx];
327    let overlap = overlap_penalty_against(&tool.name, me, peers, token_hashes_per_tool);
328    let mismatch = schema_mismatch_cached(&tool.description, schema_text);
329    assemble_score(tool, imperative, leakage, amb, bloat, overlap, mismatch)
330}
331
332fn assemble_score(
333    tool: &Tool,
334    imperative: f32,
335    leakage: f32,
336    amb: f32,
337    bloat: f32,
338    overlap: f32,
339    mismatch: f32,
340) -> RisScore {
341    let total = W_IMPERATIVE * imperative
342        + W_INSTRUCTION * leakage
343        + W_AMBIGUITY * amb
344        + W_LENGTH * bloat
345        + W_OVERLAP * overlap
346        + W_SCHEMA * mismatch;
347    let total = total.clamp(0.0, 100.0);
348    let score = total.round() as u8;
349
350    let contributions = [
351        ("imperative_density", W_IMPERATIVE * imperative),
352        ("instruction_leakage", W_INSTRUCTION * leakage),
353        ("ambiguity", W_AMBIGUITY * amb),
354        ("length_bloat", W_LENGTH * bloat),
355        ("overlap_penalty", W_OVERLAP * overlap),
356        ("schema_mismatch", W_SCHEMA * mismatch),
357    ];
358    let dominant = contributions
359        .iter()
360        .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
361        .map(|(name, _)| (*name).to_string())
362        .unwrap_or_else(|| "imperative_density".to_string());
363
364    RisScore {
365        tool: tool.name.clone(),
366        score,
367        band: RisBand::from_score(score),
368        breakdown: RisBreakdown {
369            imperative_density: imperative,
370            instruction_leakage: leakage,
371            ambiguity: amb,
372            length_bloat: bloat,
373            overlap_penalty: overlap,
374            schema_mismatch: mismatch,
375            dominant,
376        },
377    }
378}
379
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Build a minimal tool with an empty input schema.
    fn t(name: &str, description: &str) -> Tool {
        Tool {
            name: name.into(),
            description: description.into(),
            input_schema: json!({}),
        }
    }

    // A terse, factual description should score near zero.
    #[test]
    fn clean_simple_tool_is_low() {
        let tool = t("add", "Adds two numbers and returns the sum.");
        let s = score_tool(&tool, &[]);
        assert!(s.score < 20, "got {}", s.score);
        assert_eq!(s.band, RisBand::Low);
    }

    #[test]
    fn verbose_polite_tool_is_medium() {
        // ~70 tokens of friendly-but-vague description: no manipulation,
        // no instruction leakage, but ambiguity is sky-high and the length
        // bloat penalty kicks in. Should land in the Medium band.
        let tool = t(
            "summarize",
            "This tool will take in something and produce something appropriate. \
             It handles various things and returns relevant results. Stuff like that, \
             really. Properly formatted, correctly structured, and suitable for downstream use \
             in this or that context. Various things will be considered appropriate by it, \
             and stuff that is relevant to this and that will be properly handled too.",
        );
        let s = score_tool(&tool, &[]);
        // RIS deliberately discounts harmless verbosity: it should be visible
        // (>0) but well under the High band so it doesn't flood CI dashboards.
        assert!(s.score >= 10 && s.score < 60, "got {}", s.score);
    }

    // Descriptions stuffed with reasoning directives trip both the
    // imperative-density and instruction-leakage components.
    #[test]
    fn injected_tool_is_high() {
        let tool = t(
            "calc",
            "You must always think step by step. Be sure to first reason about the input. \
             Never skip the planning phase. Always ensure correctness. Do not deviate.",
        );
        let s = score_tool(&tool, &[]);
        assert!(s.score >= 60, "got {}", s.score);
    }

    // Two distinct-named tools with identical descriptions should each see a
    // nonzero overlap penalty (Jaccard = 1.0 against the other).
    #[test]
    fn overlap_penalty_kicks_in_on_duplicates() {
        let a = t("a", "Search the database for matching records by name and email.");
        let b = t("b", "Search the database for matching records by name and email.");
        let peers = vec![&a, &b];
        let sa = score_tool(&a, &peers);
        let sb = score_tool(&b, &peers);
        assert!(sa.breakdown.overlap_penalty > 0.0);
        assert!(sb.breakdown.overlap_penalty > 0.0);
    }

    // "Delete" in the description with no delete-ish fingerprint in the
    // schema should register as a schema mismatch.
    #[test]
    fn schema_mismatch_detected() {
        let tool = Tool {
            name: "delete_user".into(),
            description: "Delete a user from the directory.".into(),
            input_schema: json!({"type":"object","properties":{"name":{"type":"string"}}}),
        };
        let s = score_tool(&tool, &[]);
        assert!(s.breakdown.schema_mismatch > 0.0);
    }
}