Skip to main content

innate_core/
refine.rs

1use crate::errors::Result;
2use crate::utils::{sanitize, SanitizeAction};
3use serde_json::Value;
4
// ---------------------------------------------------------------------------
// Sanitizer — injectable content sanitizer (§2.6)
// ---------------------------------------------------------------------------
8
9/// Replaceable sanitizer. Inject via `KnowledgeBase::open_with`.
10/// Default: `DefaultSanitizer` (wraps built-in heuristics).
11pub trait Sanitizer: Send + Sync {
12    fn sanitize(&self, content: &str) -> (String, SanitizeAction);
13}
14
15/// Built-in sanitizer — wraps `utils::sanitize()`.
16pub struct DefaultSanitizer;
17
18impl Sanitizer for DefaultSanitizer {
19    fn sanitize(&self, content: &str) -> (String, SanitizeAction) {
20        sanitize(content)
21    }
22}
23
24/// No-op sanitizer — passes content through unchanged (use to disable sanitization).
25pub struct NoopSanitizer;
26
27impl Sanitizer for NoopSanitizer {
28    fn sanitize(&self, content: &str) -> (String, SanitizeAction) {
29        (content.to_string(), SanitizeAction::Allow)
30    }
31}
32
33// ---------------------------------------------------------------------------
34// Refiner — online trim / adapt
35// ---------------------------------------------------------------------------
36
37/// Online refiner — trims or adapts recalled chunks.
38pub trait Refiner: Send + Sync {
39    fn refine(&self, chunks: Vec<Value>, budget_tokens: Option<usize>) -> Result<Vec<Value>>;
40
41    /// Trim a block to fit within `budget_tokens` given the active `query`.
42    /// Returns `None` if trimming is not supported or the block cannot be trimmed while
43    /// preserving hard-dep closure integrity.
44    fn trim(&self, _block: &[Value], _query: &str, _budget_tokens: usize) -> Option<Vec<Value>> {
45        None
46    }
47}
48
49/// No-op refiner (default): returns chunks unchanged, trim is unsupported.
50pub struct NullRefiner;
51
52impl Refiner for NullRefiner {
53    fn refine(&self, chunks: Vec<Value>, _budget: Option<usize>) -> Result<Vec<Value>> {
54        Ok(chunks)
55    }
56}
57
58/// Distiller — episodic logs → zero or more pending chunks per input log.
59pub trait Distiller: Send + Sync {
60    fn distill(&self, log_entries: &[Value]) -> Result<Vec<DistilledChunk>>;
61
62    fn distill_with_context(
63        &self,
64        primary: &Value,
65        _related_logs: &[Value],
66    ) -> Result<Vec<DistilledChunk>> {
67        self.distill(std::slice::from_ref(primary))
68    }
69
70    fn provenance(&self) -> DistillProvenance {
71        DistillProvenance::default()
72    }
73}
74
/// Describes which distiller produced a set of chunks.
///
/// Every field is optional; the derived `Default` leaves all of them `None`.
#[derive(Clone, Debug, Default)]
pub struct DistillProvenance {
    /// Name of the provider behind the distiller (e.g. `"heuristic"`), if any.
    pub provider: Option<String>,
    /// Model identifier, if one was involved.
    pub model: Option<String>,
    /// Version tag of the prompt or rule set used, if any.
    pub prompt_version: Option<String>,
}
81
/// A pending chunk produced by a distiller from a single log entry.
#[derive(Clone, Debug)]
pub struct DistilledChunk {
    /// Text of the chunk.
    pub content: String,
    /// Short description of when the chunk should be recalled, if available.
    pub trigger_desc: Option<String>,
    /// Short description of when the chunk should NOT be recalled, if available.
    pub anti_trigger_desc: Option<String>,
    /// Identifier of the log entry the chunk was derived from.
    pub source_log_id: String,
    /// Nomination text attached to the source log, if any.
    pub nomination: Option<String>,
}
90
91/// Heuristic distiller: extracts chunks from log output / nomination fields.
92pub struct HeuristicDistiller;
93
94impl Distiller for HeuristicDistiller {
95    fn distill(&self, log_entries: &[Value]) -> Result<Vec<DistilledChunk>> {
96        let mut out = Vec::new();
97        for entry in log_entries {
98            let id = entry["id"].as_str().unwrap_or("").to_string();
99            let nomination = entry["nomination"].as_str();
100            let text = nomination.or_else(|| entry["output_summary"].as_str());
101            if let Some(t) = text {
102                let t = t.trim();
103                if !t.is_empty() {
104                    let query = entry["query"].as_str().map(str::trim).unwrap_or("");
105                    let outcome = entry["outcome"].as_str().unwrap_or("");
106
107                    // Use query as trigger_desc for embedding — it caused this log and
108                    // gives the chunk a useful retrieval signal without baking the query
109                    // into the content (which creates retrieval-overfit chunks).
110                    let trigger_desc = entry["query"]
111                        .as_str()
112                        .map(|q| q.trim().chars().take(80).collect::<String>())
113                        .filter(|q| !q.is_empty())
114                        .or_else(|| {
115                            t.lines()
116                                .map(str::trim)
117                                .find(|l| l.len() > 10)
118                                .map(|l| l.chars().take(80).collect())
119                        });
120
121                    // Keep content query-agnostic so the chunk is reusable across
122                    // similar but not identical queries. Nominations are preserved as-is.
123                    let content = if nomination.is_some() {
124                        t.to_string()
125                    } else if outcome == "fail" {
126                        format!("Avoid: {t}")
127                    } else {
128                        t.to_string()
129                    };
130
131                    // For failed tasks, discourage re-triggering in the same query context.
132                    let anti_trigger_desc = if outcome == "fail" && !query.is_empty() {
133                        Some(query.chars().take(60).collect::<String>())
134                    } else {
135                        None
136                    };
137
138                    out.push(DistilledChunk {
139                        content,
140                        trigger_desc,
141                        anti_trigger_desc,
142                        source_log_id: id,
143                        nomination: entry["nomination"].as_str().map(str::to_string),
144                    });
145                }
146            }
147        }
148        Ok(out)
149    }
150
151    fn provenance(&self) -> DistillProvenance {
152        DistillProvenance {
153            provider: Some("heuristic".to_string()),
154            model: None,
155            prompt_version: Some("2".to_string()),
156        }
157    }
158}