use crate::cognition::knowledge::KnowledgeIndex;
use crate::cognition::memory::ColdStore;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
/// Selects the most important entries of a cold store and feeds them to a
/// knowledge index, remembering what has already been ingested.
///
/// Importance is computed in `distill` as the entry's `score` multiplied by the
/// sum of smoothed inverse-document-frequency weights of its tokens.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeDistiller {
/// Minimum importance an entry must reach to be eligible for ingestion.
pub threshold: f64,
/// Maximum number of top-ranked entries examined per `distill` call.
pub top_k: usize,
/// Fingerprints (`fnv1a` of the entry text) of texts already handed to the
/// index; used to skip texts that were ingested by an earlier call.
ingested: HashSet<u64>,
/// Running total of the chunk counts returned by the index across all calls.
pub total_ingested: usize,
}
impl KnowledgeDistiller {
    /// Creates a distiller with no ingestion history.
    ///
    /// `threshold` is raised to at least `0.0` and `top_k` to at least `1`.
    pub fn new(threshold: f64, top_k: usize) -> Self {
        let threshold = threshold.max(0.0);
        let top_k = top_k.max(1);
        Self {
            threshold,
            top_k,
            ingested: HashSet::new(),
            total_ingested: 0,
        }
    }

    /// Ranks the entries of `cold` by importance and ingests the best ones into `index`.
    ///
    /// Entries with fewer than four whitespace-separated words are ignored. For the
    /// rest, importance is `score * Σ (ln(n / df) + 1)` over the entry's tokens, where
    /// `n` is the number of entries and `df` the number of entries containing the
    /// token. Entries below `self.threshold` are dropped, the remainder is sorted by
    /// descending importance, and the first `self.top_k` are considered. A text whose
    /// fingerprint was already recorded is skipped.
    ///
    /// Returns the sum of the chunk counts reported by `index.ingest_text` for the
    /// texts ingested during this call; the same amount is added to
    /// `self.total_ingested`.
    pub fn distill(&mut self, cold: &ColdStore, index: &mut KnowledgeIndex) -> usize {
        let entries = cold.all();
        if entries.is_empty() {
            return 0;
        }

        let doc_count = entries.len() as f64;
        let doc_freq = compute_doc_freq(entries.iter().map(|e| e.content.as_str()));
        let min_importance = self.threshold;

        let mut ranked: Vec<(f64, &str)> = Vec::new();
        for entry in entries.iter() {
            let text: &str = entry.content.as_str();
            // Very short entries carry too little signal to be worth scoring.
            if text.split_whitespace().count() < 4 {
                continue;
            }
            let idf_total = tokenise(text)
                .into_iter()
                .map(|term| {
                    // Every term comes from the same corpus, so the fallback of 1
                    // is only a safety net.
                    let freq = doc_freq.get(&term).copied().unwrap_or(1) as f64;
                    (doc_count / freq).ln() + 1.0
                })
                .sum::<f64>();
            let importance = entry.score * idf_total;
            if importance >= min_importance {
                ranked.push((importance, text));
            }
        }

        // Highest importance first; incomparable values are treated as equal.
        ranked.sort_by(|lhs, rhs| rhs.0.partial_cmp(&lhs.0).unwrap_or(Ordering::Equal));
        ranked.truncate(self.top_k);

        let mut new_chunks = 0;
        for (_, text) in ranked {
            // `insert` returns false when the fingerprint is already known.
            if !self.ingested.insert(fnv1a(text)) {
                continue;
            }
            let produced = index.ingest_text("distilled", text);
            new_chunks += produced;
            self.total_ingested += produced;
        }
        new_chunks
    }
}
/// Splits `text` into normalised terms.
///
/// Each whitespace-separated word is stripped of every non-alphanumeric
/// character and lower-cased; words left with fewer than three characters are
/// discarded. Terms are returned in input order, duplicates included.
///
/// Case folding and the length cut-off are both Unicode-aware, matching the
/// Unicode-aware `is_alphanumeric` filter: "École" and "école" yield the same
/// term, and the minimum length counts characters rather than UTF-8 bytes.
fn tokenise(text: &str) -> Vec<String> {
    text.split_whitespace()
        .map(|word| {
            word.chars()
                .filter(|c| c.is_alphanumeric())
                .collect::<String>()
                // Full Unicode lower-casing so non-ASCII words fold like ASCII ones.
                .to_lowercase()
        })
        // Count characters, not bytes: a two-letter accented word is still short.
        .filter(|term| term.chars().count() >= 3)
        .collect()
}
/// Counts, for every term, how many of the given texts contain it.
///
/// Each text is tokenised with `tokenise`; a term occurring several times in
/// the same text contributes only once to that term's count.
fn compute_doc_freq<'a>(texts: impl Iterator<Item = &'a str>) -> HashMap<String, usize> {
    texts.fold(HashMap::new(), |mut counts, doc| {
        // Collapse the document to its distinct terms before counting.
        let distinct: HashSet<String> = tokenise(doc).into_iter().collect();
        for term in distinct {
            *counts.entry(term).or_default() += 1;
        }
        counts
    })
}
/// Computes the 64-bit FNV-1a hash of the UTF-8 bytes of `s`.
///
/// Starting from the offset basis, each byte is XOR-ed into the hash, which is
/// then multiplied by the FNV prime with wrapping arithmetic.
fn fnv1a(s: &str) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    let mut hash = OFFSET_BASIS;
    for &byte in s.as_bytes() {
        hash ^= u64::from(byte);
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash
}