use crate::core::{Node, SourceType};
use crate::llm;
use futures::{stream, StreamExt};
use semchunk_rs::Chunker;
use std::sync::OnceLock;
use std::time::Instant;
use tiktoken_rs::{cl100k_base, CoreBPE};
/// A single piece of distilled knowledge produced from a transcript (or from
/// one chunk of a transcript) before it is turned into a graph `Node`.
#[derive(Debug, Clone)]
pub struct ExtractedInsight {
/// Distilled text; forwarded as the content argument of `Node::new` by
/// `into_node`.
pub content: String,
/// Confidence score forwarded to `Node::new`. `extract_knowledge` in this
/// file always sets it to `1.0`.
pub confidence: f32,
}
impl ExtractedInsight {
    /// Consumes the insight and builds a [`Node`] from it.
    ///
    /// The insight supplies the node's content and confidence; `source`,
    /// `source_type` and `embedding` are passed through to `Node::new`
    /// unchanged.
    pub fn into_node(self, source: String, source_type: SourceType, embedding: Vec<f32>) -> Node {
        let Self {
            content,
            confidence,
        } = self;
        Node::new(content, source, source_type, embedding, confidence)
    }
}
/// Minimum length of a trimmed distillation for it to be kept. It is compared
/// against `str::len()`, i.e. a byte count, even though log messages say "chars".
const MIN_CONTENT_LEN: usize = 50;
/// Transcripts whose token count is at or below this value are distilled with
/// a single LLM call; larger ones are split into chunks first.
const CHUNK_THRESHOLD_TOKENS: usize = 12000;
/// Chunk size limit (in tokens, as measured by `count_tokens`) handed to the
/// `Chunker` when splitting a transcript.
const MAX_CHUNK_TOKENS: usize = 10000;
/// Upper bound on the number of chunk distillation calls in flight at once.
const PARALLEL_DISTILLATIONS: usize = 4;
/// Process-wide tokenizer instance, initialized on first use by [`get_bpe`].
static BPE: OnceLock<CoreBPE> = OnceLock::new();

/// Returns the shared `cl100k_base` tokenizer, constructing it on first call.
///
/// # Panics
///
/// Panics if the tokenizer cannot be loaded.
fn get_bpe() -> &'static CoreBPE {
    /// Builds the tokenizer; only invoked when `BPE` is still empty.
    fn load() -> CoreBPE {
        cl100k_base().expect("Failed to load cl100k_base tokenizer")
    }

    BPE.get_or_init(load)
}
/// Returns the number of tokens `text` encodes to with the shared tokenizer
/// (via `encode_with_special_tokens`).
fn count_tokens(text: &str) -> usize {
    let tokens = get_bpe().encode_with_special_tokens(text);
    tokens.len()
}
/// Splits `transcript` into pieces using a `Chunker` configured with
/// `MAX_CHUNK_TOKENS` as its size limit and `count_tokens` as its measure.
fn chunk_transcript(transcript: &str) -> Vec<String> {
    Chunker::new(MAX_CHUNK_TOKENS, Box::new(count_tokens)).chunk(transcript)
}
/// Outcome of [`extract_knowledge`].
///
/// Derives `Debug` and `Clone` so it can be logged, inspected in test
/// failures, and copied, matching the traits already derived by
/// [`ExtractedInsight`].
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Distillations that passed the minimum-length check, in transcript order.
    pub insights: Vec<ExtractedInsight>,
    /// Number of transcript pieces sent for distillation: `0` for a blank
    /// transcript, `1` for the single-call path, otherwise the number of
    /// chunks produced (including chunks whose distillation was dropped).
    pub chunks_used: usize,
}
/// Distills a session transcript into zero or more [`ExtractedInsight`]s.
///
/// Behavior by input size:
///
/// * Blank transcript (empty or whitespace only): returns no insights and
///   `chunks_used == 0` without calling the LLM.
/// * At most `CHUNK_THRESHOLD_TOKENS` tokens: one LLM call over the whole
///   transcript, `chunks_used == 1`.
/// * Larger: the transcript is split by `chunk_transcript`, each chunk is
///   distilled with up to `PARALLEL_DISTILLATIONS` calls in flight, and the
///   surviving distillations are returned in chunk order.
///
/// In every path a distillation is stored trimmed, and is discarded (with a
/// note on stderr) when its trimmed length is below `MIN_CONTENT_LEN` bytes.
///
/// # Errors
///
/// Returns the LLM error only in the single-call path. In the chunked path a
/// failed chunk is logged to stderr and skipped, so the call still succeeds
/// (possibly with an empty insight list).
pub async fn extract_knowledge(transcript: &str) -> Result<ExtractionResult, String> {
    if transcript.trim().is_empty() {
        return Ok(ExtractionResult {
            insights: Vec::new(),
            chunks_used: 0,
        });
    }

    let token_count = count_tokens(transcript);
    if token_count <= CHUNK_THRESHOLD_TOKENS {
        let response = call_extraction_llm(transcript).await?;
        let content = response.trim();
        if content.len() < MIN_CONTENT_LEN {
            eprintln!(" Distillation too short ({} chars < {}), skipping", content.len(), MIN_CONTENT_LEN);
            return Ok(ExtractionResult {
                insights: Vec::new(),
                chunks_used: 1,
            });
        }
        return Ok(ExtractionResult {
            insights: vec![ExtractedInsight {
                content: content.to_string(),
                confidence: 1.0,
            }],
            chunks_used: 1,
        });
    }

    let chunks = chunk_transcript(transcript);
    let chunk_count = chunks.len();
    eprintln!(
        " Distilling {} chunks ({} parallel)...",
        chunk_count, PARALLEL_DISTILLATIONS
    );

    let chunk_start = Instant::now();
    // Completion order is arbitrary here, so each result carries its chunk
    // index and the original order is restored by the sort further down.
    let chunk_results: Vec<(usize, Result<String, String>)> = stream::iter(chunks.into_iter().enumerate())
        .map(|(i, chunk)| async move {
            let result = call_extraction_llm(&chunk).await;
            (i, result)
        })
        .buffer_unordered(PARALLEL_DISTILLATIONS)
        .collect()
        .await;
    eprintln!(" Chunk distillation completed in {:.1}s", chunk_start.elapsed().as_secs_f32());

    let mut indexed_distillations: Vec<(usize, String)> = Vec::with_capacity(chunk_results.len());
    let mut dropped_count: usize = 0;
    for (i, result) in chunk_results {
        match result {
            Ok(distillation) => {
                let trimmed = distillation.trim();
                if trimmed.len() >= MIN_CONTENT_LEN {
                    // Store the trimmed text, matching the single-call path,
                    // so insights never carry stray leading/trailing whitespace.
                    indexed_distillations.push((i, trimmed.to_string()));
                } else {
                    dropped_count += 1;
                    eprintln!(" Chunk {} distillation too short ({} chars), skipping", i + 1, trimmed.len());
                }
            }
            Err(e) => {
                dropped_count += 1;
                eprintln!(" Warning: chunk {} failed: {}", i + 1, e);
            }
        }
    }
    if dropped_count > 0 {
        eprintln!(" Dropped {} of {} chunks", dropped_count, chunk_count);
    }

    // Indices are unique, so an unstable sort yields the same order.
    indexed_distillations.sort_unstable_by_key(|(i, _)| *i);
    let insights: Vec<ExtractedInsight> = indexed_distillations
        .into_iter()
        .map(|(_, content)| ExtractedInsight {
            content,
            confidence: 1.0,
        })
        .collect();
    eprintln!(" Extracted {} insights from {} chunks", insights.len(), chunk_count);

    Ok(ExtractionResult {
        insights,
        chunks_used: chunk_count,
    })
}
/// Asks the LLM to distill `transcript` into stored knowledge.
///
/// Sends a fixed system prompt plus the transcript (prefixed with
/// `TRANSCRIPT:` and a newline) to `llm::call_claude` and returns that call's
/// result unchanged.
async fn call_extraction_llm(transcript: &str) -> Result<String, String> {
    // Instructions describing what to extract and how to format it.
    const SYSTEM_PROMPT: &str = r#"You are extracting knowledge from an AI <> Human session transcript for a knowledge graph.
Your output will be stored and retrieved via semantic search in future sessions.
Extract these types of knowledge:
**CONCEPTS** - Ideas, patterns, and principles discussed
- Architectural patterns applied or considered
- Design principles referenced
- Technical concepts explained
**DECISIONS** - Choices made with their rationale
- Why one approach was chosen over another
- Trade-offs considered
- Constraints that influenced choices
**INTENTS** - Goals, desires, and plans expressed
- What the user is trying to achieve
- Future work mentioned
- Success criteria discussed
**ENTITIES** - Significant named things
- Projects, tools, libraries mentioned
- People or teams referenced
- Services or systems involved
**QUESTIONS** - Open/unresolved items
- Things left for later
- Uncertainties acknowledged
- Areas needing investigation
Write a concise summary capturing the key knowledge. Use whatever format (prose, bullets, headers) best fits the content. Each insight should be self-contained and useful without the original transcript.
Skip routine actions (ran tests, fixed typo). Focus on knowledge valuable for future sessions.
If the session has no substantive knowledge, just say so briefly.
IMPORTANT: Keep your response under 2000 words. Be concise and focus."#;

    let user_message = format!("TRANSCRIPT:\n{transcript}");
    llm::call_claude(SYSTEM_PROMPT, &user_message).await
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::EMBEDDING_DIM;

    /// `into_node` must carry every field of the insight and every argument
    /// through to the resulting node.
    #[test]
    fn test_insight_into_node() {
        let zero_embedding = vec![0.0f32; EMBEDDING_DIM];
        let node = ExtractedInsight {
            content: "Test insight content".to_string(),
            confidence: 0.9,
        }
        .into_node("session-123".to_string(), SourceType::Session, zero_embedding);

        assert_eq!(node.content, "Test insight content");
        assert_eq!(node.confidence, 0.9);
        assert_eq!(node.source, "session-123");
        assert!(matches!(node.source_type, SourceType::Session));
        assert_eq!(node.embedding.len(), EMBEDDING_DIM);
    }

    /// Blank transcripts short-circuit: no insights and no chunks used.
    #[tokio::test]
    async fn test_extract_knowledge_empty_transcript() {
        for blank in ["", " "] {
            let outcome = extract_knowledge(blank).await.unwrap();
            assert!(outcome.insights.is_empty());
            assert_eq!(outcome.chunks_used, 0);
        }
    }
}