use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Token identifier, as produced by [`TokenizerAdapter::tokenize`] and consumed by
/// [`TokenizerAdapter::detokenize`]. Stored as an unsigned 32-bit integer.
pub type Token = u32;
/// Kind of a conversation segment.
///
/// Serde (de)serializes each variant under its `snake_case` name
/// (e.g. `SystemPrompt` <-> `"system_prompt"`); the `Display` and `FromStr`
/// impls in this file use the same spellings.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentType {
SystemPrompt,
Context,
UserTurn,
AssistantTurn,
ToolCall,
ToolResult,
Continuation,
}
/// Renders a [`SegmentType`] as its `snake_case` name (e.g. `"tool_call"`).
impl std::fmt::Display for SegmentType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each variant has a fixed label; it is written verbatim, so width and
        // padding flags on the formatter are not applied.
        f.write_str(match self {
            Self::SystemPrompt => "system_prompt",
            Self::Context => "context",
            Self::UserTurn => "user_turn",
            Self::AssistantTurn => "assistant_turn",
            Self::ToolCall => "tool_call",
            Self::ToolResult => "tool_result",
            Self::Continuation => "continuation",
        })
    }
}
/// Parses the `snake_case` name of a [`SegmentType`]; matching is exact and case-sensitive.
impl std::str::FromStr for SegmentType {
    type Err = String;

    /// Returns the matching variant, or `Err("unknown segment type: <input>")`
    /// when `s` is not one of the known names.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let parsed = match s {
            "system_prompt" => Self::SystemPrompt,
            "context" => Self::Context,
            "user_turn" => Self::UserTurn,
            "assistant_turn" => Self::AssistantTurn,
            "tool_call" => Self::ToolCall,
            "tool_result" => Self::ToolResult,
            "continuation" => Self::Continuation,
            unknown => return Err(format!("unknown segment type: {unknown}")),
        };
        Ok(parsed)
    }
}
/// A typed run of tokens with optional free-form JSON metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Segment {
/// Kind of segment these tokens belong to.
pub segment_type: SegmentType,
/// Token ids making up the segment.
pub tokens: Vec<Token>,
/// Optional string-keyed JSON values attached to the segment.
pub metadata: Option<HashMap<String, serde_json::Value>>,
}
/// Newtype around the `String` form of a segment hash.
///
/// NOTE(review): the hash algorithm and its string encoding are not visible in
/// this file — confirm against the code that constructs these values.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentHash(pub String);
impl std::fmt::Display for SegmentHash {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
/// Reference to a segment by hash, together with its type, token count and position.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentRef {
/// Kind of the referenced segment.
pub segment_type: SegmentType,
/// Hash identifying the referenced segment.
pub hash: SegmentHash,
/// Number of tokens in the referenced segment.
pub token_count: u32,
/// NOTE(review): presumably the segment's index within its conversation — confirm.
pub position: u32,
}
/// Manifest schema version; also the value serde fills in for
/// `ConversationManifest::schema_version` when the field is absent from the input.
pub const MANIFEST_SCHEMA_VERSION: u32 = 1;
/// Manifest describing a conversation as a list of segment references plus
/// identifying information (id, model, tokenizer, creation time).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConversationManifest {
/// Schema version of this manifest. When missing from the serialized input,
/// deserialization falls back to `MANIFEST_SCHEMA_VERSION`.
#[serde(default = "default_manifest_schema_version")]
pub schema_version: u32,
/// Identifier of the conversation.
pub id: String,
/// Optional application label.
pub application: Option<String>,
/// Model name.
pub model: String,
/// Tokenizer name.
pub tokenizer: String,
/// NOTE(review): presumably the sum of `token_count` over `segments` — confirm.
pub total_tokens: u64,
/// References to the segments that make up the conversation.
pub segments: Vec<SegmentRef>,
/// Creation timestamp (UTC).
pub created_at: chrono::DateTime<chrono::Utc>,
/// Optional string-keyed JSON values attached to the conversation.
pub metadata: Option<HashMap<String, serde_json::Value>>,
}
/// Serde default provider for `ConversationManifest::schema_version`
/// (referenced by name in that field's `#[serde(default = "...")]` attribute).
/// Returns [`MANIFEST_SCHEMA_VERSION`].
fn default_manifest_schema_version() -> u32 {
MANIFEST_SCHEMA_VERSION
}
/// A conversation expressed as a list of messages, with the model and
/// tokenizer names it is associated with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Conversation {
/// Optional conversation identifier.
/// NOTE(review): how a missing id is handled is not visible in this file.
pub id: Option<String>,
/// Optional application label.
pub application: Option<String>,
/// Model name.
pub model: String,
/// Tokenizer name.
pub tokenizer: String,
/// Messages of the conversation.
pub messages: Vec<Message>,
/// Optional string-keyed JSON values attached to the conversation.
pub metadata: Option<HashMap<String, serde_json::Value>>,
}
/// One message of a [`Conversation`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Message {
/// Role label of the message.
/// NOTE(review): the set of accepted role strings is not defined in this file.
pub role: String,
/// Message payload, either text or pre-tokenized ids.
pub content: MessageContent,
/// Optional name associated with the message.
pub name: Option<String>,
/// Optional tool-call identifier.
/// NOTE(review): presumably links a tool result to its call — confirm.
pub tool_call_id: Option<String>,
}
/// Message payload: either raw text or a list of token ids.
///
/// The enum is `#[serde(untagged)]`, so no variant tag is written; on
/// deserialization serde tries `Text` first, then `Tokens`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MessageContent {
/// Content given as a string.
Text(String),
/// Content given as token ids.
Tokens(Vec<Token>),
}
impl MessageContent {
    /// Borrows the token ids when the content is [`MessageContent::Tokens`];
    /// returns `None` for [`MessageContent::Text`].
    pub fn as_tokens(&self) -> Option<&[Token]> {
        match self {
            Self::Text(_) => None,
            Self::Tokens(ids) => Some(ids.as_slice()),
        }
    }
}
/// A segment record holding its payload as bytes plus bookkeeping fields.
///
/// Unlike most types in this file it does not derive `Serialize`/`Deserialize`.
#[derive(Debug, Clone)]
pub struct StoredSegment {
/// Hash identifying the segment.
pub hash: SegmentHash,
/// Kind of the segment.
pub segment_type: SegmentType,
/// Tokenizer name.
pub tokenizer: String,
/// Number of tokens in the segment.
pub token_count: u32,
/// Payload bytes.
/// NOTE(review): the codec is not visible here, nor whether the bytes are
/// always compressed (see `StowkenConfig::enable_compression`) — confirm.
pub compressed_data: Vec<u8>,
/// NOTE(review): presumably the uncompressed payload size in bytes — confirm.
pub raw_size: u32,
/// NOTE(review): presumably `compressed_data.len()` in bytes — confirm.
pub compressed_size: u32,
/// NOTE(review): presumably the number of references to this segment — confirm.
pub ref_count: u64,
/// Creation timestamp (UTC).
pub created_at: chrono::DateTime<chrono::Utc>,
}
/// Configuration options. See the `Default` impl for the default values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StowkenConfig {
/// Whether compression is enabled; `true` by default.
pub enable_compression: bool,
/// Optional near-duplicate threshold; `None` by default.
/// NOTE(review): presumably a similarity cutoff, with `None` disabling
/// near-duplicate handling — confirm range and semantics with the consumer.
pub near_dedup_threshold: Option<f64>,
}
impl Default for StowkenConfig {
fn default() -> Self {
Self {
enable_compression: true,
near_dedup_threshold: None,
}
}
}
/// Result summary carrying an id plus segment and size counters.
///
/// NOTE(review): presumably returned after storing a conversation; how
/// `bytes_saved` and `compression_ratio` are computed (and which direction the
/// ratio runs) is not visible in this file — confirm with the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoreResult {
pub id: String,
pub total_segments: u64,
pub new_segments: u64,
pub deduped_segments: u64,
pub bytes_saved: u64,
pub compression_ratio: f64,
}
/// A conversation manifest together with the segments retrieved for it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedConversation {
/// Manifest of the conversation.
pub manifest: ConversationManifest,
/// Retrieved segments, including their tokens.
pub segments: Vec<RetrievedSegment>,
}
/// Textual form of a conversation: one combined string plus per-turn text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConversationText {
/// Identifier of the conversation.
pub conversation_id: String,
/// Model name.
pub model: String,
/// Optional application label.
pub application: Option<String>,
/// Text of the conversation as a single string.
/// NOTE(review): how turns are joined into this string is not visible here.
pub text: String,
/// Text split by turn.
pub turns: Vec<ConversationTurn>,
/// Creation timestamp (UTC).
pub created_at: chrono::DateTime<chrono::Utc>,
}
/// One turn of a [`ConversationText`]: a role label and the turn's text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConversationTurn {
/// Role label of the turn.
pub role: String,
/// Text of the turn.
pub text: String,
}
/// A segment with its tokens, as carried in a [`RetrievedConversation`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedSegment {
/// Kind of the segment.
pub segment_type: SegmentType,
/// Hash identifying the segment.
pub hash: SegmentHash,
/// Token ids of the segment.
pub tokens: Vec<Token>,
/// Number of tokens.
/// NOTE(review): presumably equals `tokens.len()` — confirm.
pub token_count: u32,
/// NOTE(review): presumably the segment's index within the conversation — confirm.
pub position: u32,
}
/// Aggregate counters and ratios for tokens, conversations, segments and storage.
///
/// NOTE(review): the formulas behind `dedup_ratio`, `compression_ratio` and
/// `savings_percentage` (and whether the percentage is 0-100 or 0-1) are not
/// visible in this file — confirm with the code that fills this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenUsageStats {
pub total_tokens: u64,
pub total_conversations: u64,
pub unique_segments: u64,
pub total_segments: u64,
pub dedup_ratio: f64,
pub compression_ratio: f64,
pub storage_bytes: u64,
pub naive_bytes: u64,
pub savings_percentage: f64,
}
/// Counters and ratios for a single [`SegmentType`].
///
/// NOTE(review): how `dedup_ratio` and `avg_token_count` are derived from the
/// other counters is not visible in this file — confirm with the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentTypeStats {
/// Segment type these statistics describe.
pub segment_type: SegmentType,
pub unique_count: u64,
pub total_references: u64,
pub dedup_ratio: f64,
pub avg_token_count: f64,
pub total_tokens: u64,
pub compressed_bytes: u64,
}
/// Optional filters and grouping for an analytics query.
///
/// Every field is an `Option`, and the derived `Default` sets them all to `None`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AnalyticsQuery {
/// Optional model name filter.
pub model: Option<String>,
/// Optional application filter.
pub application: Option<String>,
/// Optional segment type filter.
pub segment_type: Option<SegmentType>,
/// Optional start of the date range (UTC).
/// NOTE(review): inclusive/exclusive bounds are not visible here.
pub date_from: Option<chrono::DateTime<chrono::Utc>>,
/// Optional end of the date range (UTC).
pub date_to: Option<chrono::DateTime<chrono::Utc>>,
/// Optional grouping keys.
/// NOTE(review): the accepted key strings are not defined in this file.
pub group_by: Option<Vec<String>>,
}
/// Options for an export: output format, inclusion flags and optional filters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportConfig {
/// Output format.
pub format: ExportFormat,
/// Whether system prompts are included.
pub include_system_prompts: bool,
/// Whether context is included.
pub include_context: bool,
/// Whether pairs are deduplicated.
/// NOTE(review): what constitutes a "pair" is not defined in this file.
pub deduplicate_pairs: bool,
/// Optional tokenizer name.
pub tokenizer: Option<String>,
/// Optional model name.
pub model: Option<String>,
/// Optional application label.
pub application: Option<String>,
/// Optional upper bound on the number of conversations.
pub max_conversations: Option<u64>,
}
/// Supported export formats.
///
/// No `rename_all` is applied here, so serde uses the variant names exactly as
/// written (`"Jsonl"`, `"HuggingFace"`, `"Parquet"`), unlike the `snake_case`
/// enums elsewhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExportFormat {
Jsonl,
HuggingFace,
Parquet,
}
/// Counters describing an export: pair counts and exported tokens.
///
/// NOTE(review): what constitutes a "pair" is not defined in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportStats {
pub total_pairs: u64,
pub unique_pairs: u64,
pub tokens_exported: u64,
}
/// Summary of a system prompt: its segment hash, token count and reference count.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemPromptInfo {
/// Hash identifying the segment.
pub hash: SegmentHash,
/// Number of tokens in the segment.
pub token_count: u32,
/// NOTE(review): presumably the number of references to this segment — confirm.
pub ref_count: u64,
}
/// Counters describing a reindex run.
///
/// NOTE(review): what makes a segment count as "missing" is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReindexStats {
pub conversations_indexed: u64,
pub unique_segments_indexed: u64,
pub segments_missing: u64,
}
/// Counters describing a substring compaction run.
///
/// NOTE(review): the compaction procedure and skip criteria are not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubstringCompactStats {
pub segments_examined: u64,
pub segments_rewritten: u64,
pub bytes_saved: u64,
pub segments_skipped: u64,
}
/// Counters describing a substring garbage-collection run: registry size
/// before and after, and the number of substrings dropped.
///
/// NOTE(review): the unit of the registry sizes (entries vs. bytes) is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubstringGcStats {
pub registry_size_before: u64,
pub registry_size_after: u64,
pub substrings_dropped: u64,
}
/// A group of near-duplicate segments: one canonical segment plus its variants.
///
/// NOTE(review): how the canonical member is chosen is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearDuplicateCluster {
/// Hash of the canonical segment.
pub canonical: SegmentHash,
/// Token count of the canonical segment.
pub canonical_token_count: u32,
/// Reference count of the canonical segment.
pub canonical_ref_count: u64,
/// The other members of the cluster.
pub variants: Vec<NearDuplicateVariant>,
}
/// One non-canonical member of a [`NearDuplicateCluster`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearDuplicateVariant {
/// Hash identifying the variant segment.
pub hash: SegmentHash,
/// Similarity score.
/// NOTE(review): presumably relative to the cluster's canonical segment;
/// the metric and range are not visible here — confirm.
pub similarity: f64,
/// Number of tokens in the variant.
pub token_count: u32,
/// Reference count of the variant.
pub ref_count: u64,
}
/// Report entry for a segment, with its reference count and a wasted-bytes figure.
///
/// NOTE(review): how `wasted_bytes` is computed is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateSegment {
/// Hash identifying the segment.
pub hash: SegmentHash,
/// Kind of the segment.
pub segment_type: SegmentType,
/// Number of tokens in the segment.
pub token_count: u32,
pub ref_count: u64,
pub wasted_bytes: u64,
}
/// Pluggable tokenizer interface. Implementors must be `Send + Sync`.
pub trait TokenizerAdapter: Send + Sync {
/// Converts `text` into token ids.
fn tokenize(&self, text: &str) -> Vec<Token>;
/// Converts token ids back into a string.
/// NOTE(review): whether this round-trips exactly with `tokenize` depends on the implementor.
fn detokenize(&self, tokens: &[Token]) -> String;
/// Size of the tokenizer's vocabulary.
fn vocab_size(&self) -> u32;
/// Name of the tokenizer.
/// NOTE(review): presumably matched against the `tokenizer` string fields in this file — confirm.
fn name(&self) -> &str;
}
#[cfg(feature = "semantic-search")]
mod semantic {
use super::*;
use std::sync::Arc;
/// Pluggable text-embedding interface. Implementors must be `Send + Sync`.
pub trait EmbeddingAdapter: Send + Sync {
/// Embeds a batch of texts, returning vectors of `f32` or an error message.
/// NOTE(review): presumably one vector per input, in input order, each of
/// length `dimension()` — confirm with implementors.
fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, String>;
/// Dimension of the embedding vectors.
fn dimension(&self) -> usize;
/// Name of the embedding model.
fn model_name(&self) -> &str;
}
/// Pluggable summarizer interface. Implementors must be `Send + Sync`.
pub trait SummarizerAdapter: Send + Sync {
/// Produces a summary of `conversation_text`, or an error message.
fn summarize(&self, conversation_text: &str) -> Result<String, String>;
/// Name of the summarizer model; `SummaryStrategy::id` embeds it as `llm:<name>`.
fn model_name(&self) -> &str;
}
/// How a conversation summary is produced.
///
/// `Debug` is implemented by hand in this file because the adapter trait
/// object does not implement `Debug`.
#[derive(Clone)]
pub enum SummaryStrategy {
/// Strategy parameterised by a character limit.
/// NOTE(review): presumably concatenates the conversation text and truncates
/// it to `max_chars` — the implementation is not in this file.
ConcatTruncate { max_chars: usize },
/// Strategy backed by a shared [`SummarizerAdapter`].
LlmGenerated(Arc<dyn SummarizerAdapter>),
}
impl Default for SummaryStrategy {
fn default() -> Self {
Self::ConcatTruncate { max_chars: 24_000 }
}
}
impl SummaryStrategy {
    /// Short identifier for the strategy.
    ///
    /// Returns `"concat"` for [`SummaryStrategy::ConcatTruncate`] regardless of
    /// `max_chars`, and `"llm:<model name>"` for [`SummaryStrategy::LlmGenerated`].
    pub fn id(&self) -> String {
        match self {
            Self::LlmGenerated(adapter) => {
                let mut label = String::from("llm:");
                label.push_str(adapter.model_name());
                label
            }
            Self::ConcatTruncate { .. } => String::from("concat"),
        }
    }
}
/// Hand-written `Debug`: the adapter trait object is not `Debug`, so the
/// `LlmGenerated` variant is shown through its model name instead.
impl std::fmt::Debug for SummaryStrategy {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::LlmGenerated(adapter) => {
                let model = adapter.model_name();
                let mut out = f.debug_struct("LlmGenerated");
                out.field("model", &model);
                out.finish()
            }
            Self::ConcatTruncate { max_chars } => {
                let mut out = f.debug_struct("ConcatTruncate");
                out.field("max_chars", max_chars);
                out.finish()
            }
        }
    }
}
/// Level at which a semantic search matches: individual segments, whole
/// conversations, or both.
///
/// The textual names are `"segment"`, `"conversation"` and `"both"`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SearchGranularity {
    Segment,
    Conversation,
    Both,
}

impl Default for SearchGranularity {
    /// Defaults to [`SearchGranularity::Both`].
    fn default() -> Self {
        SearchGranularity::Both
    }
}

impl std::str::FromStr for SearchGranularity {
    type Err = String;

    /// Parses one of the lowercase names; matching is exact and case-sensitive.
    /// Any other input yields `Err("unknown granularity: <input>")`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        const NAMES: [(&str, SearchGranularity); 3] = [
            ("segment", SearchGranularity::Segment),
            ("conversation", SearchGranularity::Conversation),
            ("both", SearchGranularity::Both),
        ];
        NAMES
            .iter()
            .find(|(name, _)| *name == s)
            .map(|&(_, granularity)| granularity)
            .ok_or_else(|| format!("unknown granularity: {s}"))
    }
}
/// Parameters of a semantic search: the query text, optional filters, a result
/// limit and a minimum score.
///
/// When deserializing, only `granularity` has a serde default; `limit` and
/// `min_score` must be present in the input. `SemanticSearchQuery::new`
/// supplies values for them when building a query in code.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticSearchQuery {
/// Query text.
pub text: String,
/// Search granularity; falls back to `SearchGranularity::default()` (`Both`)
/// when absent from the serialized input.
#[serde(default)]
pub granularity: SearchGranularity,
/// Optional model name filter.
pub model: Option<String>,
/// Optional application filter.
pub application: Option<String>,
/// Optional segment type filter.
pub segment_type: Option<SegmentType>,
/// Optional start of the date range (UTC).
pub date_from: Option<chrono::DateTime<chrono::Utc>>,
/// Optional end of the date range (UTC).
pub date_to: Option<chrono::DateTime<chrono::Utc>>,
/// Maximum number of results.
pub limit: usize,
/// Minimum score for a hit.
/// NOTE(review): score range and whether the bound is inclusive are not visible here.
pub min_score: f32,
}
impl SemanticSearchQuery {
pub fn new(text: impl Into<String>) -> Self {
Self {
text: text.into(),
granularity: SearchGranularity::Both,
model: None,
application: None,
segment_type: None,
date_from: None,
date_to: None,
limit: 10,
min_score: 0.0,
}
}
}
impl Serialize for SearchGranularity {
fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
let v = match self {
Self::Segment => "segment",
Self::Conversation => "conversation",
Self::Both => "both",
};
s.serialize_str(v)
}
}
impl<'de> Deserialize<'de> for SearchGranularity {
fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
let s = String::deserialize(d)?;
s.parse().map_err(serde::de::Error::custom)
}
}
/// One result of a semantic search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticSearchHit {
/// Identifier of the matching conversation.
pub conversation_id: String,
/// Score of the match.
/// NOTE(review): metric and range are not visible in this file.
pub score: f32,
/// Whether the match came from a segment or from the conversation as a whole.
pub matched_via: MatchedVia,
/// Optional application label.
pub application: Option<String>,
/// Model name.
pub model: String,
/// Creation timestamp (UTC).
pub created_at: chrono::DateTime<chrono::Utc>,
}
/// What a [`SemanticSearchHit`] matched on.
///
/// Serialized as an internally tagged enum: the variant name goes in a
/// `"kind"` field in `snake_case` (`"segment"` or `"conversation"`), alongside
/// the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum MatchedVia {
/// The hit matched a specific segment, identified by hash and type.
Segment {
hash: SegmentHash,
segment_type: SegmentType,
},
/// The hit matched at conversation level.
Conversation,
}
/// Counters describing an embedding run, plus the names of the embedding
/// model and summary strategy.
///
/// NOTE(review): the criteria for "skipped" vs. "already done" are not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbedStats {
pub segments_embedded: u64,
pub segments_skipped: u64,
pub segments_already_done: u64,
pub conversations_embedded: u64,
pub conversations_already_done: u64,
/// Name of the embedding model.
pub embedding_model: String,
/// Summary strategy identifier.
/// NOTE(review): presumably the value of `SummaryStrategy::id` — confirm.
pub summary_strategy: String,
}
/// A cluster of conversations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConversationCluster {
/// Identifier of the cluster.
pub cluster_id: u32,
/// Size of the cluster.
/// NOTE(review): presumably equals `members.len()` — confirm.
pub size: usize,
/// NOTE(review): presumably conversation ids chosen as representative of the
/// cluster; the selection rule is not visible here.
pub representative_ids: Vec<String>,
/// NOTE(review): presumably the conversation ids belonging to the cluster — confirm.
pub members: Vec<String>,
}
/// A conversation reported as an outlier, with its isolation score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutlierConversation {
/// Identifier of the conversation.
pub conversation_id: String,
/// Isolation score.
/// NOTE(review): how it is computed and whether higher means more isolated
/// are not visible in this file.
pub isolation_score: f32,
/// Optional application label.
pub application: Option<String>,
/// Model name.
pub model: String,
/// Creation timestamp (UTC).
pub created_at: chrono::DateTime<chrono::Utc>,
}
}
#[cfg(feature = "semantic-search")]
pub use semantic::*;