// datasphere 0.1.0
//
// Background daemon that distills knowledge from Claude Code sessions into a searchable graph.
//
use chrono::{DateTime, Utc};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use crate::transcript::types::TranscriptEntry;

/// Failure modes encountered while reading a transcript file.
#[derive(Debug)]
pub enum TranscriptError {
    /// The transcript file could not be opened or read.
    IoError(std::io::Error),
}

impl std::fmt::Display for TranscriptError {
    /// Renders the error as `IO error: <inner error>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::IoError(inner) => f.write_fmt(format_args!("IO error: {}", inner)),
        }
    }
}

// The default `Error` methods are sufficient: the inner error is already part of
// the `Display` output above.
impl std::error::Error for TranscriptError {}

impl From<std::io::Error> for TranscriptError {
    /// Wraps an I/O error so `?` can be used on `std::io` results.
    fn from(source: std::io::Error) -> Self {
        Self::IoError(source)
    }
}

/// Read and parse a transcript JSONL file.
///
/// Every non-blank line is parsed as one [`TranscriptEntry`]. A line that cannot be
/// used — because it is not valid UTF-8 or because its JSON does not deserialize —
/// is reported on stderr and skipped rather than failing the whole read.
///
/// # Errors
///
/// Returns [`TranscriptError::IoError`] if the file cannot be opened or if an I/O
/// error occurs while reading from it.
pub fn read_transcript(path: &Path) -> Result<Vec<TranscriptEntry>, TranscriptError> {
    let file = File::open(path)?;
    let mut reader = BufReader::new(file);
    let mut entries = Vec::new();

    // Lines are read as raw bytes rather than through `BufRead::lines()`, because
    // `lines()` yields an `Err` for a line that is not valid UTF-8 and propagating
    // that would abort the whole read instead of skipping the one bad line.
    let mut raw_line = Vec::new();
    let mut line_num = 0usize;

    loop {
        raw_line.clear();
        if reader.read_until(b'\n', &mut raw_line)? == 0 {
            break; // end of file
        }
        line_num += 1;

        let line = match std::str::from_utf8(&raw_line) {
            Ok(text) => text,
            Err(e) => {
                eprintln!(
                    "Warning: skipping malformed line {} in transcript: {}",
                    line_num, e
                );
                continue;
            }
        };

        if line.trim().is_empty() {
            continue;
        }

        // The line still carries its trailing newline; JSON parsing tolerates
        // trailing whitespace, so no explicit stripping is needed.
        match serde_json::from_str::<TranscriptEntry>(line) {
            Ok(entry) => entries.push(entry),
            Err(e) => {
                // Log warning but continue - don't fail on malformed lines
                eprintln!(
                    "Warning: skipping malformed line {} in transcript: {}",
                    line_num, e
                );
            }
        }
    }

    Ok(entries)
}

/// Select message and summary entries whose timestamp lies in `[start, end)`.
///
/// When `session_id` is `Some`, only entries belonging to that session are kept.
/// Entries with no timestamp, or with a timestamp that does not parse as RFC 3339,
/// are excluded.
pub fn get_messages_in_window<'a>(
    entries: &'a [TranscriptEntry],
    start: DateTime<Utc>,
    end: DateTime<Utc>,
    session_id: Option<&str>,
) -> Vec<&'a TranscriptEntry> {
    let mut selected = Vec::new();

    for entry in entries {
        // Only conversational content is of interest.
        if !(entry.is_message() || entry.is_summary()) {
            continue;
        }

        // Optional session restriction.
        if let Some(wanted) = session_id {
            if entry.session_id() != Some(wanted) {
                continue;
            }
        }

        // Half-open window: start inclusive, end exclusive.
        let in_window = match entry.timestamp() {
            Some(raw) => match DateTime::parse_from_rfc3339(raw) {
                Ok(stamp) => stamp >= start && stamp < end,
                Err(_) => false,
            },
            None => false,
        };

        if in_window {
            selected.push(entry);
        }
    }

    selected
}

/// Select message and summary entries newer than `since`, optionally limited to one session.
/// AIDEV-NOTE: This is the primary context selection method for wm extraction.
/// Passing a `session_id` restricts the result to that session so that context
/// from other sessions does not bleed in.
///
/// With `since == None` every message/summary (of the selected session) is returned.
/// With a cutoff, an entry is kept when its timestamp is strictly after the cutoff,
/// or when it has no timestamp / its timestamp does not parse as RFC 3339.
pub fn get_messages_since<'a>(
    entries: &'a [TranscriptEntry],
    since: Option<DateTime<Utc>>,
    session_id: Option<&str>,
) -> Vec<&'a TranscriptEntry> {
    entries
        .iter()
        // Messages AND summaries (summaries provide context after compaction).
        .filter(|entry| entry.is_message() || entry.is_summary())
        // No session given means no session filtering (backward compat).
        .filter(|entry| session_id.map_or(true, |wanted| entry.session_id() == Some(wanted)))
        .filter(|entry| match since {
            // No previous extraction - nothing is too old.
            None => true,
            // Summaries don't have timestamps, so they pass through.
            Some(cutoff) => entry
                .timestamp()
                .and_then(|raw| DateTime::parse_from_rfc3339(raw).ok())
                .map_or(true, |stamp| stamp > cutoff),
        })
        .collect()
}

/// Remove every `<system-reminder>...</system-reminder>` block from `text`.
/// AIDEV-NOTE: System reminders contain CLAUDE.md content - already explicit instructions,
/// not tacit knowledge. For extraction, we strip them entirely to avoid redundant capture.
/// (sg keeps last one for evaluation context; wm doesn't need them at all)
///
/// An opening tag with no matching closing tag discards everything from that tag
/// to the end of the input. The result is trimmed of surrounding whitespace.
fn strip_system_reminders(text: &str) -> String {
    const OPEN: &str = "<system-reminder>";
    const CLOSE: &str = "</system-reminder>";

    let mut kept = String::with_capacity(text.len());
    let mut rest = text;

    // Peel off "text before the tag", then jump past the matching close tag.
    while let Some((before, after_open)) = rest.split_once(OPEN) {
        kept.push_str(before);
        rest = match after_open.split_once(CLOSE) {
            Some((_reminder, tail)) => tail,
            // Unclosed tag - nothing after it survives.
            None => "",
        };
    }

    // Whatever follows the last reminder.
    kept.push_str(rest);
    kept.trim().to_string()
}

/// Return the last `/`-separated component of `path`.
///
/// A path containing no `/` is returned unchanged; a path ending in `/` yields
/// an empty string.
fn basename(path: &str) -> &str {
    match path.rfind('/') {
        Some(slash) => &path[slash + 1..],
        None => path,
    }
}

/// Extract key identifier from tool input (file path, command, pattern)
/// Paths are shortened to basename to reduce tokens
fn tool_summary(name: &str, input: Option<&serde_json::Value>) -> String {
    let input = match input {
        Some(v) => v,
        None => return String::new(),
    };
    match name {
        "Edit" | "Write" | "Read" => input
            .get("file_path")
            .and_then(|v| v.as_str())
            .map(basename)
            .unwrap_or("")
            .to_string(),
        "Bash" => input
            .get("command")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string(),
        "Glob" | "Grep" => input
            .get("pattern")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string(),
        _ => String::new(),
    }
}

/// Options for formatting transcript context
/// AIDEV-NOTE: tool_results and thinking are disabled by default because they
/// can be huge (full file contents, verbose reasoning) and blow up context size.
///
/// `FormatOptions::default()` leaves both flags `false`. The type is a pair of
/// flags, so it is cheap to copy, compare, and print for diagnostics.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct FormatOptions {
    /// Include full tool results (file contents, grep output, etc.)
    pub include_tool_results: bool,
    /// Include Claude's thinking blocks
    pub include_thinking: bool,
}

/// Render messages as compact context text for the extraction LLM.
/// Equivalent to `format_context_with_options` with `FormatOptions::default()`,
/// i.e. tool results and thinking blocks are left out.
pub fn format_context(messages: &[&TranscriptEntry]) -> String {
    let defaults = FormatOptions::default();
    format_context_with_options(messages, &defaults)
}

/// Render messages as context text, honouring the given `options`.
/// AIDEV-NOTE: Uses compact prefixes to reduce tokens. Legend added at top.
///
/// The output starts with a one-line legend, followed by one line per emitted item:
/// `S:` summaries, `R:` tool results (only with `include_tool_results`), `U:` user
/// text with system reminders stripped, `TH:` thinking (only with `include_thinking`),
/// `T:` mutation tools used by the assistant, and `A:` assistant text. Entries of
/// any other kind are ignored.
pub fn format_context_with_options(messages: &[&TranscriptEntry], options: &FormatOptions) -> String {
    /// Append `prefix`, `body` and a newline to `out`.
    fn emit(out: &mut String, prefix: &str, body: &str) {
        out.push_str(prefix);
        out.push_str(body);
        out.push('\n');
    }

    // Legend for compact prefixes
    let mut rendered = String::from(
        "[U=User A=Assistant T=Tools S=Summary R=Result TH=Thinking E=Edit W=Write B=Bash]\n",
    );

    for entry in messages {
        match entry {
            TranscriptEntry::Summary { .. } => {
                if let Some(summary) = entry.summary_text() {
                    emit(&mut rendered, "S: ", summary);
                }
            }
            TranscriptEntry::User { .. } => {
                // Tool results can be huge, so they are opt-in.
                if options.include_tool_results {
                    let results = entry.tool_results();
                    for (_id, content) in &results {
                        emit(&mut rendered, "R: ", content);
                    }
                }

                // User text is emitted only if something remains after stripping reminders.
                let cleaned = entry
                    .user_text()
                    .map(|raw| strip_system_reminders(&raw))
                    .filter(|stripped| !stripped.is_empty());
                if let Some(cleaned) = cleaned {
                    emit(&mut rendered, "U: ", &cleaned);
                }
            }
            TranscriptEntry::Assistant { .. } => {
                let tool_uses = entry.tool_uses();

                // Thinking can be verbose, so it is opt-in.
                if options.include_thinking {
                    if let Some(thinking) = entry.assistant_thinking() {
                        emit(&mut rendered, "TH: ", &thinking);
                    }
                }

                // Only mutation tools (Edit, Write, Bash) are listed; read-only tools
                // (Read, Grep, Glob, LS, etc.) are skipped since the knowledge is in
                // the assistant's analysis.
                let mut tools_line = String::new();
                for (name, input) in tool_uses.iter() {
                    // Compress tool names: Edit→E, Write→W, Bash→B
                    let short_name = match *name {
                        "Edit" => "E",
                        "Write" => "W",
                        "Bash" => "B",
                        _ => continue,
                    };
                    tools_line.push_str(short_name);

                    let summary = tool_summary(name, *input);
                    if !summary.is_empty() {
                        tools_line.push('(');
                        tools_line.push_str(&summary);
                        tools_line.push(')');
                    }
                    // Each tool is followed by a single space, including the last one.
                    tools_line.push(' ');
                }
                if !tools_line.is_empty() {
                    emit(&mut rendered, "T: ", &tools_line);
                }

                if let Some(reply) = entry.assistant_text() {
                    emit(&mut rendered, "A: ", &reply);
                }
            }
            _ => {}
        }
    }

    rendered
}

// Unit tests for transcript entry parsing, system-reminder stripping and
// session filtering.
#[cfg(test)]
mod tests {
    use super::*;

    // A "user" JSON line deserializes into a user entry exposing its session id and text.
    #[test]
    fn test_parse_user_entry() {
        let json = r#"{"type":"user","uuid":"abc","parentUuid":null,"sessionId":"sess-1","timestamp":"2025-01-15T10:00:00Z","message":{"role":"user","content":"hello"}}"#;
        let entry: TranscriptEntry = serde_json::from_str(json).unwrap();
        assert!(entry.is_user());
        assert_eq!(entry.session_id(), Some("sess-1"));
        assert_eq!(entry.user_text(), Some("hello".to_string()));
    }

    // An "assistant" JSON line with a single text content block exposes that text.
    #[test]
    fn test_parse_assistant_entry() {
        let json = r#"{"type":"assistant","uuid":"def","parentUuid":"abc","sessionId":"sess-1","timestamp":"2025-01-15T10:00:01Z","message":{"role":"assistant","content":[{"type":"text","text":"hi there"}]}}"#;
        let entry: TranscriptEntry = serde_json::from_str(json).unwrap();
        assert!(entry.is_assistant());
        assert_eq!(entry.assistant_text(), Some("hi there".to_string()));
    }

    // An unrecognized "type" value still deserializes, landing in the `Unknown` variant.
    #[test]
    fn test_parse_unknown_type() {
        let json = r#"{"type":"some-new-type","data":"whatever"}"#;
        let entry: TranscriptEntry = serde_json::from_str(json).unwrap();
        assert!(matches!(entry, TranscriptEntry::Unknown));
    }

    // A single reminder in the middle is removed; the spaces on both sides remain,
    // hence the double space in the expected output.
    #[test]
    fn test_strip_system_reminders_single() {
        let text = "Hello <system-reminder>workflow stuff</system-reminder> world";
        assert_eq!(strip_system_reminders(text), "Hello  world");
    }

    // Several reminders are all removed, leaving only the text between them.
    #[test]
    fn test_strip_system_reminders_multiple() {
        let text = "<system-reminder>first</system-reminder>content<system-reminder>second</system-reminder>";
        assert_eq!(strip_system_reminders(text), "content");
    }

    // Text without any reminder tags is returned unchanged.
    #[test]
    fn test_strip_system_reminders_none() {
        let text = "Just normal text";
        assert_eq!(strip_system_reminders(text), "Just normal text");
    }

    // `get_messages_since` with no cutoff returns only the requested session's
    // entries, or all entries when no session is given.
    #[test]
    fn test_session_filtering() {
        let msg_s1 = r#"{"type":"user","uuid":"a","sessionId":"s1","timestamp":"2025-01-15T10:00:00Z","message":{"role":"user","content":"Session 1"}}"#;
        let msg_s2 = r#"{"type":"user","uuid":"b","sessionId":"s2","timestamp":"2025-01-15T10:00:00Z","message":{"role":"user","content":"Session 2"}}"#;

        let entries: Vec<TranscriptEntry> = vec![
            serde_json::from_str(msg_s1).unwrap(),
            serde_json::from_str(msg_s2).unwrap(),
        ];

        // Filter by session s1
        let result = get_messages_since(&entries, None, Some("s1"));
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].user_text(), Some("Session 1".to_string()));

        // Filter by session s2
        let result = get_messages_since(&entries, None, Some("s2"));
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].user_text(), Some("Session 2".to_string()));

        // No filter - get both
        let result = get_messages_since(&entries, None, None);
        assert_eq!(result.len(), 2);
    }
}