Skip to main content

coding_agent_search/pages/
analytics.rs

1//! Pre-computed analytics generator for pages export.
2//!
3//! Generates pre-computed analytics data files (statistics.json, timeline.json, etc.)
4//! during export that enable instant dashboard rendering in the browser without
5//! expensive SQL aggregations.
6//!
7//! # Generated Files
8//!
9//! All files are encrypted with the main database and included in the payload:
10//!
11//! - `statistics.json` - Overall metrics (counts, time range)
12//! - `agent_summary.json` - Per-agent breakdown
13//! - `workspace_summary.json` - Per-workspace breakdown
14//! - `timeline.json` - Activity over time (daily/weekly/monthly)
15//! - `top_terms.json` - Common topics/terms from titles
16//!
17//! # Example
18//!
19//! ```ignore
20//! use crate::pages::analytics::AnalyticsGenerator;
21//!
22//! let generator = AnalyticsGenerator::new(&db_conn)?;
23//! let bundle = generator.generate_all()?;
24//! bundle.write_to_dir(&output_dir)?;
25//! ```
26
27use anyhow::{Context, Result};
28use chrono::{DateTime, Datelike, NaiveDate, Utc};
29use frankensqlite::compat::{ConnectionExt, RowExt};
30use frankensqlite::{Connection, Row};
31use serde::{Deserialize, Serialize};
32use std::collections::{BTreeMap, HashMap, HashSet};
33use std::path::Path;
34use std::time::Instant;
35use tracing::info;
36
37/// Stop words to filter out from term extraction.
38const STOP_WORDS: &[&str] = &[
39    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
40    "from", "is", "it", "as", "was", "be", "are", "been", "being", "have", "has", "had", "do",
41    "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall", "can",
42    "need", "this", "that", "these", "those", "i", "you", "he", "she", "we", "they", "what",
43    "which", "who", "when", "where", "why", "how", "all", "each", "every", "both", "few", "more",
44    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
45    "too", "very", "just", "also", "now", "here", "there", "then", "once", "about", "after",
46    "again", "into", "over", "under", "out", "up", "down", "off", "any", "its", "your", "my",
47    "our", "their", "his", "her", "him", "them", "me", "us", "if", "else", "while", "during",
48    "before",
49];
50
51/// Overall statistics for the archive.
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct Statistics {
54    pub total_conversations: usize,
55    pub total_messages: usize,
56    pub total_characters: usize,
57    // BTreeMap so statistics.json serialization is byte-deterministic
58    // across runs. `pub write_to_dir` emits these via
59    // `serde_json::to_string_pretty`; a HashMap here would make every
60    // regenerate emit a diff even when the data is unchanged, breaking
61    // reproducible builds, git hygiene, and any content-hash checks.
62    pub agents: BTreeMap<String, AgentStats>,
63    pub roles: BTreeMap<String, usize>,
64    pub time_range: TimeRange,
65    /// RFC3339 timestamp
66    pub computed_at: String,
67}
68
69/// Per-agent statistics.
70#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
71pub struct AgentStats {
72    pub conversations: usize,
73    pub messages: usize,
74}
75
76impl Statistics {
77    /// Packet-driven counterpart to [`AnalyticsGenerator::generate_statistics`].
78    ///
79    /// `coding_agent_session_search-ibuuh.32` (sink #2): the analytics
80    /// derivation consumer can now produce the same `Statistics` struct
81    /// from a slice of `ConversationPacket`s without re-running per-row
82    /// SQL aggregations. Callers that already hold packets (e.g. the
83    /// rebuild pipeline) feed them directly; the SQL path stays for
84    /// callers that only have a database handle. The
85    /// `analytics_statistics_from_packets_matches_sql_for_canonical_corpus`
86    /// equivalence test pins that both paths agree on every counted
87    /// field for representative inputs.
88    ///
89    /// `computed_at` is set to `now` so callers can timestamp the
90    /// derivation; equivalence comparisons should stamp the SQL-path
91    /// `computed_at` onto the packet-path result before equality
92    /// checks (or compare every other field individually).
93    pub fn from_packets(packets: &[crate::model::conversation_packet::ConversationPacket]) -> Self {
94        let mut total_messages: usize = 0;
95        let mut total_characters: usize = 0;
96        let mut agents: BTreeMap<String, AgentStats> = BTreeMap::new();
97        let mut roles: BTreeMap<String, usize> = BTreeMap::new();
98        let mut earliest_started_at: Option<i64> = None;
99        let mut latest_started_at: Option<i64> = None;
100
101        for packet in packets {
102            let payload = &packet.payload;
103            let agent_slug = payload.identity.agent_slug.clone();
104            let agent_entry = agents.entry(agent_slug).or_insert(AgentStats {
105                conversations: 0,
106                messages: 0,
107            });
108            agent_entry.conversations = agent_entry.conversations.saturating_add(1);
109
110            // Each ConversationPacketMessage corresponds to one row in
111            // the canonical `messages` table, so projecting "all messages"
112            // here equals SELECT COUNT(*) FROM messages on the same DB.
113            let conv_message_count = payload.messages.len();
114            total_messages = total_messages.saturating_add(conv_message_count);
115            agent_entry.messages = agent_entry.messages.saturating_add(conv_message_count);
116
117            // Char totals follow SUM(LENGTH(content)). SQLite LENGTH()
118            // on TEXT counts Unicode scalar values, not UTF-8 bytes; use
119            // `.chars().count()` so multibyte content stays equivalent.
120            for message in &payload.messages {
121                total_characters = total_characters.saturating_add(message.content.chars().count());
122            }
123
124            // Role counts mirror the SQL path's raw GROUP BY role
125            // surface. Packet canonical replay normalizes Agent turns to
126            // "assistant", while storage writes MessageRole::Agent as
127            // "agent"; map that spelling back and preserve every other
128            // role string instead of collapsing it into "other".
129            for message in &payload.messages {
130                let role = if message.role == "assistant" {
131                    "agent"
132                } else {
133                    message.role.as_str()
134                };
135                *roles.entry(role.to_string()).or_insert(0) += 1;
136            }
137
138            if let Some(started_at) = payload.timestamps.started_at {
139                earliest_started_at = Some(match earliest_started_at {
140                    Some(current) => current.min(started_at),
141                    None => started_at,
142                });
143                latest_started_at = Some(match latest_started_at {
144                    Some(current) => current.max(started_at),
145                    None => started_at,
146                });
147            }
148        }
149
150        Self {
151            total_conversations: packets.len(),
152            total_messages,
153            total_characters,
154            agents,
155            roles,
156            time_range: TimeRange {
157                earliest: earliest_started_at
158                    .and_then(DateTime::from_timestamp_millis)
159                    .map(|dt| dt.to_rfc3339()),
160                latest: latest_started_at
161                    .and_then(DateTime::from_timestamp_millis)
162                    .map(|dt| dt.to_rfc3339()),
163            },
164            computed_at: Utc::now().to_rfc3339(),
165        }
166    }
167}
168
169/// Time range for the archive.
170#[derive(Debug, Clone, Serialize, Deserialize)]
171pub struct TimeRange {
172    /// RFC3339 timestamp or None
173    pub earliest: Option<String>,
174    /// RFC3339 timestamp or None
175    pub latest: Option<String>,
176}
177
178/// Timeline data with daily/weekly/monthly aggregations.
179#[derive(Debug, Clone, Serialize, Deserialize)]
180pub struct Timeline {
181    pub daily: Vec<DailyEntry>,
182    pub weekly: Vec<WeeklyEntry>,
183    pub monthly: Vec<MonthlyEntry>,
184    // BTreeMap for deterministic timeline.json serialization (see
185    // Statistics.agents comment for rationale).
186    pub by_agent: BTreeMap<String, AgentTimeline>,
187}
188
189/// Agent-specific timeline.
190#[derive(Debug, Clone, Serialize, Deserialize)]
191pub struct AgentTimeline {
192    pub daily: Vec<DailyEntry>,
193    pub weekly: Vec<WeeklyEntry>,
194    pub monthly: Vec<MonthlyEntry>,
195}
196
197/// Daily activity entry.
198#[derive(Debug, Clone, Serialize, Deserialize)]
199pub struct DailyEntry {
200    pub date: String,
201    pub messages: usize,
202    pub conversations: usize,
203}
204
205/// Weekly activity entry.
206#[derive(Debug, Clone, Serialize, Deserialize)]
207pub struct WeeklyEntry {
208    pub week: String,
209    pub messages: usize,
210    pub conversations: usize,
211}
212
213/// Monthly activity entry.
214#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct MonthlyEntry {
216    pub month: String,
217    pub messages: usize,
218    pub conversations: usize,
219}
220
221/// Per-workspace summary.
222#[derive(Debug, Clone, Serialize, Deserialize)]
223pub struct WorkspaceSummary {
224    pub workspaces: Vec<WorkspaceEntry>,
225}
226
227/// Individual workspace entry.
228#[derive(Debug, Clone, Serialize, Deserialize)]
229pub struct WorkspaceEntry {
230    pub path: String,
231    pub display_name: String,
232    pub conversations: usize,
233    pub messages: usize,
234    pub agents: Vec<String>,
235    pub date_range: TimeRange,
236    pub recent_titles: Vec<String>,
237}
238
239/// Per-agent summary.
240#[derive(Debug, Clone, Serialize, Deserialize)]
241pub struct AgentSummary {
242    pub agents: Vec<AgentEntry>,
243}
244
245/// Individual agent entry.
246#[derive(Debug, Clone, Serialize, Deserialize)]
247pub struct AgentEntry {
248    pub name: String,
249    pub conversations: usize,
250    pub messages: usize,
251    pub workspaces: Vec<String>,
252    pub date_range: TimeRange,
253    pub avg_messages_per_conversation: f64,
254}
255
256/// Top terms extracted from conversation titles.
257#[derive(Debug, Clone, Serialize, Deserialize)]
258pub struct TopTerms {
259    pub terms: Vec<(String, usize)>,
260}
261
262/// Bundle of all analytics data.
263#[derive(Debug, Clone, Serialize, Deserialize)]
264pub struct AnalyticsBundle {
265    pub statistics: Statistics,
266    pub timeline: Timeline,
267    pub workspace_summary: WorkspaceSummary,
268    pub agent_summary: AgentSummary,
269    pub top_terms: TopTerms,
270}
271
272impl AnalyticsBundle {
273    /// Write all analytics files to a directory.
274    pub fn write_to_dir(&self, dir: &Path) -> Result<()> {
275        std::fs::create_dir_all(dir).context("Failed to create analytics directory")?;
276
277        // Write statistics.json
278        let stats_path = dir.join("statistics.json");
279        let stats_json = serde_json::to_string_pretty(&self.statistics)
280            .context("Failed to serialize statistics")?;
281        crate::pages::write_file_durably(&stats_path, stats_json.as_bytes())
282            .context("Failed to write statistics.json")?;
283
284        // Write timeline.json
285        let timeline_path = dir.join("timeline.json");
286        let timeline_json =
287            serde_json::to_string_pretty(&self.timeline).context("Failed to serialize timeline")?;
288        crate::pages::write_file_durably(&timeline_path, timeline_json.as_bytes())
289            .context("Failed to write timeline.json")?;
290
291        // Write workspace_summary.json
292        let workspace_path = dir.join("workspace_summary.json");
293        let workspace_json = serde_json::to_string_pretty(&self.workspace_summary)
294            .context("Failed to serialize workspace_summary")?;
295        crate::pages::write_file_durably(&workspace_path, workspace_json.as_bytes())
296            .context("Failed to write workspace_summary.json")?;
297
298        // Write agent_summary.json
299        let agent_path = dir.join("agent_summary.json");
300        let agent_json = serde_json::to_string_pretty(&self.agent_summary)
301            .context("Failed to serialize agent_summary")?;
302        crate::pages::write_file_durably(&agent_path, agent_json.as_bytes())
303            .context("Failed to write agent_summary.json")?;
304
305        // Write top_terms.json
306        let terms_path = dir.join("top_terms.json");
307        let terms_json = serde_json::to_string_pretty(&self.top_terms)
308            .context("Failed to serialize top_terms")?;
309        crate::pages::write_file_durably(&terms_path, terms_json.as_bytes())
310            .context("Failed to write top_terms.json")?;
311
312        info!(
313            "Analytics written to {:?}: statistics.json, timeline.json, workspace_summary.json, agent_summary.json, top_terms.json",
314            dir
315        );
316
317        Ok(())
318    }
319}
320
321/// Generator for pre-computed analytics data.
322pub struct AnalyticsGenerator<'a> {
323    db: &'a Connection,
324}
325
326impl<'a> AnalyticsGenerator<'a> {
327    /// Create a new analytics generator for the given database connection.
328    pub fn new(db: &'a Connection) -> Self {
329        Self { db }
330    }
331
332    /// Generate all analytics data.
333    pub fn generate_all(&self) -> Result<AnalyticsBundle> {
334        info!("Generating pre-computed analytics...");
335
336        let statistics = self.generate_statistics()?;
337        let timeline = self.generate_timeline()?;
338        let workspace_summary = self.generate_workspace_summary()?;
339        let agent_summary = self.generate_agent_summary()?;
340        let top_terms = self.generate_top_terms()?;
341
342        Ok(AnalyticsBundle {
343            statistics,
344            timeline,
345            workspace_summary,
346            agent_summary,
347            top_terms,
348        })
349    }
350
351    /// Generate overall statistics.
352    fn generate_statistics(&self) -> Result<Statistics> {
353        info!("Generating statistics...");
354
355        // Total conversations
356        let total_conversations: i64 = self
357            .db
358            .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row: &Row| {
359                row.get_typed(0)
360            })
361            .context("Failed to count conversations")?;
362
363        // Total messages
364        let total_messages: i64 = self
365            .db
366            .query_row_map("SELECT COUNT(*) FROM messages", &[], |row: &Row| {
367                row.get_typed(0)
368            })
369            .context("Failed to count messages")?;
370
371        // Total characters
372        let total_characters: i64 = self
373            .db
374            .query_row_map(
375                "SELECT SUM(LENGTH(content)) FROM messages",
376                &[],
377                |row: &Row| row.get_typed::<Option<i64>>(0),
378            )
379            .context("Failed to sum content lengths")?
380            .unwrap_or(0);
381
382        // Per-agent stats
383        let mut agents: BTreeMap<String, AgentStats> = BTreeMap::new();
384        let agent_conv_rows: Vec<(String, i64)> = self.db.query_map_collect(
385            "SELECT agent, COUNT(*) as conv_count FROM conversations GROUP BY agent",
386            &[],
387            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<i64>(1)?)),
388        )?;
389        for (agent, conv_count) in agent_conv_rows {
390            agents.insert(
391                agent.clone(),
392                AgentStats {
393                    conversations: conv_count as usize,
394                    messages: 0, // Will be filled below
395                },
396            );
397        }
398
399        // Fill in message counts per agent
400        let msg_rows: Vec<(String, i64)> = self.db.query_map_collect(
401            "SELECT c.agent, COUNT(m.id) FROM messages m
402             JOIN conversations c ON m.conversation_id = c.id
403             GROUP BY c.agent",
404            &[],
405            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<i64>(1)?)),
406        )?;
407        for (agent, msg_count) in msg_rows {
408            if let Some(stats) = agents.get_mut(&agent) {
409                stats.messages = msg_count as usize;
410            }
411        }
412
413        // Per-role counts
414        let mut roles: BTreeMap<String, usize> = BTreeMap::new();
415        let role_rows: Vec<(String, i64)> = self.db.query_map_collect(
416            "SELECT role, COUNT(*) FROM messages GROUP BY role",
417            &[],
418            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<i64>(1)?)),
419        )?;
420        for (role, count) in role_rows {
421            roles.insert(role, count as usize);
422        }
423
424        // Time range
425        let time_range: (Option<i64>, Option<i64>) = self
426            .db
427            .query_row_map(
428                "SELECT MIN(started_at), MAX(started_at) FROM conversations",
429                &[],
430                |row: &Row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
431            )
432            .context("Failed to get time range")?;
433
434        Ok(Statistics {
435            total_conversations: total_conversations as usize,
436            total_messages: total_messages as usize,
437            total_characters: total_characters as usize,
438            agents,
439            roles,
440            time_range: TimeRange {
441                earliest: time_range
442                    .0
443                    .and_then(DateTime::from_timestamp_millis)
444                    .map(|dt| dt.to_rfc3339()),
445                latest: time_range
446                    .1
447                    .and_then(DateTime::from_timestamp_millis)
448                    .map(|dt| dt.to_rfc3339()),
449            },
450            computed_at: Utc::now().to_rfc3339(),
451        })
452    }
453
454    /// Generate timeline data.
455    fn generate_timeline(&self) -> Result<Timeline> {
456        info!("Generating timeline...");
457
458        let timeline_rows: Vec<(Option<String>, String, i64, i64)> = self.db.query_map_collect(
459            "SELECT DATE(m.created_at/1000, 'unixepoch') as date,
460                    COALESCE(c.agent, 'unknown') as agent,
461                    m.conversation_id,
462                    COUNT(*) as messages
463             FROM messages m
464             LEFT JOIN conversations c ON m.conversation_id = c.id
465             WHERE m.created_at IS NOT NULL
466             GROUP BY DATE(m.created_at/1000, 'unixepoch'),
467                      COALESCE(c.agent, 'unknown'),
468                      m.conversation_id
469             ORDER BY date, agent, m.conversation_id",
470            &[],
471            |row: &Row| {
472                Ok((
473                    row.get_typed::<Option<String>>(0)?,
474                    row.get_typed::<String>(1)?,
475                    row.get_typed::<i64>(2)?,
476                    row.get_typed::<i64>(3)?,
477                ))
478            },
479        )?;
480
481        let mut overall = TimelineAccumulator::default();
482        let mut agent_accumulators: HashMap<String, TimelineAccumulator> = HashMap::new();
483
484        for (date_opt, agent, conv_id, messages) in timeline_rows {
485            if let Some(date) = date_opt.as_deref() {
486                overall.record_message_group(date, conv_id, messages);
487                agent_accumulators
488                    .entry(agent)
489                    .or_default()
490                    .record_message_group(date, conv_id, messages);
491            }
492        }
493
494        let (daily, weekly, monthly) = overall.into_parts();
495
496        let by_agent = agent_accumulators
497            .into_iter()
498            .map(|(agent, accumulator)| {
499                let (daily, weekly, monthly) = accumulator.into_parts();
500                (
501                    agent,
502                    AgentTimeline {
503                        daily,
504                        weekly,
505                        monthly,
506                    },
507                )
508            })
509            .collect();
510
511        Ok(Timeline {
512            daily,
513            weekly,
514            monthly,
515            by_agent,
516        })
517    }
518
519    /// Generate workspace summary.
520    fn generate_workspace_summary(&self) -> Result<WorkspaceSummary> {
521        info!("Generating workspace summary...");
522        let started = Instant::now();
523
524        let mut workspaces: Vec<WorkspaceEntry> = Vec::new();
525
526        // Query 1: base workspace rows with conversation/time aggregates.
527        let workspace_rows: Vec<(String, i64, Option<i64>, Option<i64>)> =
528            self.db.query_map_collect(
529                "SELECT workspace, COUNT(*) as conv_count,
530                    MIN(started_at), MAX(started_at)
531             FROM conversations
532             WHERE workspace IS NOT NULL
533             GROUP BY workspace
534             ORDER BY conv_count DESC",
535                &[],
536                |row: &Row| {
537                    Ok((
538                        row.get_typed::<String>(0)?,
539                        row.get_typed::<i64>(1)?,
540                        row.get_typed::<Option<i64>>(2)?,
541                        row.get_typed::<Option<i64>>(3)?,
542                    ))
543                },
544            )?;
545
546        // Query 2: message counts for every workspace.
547        let mut messages_by_workspace: HashMap<String, i64> = HashMap::new();
548        let ws_msg_rows: Vec<(String, i64)> = self.db.query_map_collect(
549            "SELECT c.workspace, COUNT(m.id)
550             FROM conversations c
551             LEFT JOIN messages m ON m.conversation_id = c.id
552             WHERE c.workspace IS NOT NULL
553             GROUP BY c.workspace",
554            &[],
555            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<i64>(1)?)),
556        )?;
557        for (workspace, msg_count) in ws_msg_rows {
558            messages_by_workspace.insert(workspace, msg_count);
559        }
560
561        // Query 3: distinct agents for every workspace.
562        let mut agents_by_workspace: HashMap<String, Vec<String>> = HashMap::new();
563        let ws_agent_rows: Vec<(String, String)> = self.db.query_map_collect(
564            "SELECT workspace, agent
565             FROM conversations
566             WHERE workspace IS NOT NULL
567             GROUP BY workspace, agent
568             ORDER BY workspace, agent",
569            &[],
570            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<String>(1)?)),
571        )?;
572        for (workspace, agent) in ws_agent_rows {
573            agents_by_workspace
574                .entry(workspace)
575                .or_default()
576                .push(agent);
577        }
578
579        // Query 4: recent titles per workspace (sorted by started_at DESC, top 5 per workspace in Rust).
580        let mut recent_titles_by_workspace: HashMap<String, Vec<String>> = HashMap::new();
581        let ws_title_rows: Vec<(String, String)> = self.db.query_map_collect(
582            "SELECT workspace, title
583             FROM conversations
584             WHERE workspace IS NOT NULL AND title IS NOT NULL
585             ORDER BY workspace, started_at DESC",
586            &[],
587            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<String>(1)?)),
588        )?;
589        for (workspace, title) in ws_title_rows {
590            let titles = recent_titles_by_workspace.entry(workspace).or_default();
591            if titles.len() < 5 {
592                titles.push(title);
593            }
594        }
595
596        for (workspace, conv_count, min_ts, max_ts) in workspace_rows {
597            let msg_count = messages_by_workspace.get(&workspace).copied().unwrap_or(0);
598            let agents = agents_by_workspace.remove(&workspace).unwrap_or_default();
599            let recent_titles = recent_titles_by_workspace
600                .remove(&workspace)
601                .unwrap_or_default();
602
603            // Extract display name (last path component)
604            let display_name = Path::new(&workspace)
605                .file_name()
606                .map(|s| s.to_string_lossy().to_string())
607                .unwrap_or_else(|| workspace.clone());
608
609            workspaces.push(WorkspaceEntry {
610                path: workspace,
611                display_name,
612                conversations: conv_count as usize,
613                messages: msg_count as usize,
614                agents,
615                date_range: TimeRange {
616                    earliest: min_ts
617                        .and_then(DateTime::from_timestamp_millis)
618                        .map(|dt| dt.to_rfc3339()),
619                    latest: max_ts
620                        .and_then(DateTime::from_timestamp_millis)
621                        .map(|dt| dt.to_rfc3339()),
622                },
623                recent_titles,
624            });
625        }
626
627        info!(
628            query_count = 4,
629            workspace_rows = workspaces.len(),
630            elapsed_ms = started.elapsed().as_millis(),
631            "Workspace summary generated using set-based aggregation"
632        );
633
634        Ok(WorkspaceSummary { workspaces })
635    }
636
637    /// Generate agent summary.
638    fn generate_agent_summary(&self) -> Result<AgentSummary> {
639        info!("Generating agent summary...");
640        let started = Instant::now();
641
642        let mut agents: Vec<AgentEntry> = Vec::new();
643
644        // Query 1: base agent rows with conversation/time aggregates.
645        let agent_rows: Vec<(String, i64, Option<i64>, Option<i64>)> = self.db.query_map_collect(
646            "SELECT agent, COUNT(*) as conv_count,
647                    MIN(started_at), MAX(started_at)
648             FROM conversations
649             GROUP BY agent
650             ORDER BY conv_count DESC",
651            &[],
652            |row: &Row| {
653                Ok((
654                    row.get_typed::<String>(0)?,
655                    row.get_typed::<i64>(1)?,
656                    row.get_typed::<Option<i64>>(2)?,
657                    row.get_typed::<Option<i64>>(3)?,
658                ))
659            },
660        )?;
661
662        // Query 2: message counts for every agent.
663        let mut messages_by_agent: HashMap<String, i64> = HashMap::new();
664        let agent_msg_rows: Vec<(String, i64)> = self.db.query_map_collect(
665            "SELECT c.agent, COUNT(m.id)
666             FROM conversations c
667             LEFT JOIN messages m ON m.conversation_id = c.id
668             GROUP BY c.agent",
669            &[],
670            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<i64>(1)?)),
671        )?;
672        for (agent, msg_count) in agent_msg_rows {
673            messages_by_agent.insert(agent, msg_count);
674        }
675
676        // Query 3: distinct workspaces for every agent.
677        let mut workspaces_by_agent: HashMap<String, Vec<String>> = HashMap::new();
678        let agent_ws_rows: Vec<(String, String)> = self.db.query_map_collect(
679            "SELECT agent, workspace
680             FROM conversations
681             WHERE workspace IS NOT NULL
682             GROUP BY agent, workspace
683             ORDER BY agent, workspace",
684            &[],
685            |row: &Row| Ok((row.get_typed::<String>(0)?, row.get_typed::<String>(1)?)),
686        )?;
687        for (agent, workspace) in agent_ws_rows {
688            workspaces_by_agent
689                .entry(agent)
690                .or_default()
691                .push(workspace);
692        }
693
694        for (agent, conv_count, min_ts, max_ts) in agent_rows {
695            let msg_count = messages_by_agent.get(&agent).copied().unwrap_or(0);
696            let workspaces = workspaces_by_agent.remove(&agent).unwrap_or_default();
697
698            let avg_messages = if conv_count > 0 {
699                msg_count as f64 / conv_count as f64
700            } else {
701                0.0
702            };
703
704            agents.push(AgentEntry {
705                name: agent,
706                conversations: conv_count as usize,
707                messages: msg_count as usize,
708                workspaces,
709                date_range: TimeRange {
710                    earliest: min_ts
711                        .and_then(DateTime::from_timestamp_millis)
712                        .map(|dt| dt.to_rfc3339()),
713                    latest: max_ts
714                        .and_then(DateTime::from_timestamp_millis)
715                        .map(|dt| dt.to_rfc3339()),
716                },
717                avg_messages_per_conversation: avg_messages,
718            });
719        }
720
721        info!(
722            query_count = 3,
723            agent_rows = agents.len(),
724            elapsed_ms = started.elapsed().as_millis(),
725            "Agent summary generated using set-based aggregation"
726        );
727
728        Ok(AgentSummary { agents })
729    }
730
731    /// Generate top terms from conversation titles.
732    fn generate_top_terms(&self) -> Result<TopTerms> {
733        info!("Generating top terms...");
734
735        let stop_words: HashSet<&str> = STOP_WORDS.iter().copied().collect();
736
737        // Get all titles
738        let titles: Vec<String> = self.db.query_map_collect(
739            "SELECT title FROM conversations WHERE title IS NOT NULL",
740            &[],
741            |row: &Row| row.get_typed::<String>(0),
742        )?;
743
744        let mut term_counts: HashMap<String, usize> = HashMap::new();
745
746        for title in titles {
747            for word in title.split_whitespace() {
748                // Clean the word: remove punctuation, lowercase
749                let word: String = word
750                    .chars()
751                    .filter(|c| c.is_alphanumeric() || *c == '_' || *c == '-')
752                    .collect::<String>()
753                    .to_lowercase();
754
755                // Filter: minimum length 3, not a stop word
756                if word.len() >= 3 && !stop_words.contains(word.as_str()) {
757                    *term_counts.entry(word).or_insert(0) += 1;
758                }
759            }
760        }
761
762        let mut top: Vec<(String, usize)> = term_counts.into_iter().collect();
763        top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
764
765        // Keep top 100
766        top.truncate(100);
767
768        Ok(TopTerms { terms: top })
769    }
770}
771
772#[derive(Default)]
773struct TimelineAccumulator {
774    daily_map: HashMap<String, DailyEntry>,
775    weekly_map: HashMap<String, WeeklyEntry>,
776    weekly_conv_ids: HashMap<String, HashSet<i64>>,
777    monthly_map: HashMap<String, MonthlyEntry>,
778    monthly_conv_ids: HashMap<String, HashSet<i64>>,
779}
780
781impl TimelineAccumulator {
782    fn record_message_group(&mut self, date: &str, conv_id: i64, messages: i64) {
783        let message_count = messages.max(0) as usize;
784        let date_key = date.to_string();
785        let daily = self
786            .daily_map
787            .entry(date_key.clone())
788            .or_insert(DailyEntry {
789                date: date_key,
790                messages: 0,
791                conversations: 0,
792            });
793        daily.messages = daily.messages.saturating_add(message_count);
794        daily.conversations = daily.conversations.saturating_add(1);
795
796        let Ok(parsed_date) = NaiveDate::parse_from_str(date, "%Y-%m-%d") else {
797            return;
798        };
799
800        let week = iso_week_label(parsed_date);
801        let weekly = self.weekly_map.entry(week.clone()).or_insert(WeeklyEntry {
802            week: week.clone(),
803            messages: 0,
804            conversations: 0,
805        });
806        weekly.messages = weekly.messages.saturating_add(message_count);
807        self.weekly_conv_ids
808            .entry(week)
809            .or_default()
810            .insert(conv_id);
811
812        let month = month_label(parsed_date);
813        let monthly = self
814            .monthly_map
815            .entry(month.clone())
816            .or_insert(MonthlyEntry {
817                month: month.clone(),
818                messages: 0,
819                conversations: 0,
820            });
821        monthly.messages = monthly.messages.saturating_add(message_count);
822        self.monthly_conv_ids
823            .entry(month)
824            .or_default()
825            .insert(conv_id);
826    }
827
828    fn into_parts(mut self) -> (Vec<DailyEntry>, Vec<WeeklyEntry>, Vec<MonthlyEntry>) {
829        for (week, conv_ids) in self.weekly_conv_ids {
830            if let Some(entry) = self.weekly_map.get_mut(&week) {
831                entry.conversations = conv_ids.len();
832            }
833        }
834
835        for (month, conv_ids) in self.monthly_conv_ids {
836            if let Some(entry) = self.monthly_map.get_mut(&month) {
837                entry.conversations = conv_ids.len();
838            }
839        }
840
841        let mut daily: Vec<DailyEntry> = self.daily_map.into_values().collect();
842        daily.sort_by(|a, b| a.date.cmp(&b.date));
843
844        let mut weekly: Vec<WeeklyEntry> = self.weekly_map.into_values().collect();
845        weekly.sort_by(|a, b| a.week.cmp(&b.week));
846
847        let mut monthly: Vec<MonthlyEntry> = self.monthly_map.into_values().collect();
848        monthly.sort_by(|a, b| a.month.cmp(&b.month));
849
850        (daily, weekly, monthly)
851    }
852}
853
854/// Aggregate daily entries to weekly.
855pub fn aggregate_to_weekly(daily: &[DailyEntry]) -> Vec<WeeklyEntry> {
856    let mut weekly_map: HashMap<String, WeeklyEntry> = HashMap::new();
857
858    for entry in daily {
859        // Parse date and get ISO week
860        if let Ok(date) = NaiveDate::parse_from_str(&entry.date, "%Y-%m-%d") {
861            let week_str = iso_week_label(date);
862
863            let weekly = weekly_map.entry(week_str.clone()).or_insert(WeeklyEntry {
864                week: week_str,
865                messages: 0,
866                conversations: 0,
867            });
868            weekly.messages = weekly.messages.saturating_add(entry.messages);
869            weekly.conversations = weekly.conversations.saturating_add(entry.conversations);
870        }
871    }
872
873    let mut result: Vec<WeeklyEntry> = weekly_map.into_values().collect();
874    result.sort_by(|a, b| a.week.cmp(&b.week));
875    result
876}
877
878/// Aggregate daily entries to monthly.
879pub fn aggregate_to_monthly(daily: &[DailyEntry]) -> Vec<MonthlyEntry> {
880    let mut monthly_map: HashMap<String, MonthlyEntry> = HashMap::new();
881
882    for entry in daily {
883        // Extract YYYY-MM from date
884        if let Ok(date) = NaiveDate::parse_from_str(&entry.date, "%Y-%m-%d") {
885            let month_str = month_label(date);
886
887            let monthly = monthly_map
888                .entry(month_str.clone())
889                .or_insert(MonthlyEntry {
890                    month: month_str,
891                    messages: 0,
892                    conversations: 0,
893                });
894            monthly.messages = monthly.messages.saturating_add(entry.messages);
895            monthly.conversations = monthly.conversations.saturating_add(entry.conversations);
896        }
897    }
898
899    let mut result: Vec<MonthlyEntry> = monthly_map.into_values().collect();
900    result.sort_by(|a, b| a.month.cmp(&b.month));
901    result
902}
903
904fn iso_week_label(date: NaiveDate) -> String {
905    let iso_week = date.iso_week();
906    format!("{}-W{:02}", iso_week.year(), iso_week.week())
907}
908
909fn month_label(date: NaiveDate) -> String {
910    format!("{:04}-{:02}", date.year(), date.month())
911}
912
913#[cfg(test)]
914mod tests {
915    use super::*;
916    use tempfile::TempDir;
917
918    fn create_test_db() -> (TempDir, Connection) {
919        let dir = TempDir::new().unwrap();
920        let db_path = dir.path().join("test.db");
921        let conn = Connection::open(db_path.to_string_lossy().as_ref()).unwrap();
922
923        // Create schema
924        conn.execute_batch(
925            "CREATE TABLE conversations (
926                id INTEGER PRIMARY KEY,
927                agent TEXT NOT NULL,
928                workspace TEXT,
929                title TEXT,
930                source_path TEXT NOT NULL,
931                started_at INTEGER,
932                ended_at INTEGER,
933                message_count INTEGER,
934                metadata_json TEXT
935            );
936            CREATE TABLE messages (
937                id INTEGER PRIMARY KEY,
938                conversation_id INTEGER NOT NULL,
939                idx INTEGER NOT NULL,
940                role TEXT NOT NULL,
941                content TEXT NOT NULL,
942                created_at INTEGER,
943                FOREIGN KEY (conversation_id) REFERENCES conversations(id)
944            );",
945        )
946        .unwrap();
947
948        (dir, conn)
949    }
950
951    fn insert_test_data(conn: &Connection) {
952        // Insert conversations
953        conn.execute(
954            "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
955             VALUES (1, 'claude-code', '/home/user/project-a', 'Debug authentication flow', '/path/a.jsonl', 1700000000000, 5)",
956        ).unwrap();
957        conn.execute(
958            "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
959             VALUES (2, 'claude-code', '/home/user/project-a', 'Fix database connection', '/path/b.jsonl', 1700100000000, 3)",
960        ).unwrap();
961        conn.execute(
962            "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
963             VALUES (3, 'codex', '/home/user/project-b', 'Add user authentication', '/path/c.jsonl', 1700200000000, 4)",
964        ).unwrap();
965
966        // Insert messages
967        for conv_id in 1..=3 {
968            let msg_count = match conv_id {
969                1 => 5,
970                2 => 3,
971                3 => 4,
972                _ => 0,
973            };
974            for idx in 0..msg_count {
975                let role = if conv_id == 3 && idx == 3 {
976                    "narrator"
977                } else if idx % 2 == 0 {
978                    "user"
979                } else {
980                    "agent"
981                };
982                let created_at =
983                    1700000000000i64 + (conv_id as i64 * 100000000) + (idx as i64 * 1000);
984                let content = if conv_id == 3 && idx == 1 {
985                    format!("Message {} for conv {} with caf\u{00e9}", idx, conv_id)
986                } else {
987                    format!("Message {} for conv {}", idx, conv_id)
988                };
989                conn.execute_compat(
990                    "INSERT INTO messages (conversation_id, idx, role, content, created_at)
991                     VALUES (?1, ?2, ?3, ?4, ?5)",
992                    frankensqlite::params![
993                        conv_id as i64,
994                        idx as i64,
995                        role,
996                        content.as_str(),
997                        created_at
998                    ],
999                )
1000                .unwrap();
1001            }
1002        }
1003    }
1004
1005    #[test]
1006    fn test_statistics_generation() {
1007        let (_dir, conn) = create_test_db();
1008        insert_test_data(&conn);
1009
1010        let generator = AnalyticsGenerator::new(&conn);
1011        let stats = generator.generate_statistics().unwrap();
1012
1013        assert_eq!(stats.total_conversations, 3);
1014        assert_eq!(stats.total_messages, 12); // 5 + 3 + 4
1015        assert!(stats.agents.contains_key("claude-code"));
1016        assert!(stats.agents.contains_key("codex"));
1017        assert_eq!(stats.agents["claude-code"].conversations, 2);
1018        assert_eq!(stats.agents["codex"].conversations, 1);
1019    }
1020
1021    /// `coding_agent_session_search-ibuuh.32` (sink #2 equivalence gate):
1022    /// the packet-driven `Statistics::from_packets` must agree with the
1023    /// SQL-driven `AnalyticsGenerator::generate_statistics` on every
1024    /// counted field for the same canonical corpus. Once this passes,
1025    /// callers that already hold packets (e.g. the rebuild pipeline)
1026    /// can derive analytics without paying for per-row SQL aggregations
1027    /// AND operators have a structured proof that the analytics sink
1028    /// is packet-equivalent.
1029    #[test]
1030    fn analytics_statistics_from_packets_matches_sql_for_canonical_corpus() {
1031        use crate::model::conversation_packet::{
1032            ConversationPacket, ConversationPacketMessage, ConversationPacketProvenance,
1033        };
1034        use serde_json::Value;
1035
1036        let (_dir, conn) = create_test_db();
1037        insert_test_data(&conn);
1038
1039        let sql_stats = AnalyticsGenerator::new(&conn)
1040            .generate_statistics()
1041            .unwrap();
1042
1043        // Re-derive the same corpus as a Vec<ConversationPacket> by
1044        // building each packet from canonical replay equivalents. The
1045        // fixture uses the real storage role spelling ("agent") plus a
1046        // multibyte message so role buckets and LENGTH() semantics both
1047        // stay pinned to the SQL surface.
1048        let mut packets: Vec<ConversationPacket> = Vec::new();
1049        let conv_rows: Vec<(i64, String, Option<String>, Option<i64>)> = conn
1050            .query_map_collect(
1051                "SELECT id, agent, source_path, started_at FROM conversations ORDER BY id ASC",
1052                &[],
1053                |row: &Row| {
1054                    Ok((
1055                        row.get_typed::<i64>(0)?,
1056                        row.get_typed::<String>(1)?,
1057                        row.get_typed::<Option<String>>(2)?,
1058                        row.get_typed::<Option<i64>>(3)?,
1059                    ))
1060                },
1061            )
1062            .unwrap();
1063
1064        for (conv_id, agent, source_path, started_at) in conv_rows {
1065            let msg_rows: Vec<(i64, String, String, Option<i64>)> = conn
1066                .query_map_collect(
1067                    "SELECT idx, role, content, created_at
1068                     FROM messages
1069                     WHERE conversation_id = ?1
1070                     ORDER BY idx ASC",
1071                    &[frankensqlite::compat::ParamValue::from(conv_id)],
1072                    |row: &Row| {
1073                        Ok((
1074                            row.get_typed::<i64>(0)?,
1075                            row.get_typed::<String>(1)?,
1076                            row.get_typed::<String>(2)?,
1077                            row.get_typed::<Option<i64>>(3)?,
1078                        ))
1079                    },
1080                )
1081                .unwrap();
1082
1083            // Build packets through the canonical_replay payload shape
1084            // by hand: the hash details don't matter for equivalence
1085            // here, only the projections + identity + timestamps fields
1086            // the analytics derivation reads.
1087            use crate::model::types::{
1088                Conversation, Message, MessageRole, Snippet as CanonicalSnippet,
1089            };
1090            let _ = CanonicalSnippet {
1091                id: None,
1092                file_path: None,
1093                start_line: None,
1094                end_line: None,
1095                language: None,
1096                snippet_text: None,
1097            };
1098            let canonical = Conversation {
1099                id: Some(conv_id),
1100                agent_slug: agent.clone(),
1101                workspace: None,
1102                external_id: None,
1103                title: None,
1104                source_path: source_path
1105                    .map(std::path::PathBuf::from)
1106                    .unwrap_or_else(|| std::path::PathBuf::from(format!("/tmp/conv-{conv_id}"))),
1107                started_at,
1108                ended_at: None,
1109                approx_tokens: None,
1110                metadata_json: Value::Null,
1111                source_id: "local".to_string(),
1112                origin_host: None,
1113                messages: msg_rows
1114                    .into_iter()
1115                    .map(|(idx, role, content, created_at)| Message {
1116                        id: None,
1117                        idx,
1118                        role: match role.as_str() {
1119                            "user" => MessageRole::User,
1120                            "agent" | "assistant" => MessageRole::Agent,
1121                            "tool" => MessageRole::Tool,
1122                            "system" => MessageRole::System,
1123                            other => MessageRole::Other(other.to_string()),
1124                        },
1125                        author: None,
1126                        created_at,
1127                        content,
1128                        extra_json: Value::Null,
1129                        snippets: Vec::new(),
1130                    })
1131                    .collect(),
1132            };
1133            packets.push(ConversationPacket::from_canonical_replay(
1134                &canonical,
1135                ConversationPacketProvenance::local(),
1136            ));
1137            // Sanity check: every packet message must mirror the
1138            // ConversationPacketMessage shape so analytics projections
1139            // are well-formed (catches `MessageRole::Other` regressions).
1140            for msg in &packets.last().unwrap().payload.messages {
1141                let _: &ConversationPacketMessage = msg;
1142            }
1143        }
1144
1145        let mut packet_stats = Statistics::from_packets(&packets);
1146        // The two paths obviously stamp different `computed_at`
1147        // timestamps; pin the SQL one onto the packet result so the
1148        // remaining fields can be compared structurally.
1149        packet_stats.computed_at = sql_stats.computed_at.clone();
1150
1151        assert_eq!(
1152            packet_stats.total_conversations, sql_stats.total_conversations,
1153            "packet path total_conversations must match SQL path"
1154        );
1155        assert_eq!(
1156            packet_stats.total_messages, sql_stats.total_messages,
1157            "packet path total_messages must match SQL path (12 = 5+3+4)"
1158        );
1159        assert_eq!(
1160            packet_stats.total_characters, sql_stats.total_characters,
1161            "packet path total_characters must match SUM(LENGTH(content))"
1162        );
1163        assert_eq!(
1164            packet_stats.agents, sql_stats.agents,
1165            "per-agent (conversations, messages) buckets must match"
1166        );
1167        assert_eq!(
1168            packet_stats.roles, sql_stats.roles,
1169            "role-count buckets must agree (user/assistant)"
1170        );
1171        assert_eq!(
1172            packet_stats.time_range.earliest, sql_stats.time_range.earliest,
1173            "earliest started_at must round-trip identically through DateTime::from_timestamp_millis"
1174        );
1175        assert_eq!(
1176            packet_stats.time_range.latest, sql_stats.time_range.latest,
1177            "latest started_at must round-trip identically"
1178        );
1179        // Final structural check: the two structs must be byte-for-byte
1180        // equal once `computed_at` is normalized. JSON serialization is
1181        // the strongest portable equality contract for Statistics.
1182        let sql_json = serde_json::to_string(&sql_stats).unwrap();
1183        let packet_json = serde_json::to_string(&packet_stats).unwrap();
1184        assert_eq!(
1185            sql_json, packet_json,
1186            "SQL-driven and packet-driven Statistics must serialize identically"
1187        );
1188    }
1189
1190    #[test]
1191    fn test_timeline_aggregation() {
1192        let daily = vec![
1193            DailyEntry {
1194                date: "2024-01-01".into(),
1195                messages: 10,
1196                conversations: 1,
1197            },
1198            DailyEntry {
1199                date: "2024-01-02".into(),
1200                messages: 20,
1201                conversations: 2,
1202            },
1203            DailyEntry {
1204                date: "2024-01-08".into(),
1205                messages: 15,
1206                conversations: 1,
1207            },
1208        ];
1209
1210        let weekly = aggregate_to_weekly(&daily);
1211        assert_eq!(weekly.len(), 2); // Week 1 and Week 2
1212
1213        let monthly = aggregate_to_monthly(&daily);
1214        assert_eq!(monthly.len(), 1);
1215        assert_eq!(monthly[0].messages, 45); // 10 + 20 + 15
1216    }
1217
1218    #[test]
1219    fn test_timeline_aggregation_saturates_counter_arithmetic() {
1220        let daily = vec![
1221            DailyEntry {
1222                date: "2024-01-01".into(),
1223                messages: usize::MAX,
1224                conversations: usize::MAX,
1225            },
1226            DailyEntry {
1227                date: "2024-01-02".into(),
1228                messages: 1,
1229                conversations: 1,
1230            },
1231        ];
1232
1233        let weekly = aggregate_to_weekly(&daily);
1234        assert_eq!(weekly[0].messages, usize::MAX);
1235        assert_eq!(weekly[0].conversations, usize::MAX);
1236
1237        let monthly = aggregate_to_monthly(&daily);
1238        assert_eq!(monthly[0].messages, usize::MAX);
1239        assert_eq!(monthly[0].conversations, usize::MAX);
1240    }
1241
1242    #[test]
1243    fn precomputed_weekly_and_monthly_timelines_count_distinct_conversations() {
1244        let (_dir, conn) = create_test_db();
1245
1246        conn.execute(
1247            "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
1248             VALUES (1, 'codex', '/tmp/project', 'Multi-day conversation', '/path/one.jsonl', 1704067200000, 2)",
1249        )
1250        .unwrap();
1251        conn.execute(
1252            "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
1253             VALUES (2, 'codex', '/tmp/project', 'Second conversation', '/path/two.jsonl', 1704153600000, 1)",
1254        )
1255        .unwrap();
1256
1257        for (conv_id, idx, created_at) in [
1258            (1_i64, 0_i64, 1_704_067_200_000_i64),
1259            (1_i64, 1_i64, 1_704_153_600_000_i64),
1260            (2_i64, 0_i64, 1_704_153_600_000_i64),
1261        ] {
1262            conn.execute_compat(
1263                "INSERT INTO messages (conversation_id, idx, role, content, created_at)
1264                 VALUES (?1, ?2, 'assistant', 'message', ?3)",
1265                frankensqlite::params![conv_id, idx, created_at],
1266            )
1267            .unwrap();
1268        }
1269
1270        let timeline = AnalyticsGenerator::new(&conn).generate_timeline().unwrap();
1271
1272        assert_eq!(timeline.daily.len(), 2);
1273        assert_eq!(timeline.daily[0].conversations, 1);
1274        assert_eq!(timeline.daily[1].conversations, 2);
1275        assert_eq!(timeline.weekly.len(), 1);
1276        assert_eq!(timeline.weekly[0].messages, 3);
1277        assert_eq!(
1278            timeline.weekly[0].conversations, 2,
1279            "a conversation with messages on two days in the same ISO week must be counted once"
1280        );
1281        assert_eq!(timeline.monthly.len(), 1);
1282        assert_eq!(timeline.monthly[0].messages, 3);
1283        assert_eq!(
1284            timeline.monthly[0].conversations, 2,
1285            "a conversation with messages on two days in the same month must be counted once"
1286        );
1287
1288        let codex = timeline
1289            .by_agent
1290            .get("codex")
1291            .expect("codex agent timeline should exist");
1292        assert_eq!(codex.weekly[0].conversations, 2);
1293        assert_eq!(codex.monthly[0].conversations, 2);
1294    }
1295
1296    #[test]
1297    fn test_top_terms_extraction() {
1298        let (_dir, conn) = create_test_db();
1299        insert_test_data(&conn);
1300
1301        let generator = AnalyticsGenerator::new(&conn);
1302        let top = generator.generate_top_terms().unwrap();
1303
1304        // "authentication" appears in 2 titles
1305        assert!(
1306            top.terms
1307                .iter()
1308                .any(|(term, count)| term == "authentication" && *count >= 2)
1309        );
1310    }
1311
1312    #[test]
1313    fn test_top_terms_tie_break_alphabetically_for_deterministic_json() {
1314        let (_dir, conn) = create_test_db();
1315
1316        for (id, title) in [(1_i64, "banana"), (2_i64, "apple"), (3_i64, "cherry")] {
1317            let source_path = format!("/path/{id}.jsonl");
1318            conn.execute_compat(
1319                "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
1320                 VALUES (?1, 'codex', '/tmp/project', ?2, ?3, 1704067200000, 0)",
1321                frankensqlite::params![id, title, source_path.as_str()],
1322            )
1323            .unwrap();
1324        }
1325
1326        let top = AnalyticsGenerator::new(&conn).generate_top_terms().unwrap();
1327
1328        assert_eq!(
1329            top.terms,
1330            vec![
1331                ("apple".to_string(), 1),
1332                ("banana".to_string(), 1),
1333                ("cherry".to_string(), 1),
1334            ]
1335        );
1336    }
1337
1338    #[test]
1339    fn test_workspace_summary() {
1340        let (_dir, conn) = create_test_db();
1341        insert_test_data(&conn);
1342
1343        let generator = AnalyticsGenerator::new(&conn);
1344        let summary = generator.generate_workspace_summary().unwrap();
1345
1346        assert_eq!(summary.workspaces.len(), 2);
1347
1348        // project-a has 2 conversations
1349        let project_a = summary
1350            .workspaces
1351            .iter()
1352            .find(|w| w.path.contains("project-a"));
1353        assert!(project_a.is_some());
1354        assert_eq!(project_a.unwrap().conversations, 2);
1355    }
1356
1357    #[test]
1358    fn test_agent_summary() {
1359        let (_dir, conn) = create_test_db();
1360        insert_test_data(&conn);
1361
1362        let generator = AnalyticsGenerator::new(&conn);
1363        let summary = generator.generate_agent_summary().unwrap();
1364
1365        assert_eq!(summary.agents.len(), 2);
1366
1367        let claude = summary.agents.iter().find(|a| a.name == "claude-code");
1368        assert!(claude.is_some());
1369        assert_eq!(claude.unwrap().conversations, 2);
1370        assert_eq!(claude.unwrap().messages, 8); // 5 + 3
1371    }
1372
1373    #[test]
1374    fn test_workspace_summary_distinct_agents_and_recent_titles() {
1375        let (_dir, conn) = create_test_db();
1376        insert_test_data(&conn);
1377
1378        let generator = AnalyticsGenerator::new(&conn);
1379        let summary = generator.generate_workspace_summary().unwrap();
1380
1381        let project_a = summary
1382            .workspaces
1383            .iter()
1384            .find(|w| w.path == "/home/user/project-a")
1385            .expect("project-a workspace should exist");
1386
1387        assert_eq!(project_a.messages, 8); // 5 + 3
1388        assert_eq!(project_a.agents, vec!["claude-code".to_string()]);
1389        assert_eq!(project_a.recent_titles.len(), 2);
1390        assert_eq!(
1391            project_a.recent_titles.first().map(String::as_str),
1392            Some("Fix database connection")
1393        );
1394    }
1395
1396    #[test]
1397    fn test_agent_summary_high_cardinality_distribution() {
1398        let (_dir, conn) = create_test_db();
1399
1400        let mut conv_id: i64 = 1;
1401
1402        // High-cardinality main agent across many workspaces.
1403        for i in 0..40 {
1404            let workspace = format!("/home/user/ws-{}", i % 10);
1405            let started_at = 1_700_000_000_000i64 + i as i64 * 1_000;
1406            let title = format!("Claude conversation {}", i);
1407            let source = format!("/path/{}.jsonl", conv_id);
1408            conn.execute_compat(
1409                "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
1410                 VALUES (?1, 'claude-code', ?2, ?3, ?4, ?5, 1)",
1411                frankensqlite::params![
1412                    conv_id,
1413                    workspace.as_str(),
1414                    title.as_str(),
1415                    source.as_str(),
1416                    started_at
1417                ],
1418            )
1419            .unwrap();
1420            let content = format!("message {}", i);
1421            conn.execute_compat(
1422                "INSERT INTO messages (conversation_id, idx, role, content, created_at)
1423                 VALUES (?1, 0, 'assistant', ?2, ?3)",
1424                frankensqlite::params![conv_id, content.as_str(), started_at],
1425            )
1426            .unwrap();
1427            conv_id += 1;
1428        }
1429
1430        // Secondary agent with lower cardinality.
1431        for i in 0..5 {
1432            let started_at = 1_700_100_000_000i64 + i as i64 * 1_000;
1433            let title = format!("Codex conversation {}", i);
1434            let source = format!("/path/{}.jsonl", conv_id);
1435            conn.execute_compat(
1436                "INSERT INTO conversations (id, agent, workspace, title, source_path, started_at, message_count)
1437                 VALUES (?1, 'codex', '/home/user/codex-ws', ?2, ?3, ?4, 1)",
1438                frankensqlite::params![
1439                    conv_id,
1440                    title.as_str(),
1441                    source.as_str(),
1442                    started_at
1443                ],
1444            )
1445            .unwrap();
1446            let content = format!("codex {}", i);
1447            conn.execute_compat(
1448                "INSERT INTO messages (conversation_id, idx, role, content, created_at)
1449                 VALUES (?1, 0, 'assistant', ?2, ?3)",
1450                frankensqlite::params![conv_id, content.as_str(), started_at],
1451            )
1452            .unwrap();
1453            conv_id += 1;
1454        }
1455
1456        let generator = AnalyticsGenerator::new(&conn);
1457        let summary = generator.generate_agent_summary().unwrap();
1458
1459        let claude = summary
1460            .agents
1461            .iter()
1462            .find(|a| a.name == "claude-code")
1463            .expect("claude-code agent should exist");
1464        assert_eq!(claude.conversations, 40);
1465        assert_eq!(claude.messages, 40);
1466        assert_eq!(claude.workspaces.len(), 10);
1467        assert!((claude.avg_messages_per_conversation - 1.0).abs() < f64::EPSILON);
1468
1469        let codex = summary
1470            .agents
1471            .iter()
1472            .find(|a| a.name == "codex")
1473            .expect("codex agent should exist");
1474        assert_eq!(codex.conversations, 5);
1475        assert_eq!(codex.messages, 5);
1476    }
1477
1478    #[test]
1479    fn test_bundle_write() {
1480        let (_dir, conn) = create_test_db();
1481        insert_test_data(&conn);
1482
1483        let generator = AnalyticsGenerator::new(&conn);
1484        let bundle = generator.generate_all().unwrap();
1485
1486        let output_dir = TempDir::new().unwrap();
1487        bundle.write_to_dir(output_dir.path()).unwrap();
1488
1489        // Verify files exist
1490        assert!(output_dir.path().join("statistics.json").exists());
1491        assert!(output_dir.path().join("timeline.json").exists());
1492        assert!(output_dir.path().join("workspace_summary.json").exists());
1493        assert!(output_dir.path().join("agent_summary.json").exists());
1494        assert!(output_dir.path().join("top_terms.json").exists());
1495    }
1496
1497    #[test]
1498    fn test_generate_all() {
1499        let (_dir, conn) = create_test_db();
1500        insert_test_data(&conn);
1501
1502        let generator = AnalyticsGenerator::new(&conn);
1503        let bundle = generator.generate_all().unwrap();
1504
1505        // Verify bundle contains all parts
1506        assert_eq!(bundle.statistics.total_conversations, 3);
1507        assert!(!bundle.timeline.daily.is_empty() || bundle.timeline.monthly.is_empty());
1508        assert!(!bundle.workspace_summary.workspaces.is_empty());
1509        assert!(!bundle.agent_summary.agents.is_empty());
1510        // top_terms might be empty depending on stop word filtering
1511    }
1512
1513    #[test]
1514    fn test_empty_database() {
1515        let (_dir, conn) = create_test_db();
1516        // Don't insert any data
1517
1518        let generator = AnalyticsGenerator::new(&conn);
1519        let bundle = generator.generate_all().unwrap();
1520
1521        assert_eq!(bundle.statistics.total_conversations, 0);
1522        assert_eq!(bundle.statistics.total_messages, 0);
1523        assert!(bundle.timeline.daily.is_empty());
1524        assert!(bundle.workspace_summary.workspaces.is_empty());
1525        assert!(bundle.agent_summary.agents.is_empty());
1526        assert!(bundle.top_terms.terms.is_empty());
1527    }
1528}