Skip to main content

smc_cli_cc/
analytics.rs

1//! Frequency analysis and aggregate statistics across conversation logs.
2
3use crate::config::SessionFile;
4use crate::models;
5use anyhow::Result;
6use colored::*;
7use indicatif::{ProgressBar, ProgressStyle};
8use rayon::prelude::*;
9use std::collections::HashMap;
10use std::io::BufRead;
11use std::sync::atomic::{AtomicU64, Ordering};
12use std::sync::Mutex;
13
/// Render `n` in decimal with a comma between each group of three digits
/// (e.g., `1234567` becomes `"1,234,567"`).
pub fn format_count(n: u64) -> String {
    let digits = n.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);

    for (pos, digit) in digits.chars().enumerate() {
        // A separator precedes any non-leading digit that has a multiple of
        // three digits remaining (itself included).
        if pos > 0 && (len - pos) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }

    grouped
}
26
/// Turn a byte count into a short human-readable size (e.g., "2.85GB").
///
/// Values below 1024 are printed as whole bytes; KB and MB use one decimal
/// place and GB uses two. Units are powers of 1024.
pub fn format_bytes(bytes: u64) -> String {
    const KIB: u64 = 1024;
    const MIB: u64 = KIB * 1024;
    const GIB: u64 = MIB * 1024;

    let value = bytes as f64;
    match bytes {
        b if b < KIB => format!("{}B", b),
        b if b < MIB => format!("{:.1}KB", value / KIB as f64),
        b if b < GIB => format!("{:.1}MB", value / MIB as f64),
        _ => format!("{:.2}GB", value / GIB as f64),
    }
}
39
40fn make_progress_bar(len: u64) -> ProgressBar {
41    let pb = ProgressBar::new(len);
42    pb.set_style(
43        ProgressStyle::default_bar()
44            .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} files")
45            .unwrap()
46            .progress_chars("█▓░"),
47    );
48    pb
49}
50
51/// Print aggregate statistics: total sessions, size, and top projects.
52pub fn print_stats(files: &[SessionFile]) -> Result<()> {
53    let total_files = files.len();
54    let total_size: u64 = files.iter().map(|f| f.size_bytes).sum();
55
56    let mut projects: HashMap<String, (usize, u64)> = HashMap::new();
57    for f in files {
58        let entry = projects.entry(f.project_name.clone()).or_default();
59        entry.0 += 1;
60        entry.1 += f.size_bytes;
61    }
62
63    println!("{}", "smc Stats".bold().cyan());
64    println!("{}", "═".repeat(50));
65    println!("  Total sessions:  {}", total_files.to_string().bold());
66    println!(
67        "  Total size:      {}",
68        format_bytes(total_size).bold()
69    );
70    println!("  Projects:        {}", projects.len().to_string().bold());
71    println!();
72
73    println!("{}", "Top Projects by Size".bold());
74    println!("{}", "─".repeat(50));
75
76    let mut sorted: Vec<_> = projects.into_iter().collect();
77    sorted.sort_by(|a, b| b.1 .1.cmp(&a.1 .1));
78
79    for (name, (count, size)) in sorted.iter().take(15) {
80        println!(
81            "  {:30} {:>4} sessions  {:>8}",
82            name.cyan(),
83            count,
84            format_bytes(*size)
85        );
86    }
87
88    if sorted.len() > 15 {
89        println!("  ... and {} more projects", sorted.len() - 15);
90    }
91
92    Ok(())
93}
94
95/// Print all projects with session counts, sizes, and date ranges.
96pub fn print_projects(files: &[SessionFile]) -> Result<()> {
97    struct ProjectInfo {
98        sessions: usize,
99        total_size: u64,
100        earliest: Option<String>,
101        latest: Option<String>,
102    }
103
104    let mut projects: HashMap<String, ProjectInfo> = HashMap::new();
105
106    for file in files {
107        let entry = projects
108            .entry(file.project_name.clone())
109            .or_insert(ProjectInfo {
110                sessions: 0,
111                total_size: 0,
112                earliest: None,
113                latest: None,
114            });
115        entry.sessions += 1;
116        entry.total_size += file.size_bytes;
117
118        if let Ok(f) = std::fs::File::open(&file.path) {
119            let reader = std::io::BufReader::new(f);
120            for line in reader.lines().take(5) {
121                let Ok(line) = line else { continue };
122                if let Ok(record) = serde_json::from_str::<models::Record>(&line) {
123                    if let Some(msg) = record.as_message_record() {
124                        if let Some(ts) = &msg.timestamp {
125                            let ts_date = ts.get(..10).unwrap_or(ts);
126                            if entry.earliest.is_none()
127                                || entry.earliest.as_deref().unwrap_or("") > ts_date
128                            {
129                                entry.earliest = Some(ts_date.to_string());
130                            }
131                            if entry.latest.is_none()
132                                || entry.latest.as_deref().unwrap_or("") < ts_date
133                            {
134                                entry.latest = Some(ts_date.to_string());
135                            }
136                            break;
137                        }
138                    }
139                }
140            }
141        }
142    }
143
144    let mut sorted: Vec<_> = projects.into_iter().collect();
145    sorted.sort_by(|a, b| {
146        b.1.latest
147            .as_deref()
148            .unwrap_or("")
149            .cmp(a.1.latest.as_deref().unwrap_or(""))
150    });
151
152    println!(
153        "{} projects\n",
154        sorted.len().to_string().bold()
155    );
156
157    for (name, info) in &sorted {
158        let date_range = match (&info.earliest, &info.latest) {
159            (Some(e), Some(l)) if e == l => e.clone(),
160            (Some(e), Some(l)) => format!("{} → {}", e, l),
161            (Some(d), None) | (None, Some(d)) => d.clone(),
162            (None, None) => "unknown".to_string(),
163        };
164
165        println!(
166            "  {:30} {:>4} sessions  {:>8}  {}",
167            name.cyan(),
168            info.sessions,
169            format_bytes(info.total_size),
170            date_range.dimmed()
171        );
172    }
173
174    Ok(())
175}
176
177/// Character frequency analysis on parsed message content.
178pub fn print_freq_chars(files: &[SessionFile]) -> Result<()> {
179    let counts: Vec<AtomicU64> = (0..26).map(|_| AtomicU64::new(0)).collect();
180    let pb = make_progress_bar(files.len() as u64);
181
182    files.par_iter().for_each(|file| {
183        if let Ok(f) = std::fs::File::open(&file.path) {
184            let reader = std::io::BufReader::with_capacity(256 * 1024, f);
185            for line in reader.lines() {
186                let Ok(line) = line else { continue };
187                let Ok(record) = serde_json::from_str::<models::Record>(&line) else { continue };
188                let Some(msg) = record.as_message_record() else { continue };
189                let text = msg.text_content();
190                for b in text.bytes() {
191                    let idx = match b {
192                        b'a'..=b'z' => (b - b'a') as usize,
193                        b'A'..=b'Z' => (b - b'A') as usize,
194                        _ => continue,
195                    };
196                    counts[idx].fetch_add(1, Ordering::Relaxed);
197                }
198            }
199        }
200        pb.inc(1);
201    });
202
203    pb.finish_and_clear();
204    print_char_table(&counts, "parsed content", files);
205    Ok(())
206}
207
208/// Character frequency analysis on raw JSONL bytes.
209pub fn print_freq_chars_raw(files: &[SessionFile]) -> Result<()> {
210    let counts: Vec<AtomicU64> = (0..26).map(|_| AtomicU64::new(0)).collect();
211    let pb = make_progress_bar(files.len() as u64);
212
213    files.par_iter().for_each(|file| {
214        if let Ok(data) = std::fs::read(&file.path) {
215            for &b in &data {
216                let idx = match b {
217                    b'a'..=b'z' => (b - b'a') as usize,
218                    b'A'..=b'Z' => (b - b'A') as usize,
219                    _ => continue,
220                };
221                counts[idx].fetch_add(1, Ordering::Relaxed);
222            }
223        }
224        pb.inc(1);
225    });
226
227    pb.finish_and_clear();
228    print_char_table(&counts, "raw JSONL bytes", files);
229    Ok(())
230}
231
232fn print_char_table(counts: &[AtomicU64], label: &str, files: &[SessionFile]) {
233    let totals: Vec<u64> = counts.iter().map(|c| c.load(Ordering::Relaxed)).collect();
234    let max_count = *totals.iter().max().unwrap_or(&1);
235    let grand_total: u64 = totals.iter().sum();
236
237    println!("{}", format!("Character Frequency (a-z, case-insensitive, {})", label).bold().cyan());
238    println!("{}", "═".repeat(60));
239
240    for (i, count) in totals.iter().enumerate() {
241        let letter = (b'a' + i as u8) as char;
242        let bar_len = (*count as f64 / max_count as f64 * 40.0) as usize;
243        let bar = "█".repeat(bar_len);
244        let pct = *count as f64 / grand_total as f64 * 100.0;
245        println!(
246            "  {}  {:>12}  ({:>5.2}%)  {}",
247            letter.to_string().bold(),
248            format_count(*count),
249            pct,
250            bar.cyan()
251        );
252    }
253
254    println!("{}", "─".repeat(60));
255    println!(
256        "  Total: {}  across {} files ({})",
257        format_count(grand_total).bold(),
258        files.len(),
259        format_bytes(files.iter().map(|f| f.size_bytes).sum())
260    );
261}
262
263/// Word frequency analysis across parsed message content.
264pub fn print_freq_words(files: &[SessionFile], limit: usize) -> Result<()> {
265    let word_counts: Mutex<HashMap<String, u64>> = Mutex::new(HashMap::new());
266    let pb = make_progress_bar(files.len() as u64);
267
268    files.par_iter().for_each(|file| {
269        let mut local: HashMap<String, u64> = HashMap::new();
270        if let Ok(f) = std::fs::File::open(&file.path) {
271            let reader = std::io::BufReader::with_capacity(256 * 1024, f);
272            for line in reader.lines() {
273                let Ok(line) = line else { continue };
274                let Ok(record) = serde_json::from_str::<models::Record>(&line) else { continue };
275                let Some(msg) = record.as_message_record() else { continue };
276                let text = msg.text_content();
277                for word in text.split(|c: char| !c.is_alphanumeric()) {
278                    if word.len() >= 3 {
279                        *local.entry(word.to_lowercase()).or_default() += 1;
280                    }
281                }
282            }
283        }
284        let mut global = word_counts.lock().unwrap();
285        for (word, count) in local {
286            *global.entry(word).or_default() += count;
287        }
288        pb.inc(1);
289    });
290
291    pb.finish_and_clear();
292
293    let counts = word_counts.into_inner().unwrap();
294    let mut sorted: Vec<_> = counts.into_iter().collect();
295    sorted.sort_by(|a, b| b.1.cmp(&a.1));
296
297    let max_count = sorted.first().map(|(_, c)| *c).unwrap_or(1);
298
299    println!("{}", "Word Frequency (top words, 3+ chars)".bold().cyan());
300    println!("{}", "═".repeat(60));
301
302    for (word, count) in sorted.iter().take(limit) {
303        let bar_len = (*count as f64 / max_count as f64 * 30.0) as usize;
304        let bar = "█".repeat(bar_len);
305        println!("  {:20} {:>12}  {}", word.bold(), format_count(*count), bar.cyan());
306    }
307
308    let grand_total: u64 = sorted.iter().map(|(_, c)| c).sum();
309    println!("{}", "─".repeat(60));
310    println!("  {} unique words, {} total occurrences", format_count(sorted.len() as u64), format_count(grand_total));
311
312    Ok(())
313}
314
315/// Tool usage frequency analysis.
316pub fn print_freq_tools(files: &[SessionFile], limit: usize) -> Result<()> {
317    let tool_counts: Mutex<HashMap<String, u64>> = Mutex::new(HashMap::new());
318    let pb = make_progress_bar(files.len() as u64);
319
320    files.par_iter().for_each(|file| {
321        let mut local: HashMap<String, u64> = HashMap::new();
322        if let Ok(f) = std::fs::File::open(&file.path) {
323            let reader = std::io::BufReader::with_capacity(256 * 1024, f);
324            for line in reader.lines() {
325                let Ok(line) = line else { continue };
326                let Ok(record) = serde_json::from_str::<models::Record>(&line) else { continue };
327                let Some(msg) = record.as_message_record() else { continue };
328                for tool in msg.tool_calls() {
329                    *local.entry(tool.to_string()).or_default() += 1;
330                }
331            }
332        }
333        let mut global = tool_counts.lock().unwrap();
334        for (tool, count) in local {
335            *global.entry(tool).or_default() += count;
336        }
337        pb.inc(1);
338    });
339
340    pb.finish_and_clear();
341
342    let counts = tool_counts.into_inner().unwrap();
343    let mut sorted: Vec<_> = counts.into_iter().collect();
344    sorted.sort_by(|a, b| b.1.cmp(&a.1));
345
346    let max_count = sorted.first().map(|(_, c)| *c).unwrap_or(1);
347    let grand_total: u64 = sorted.iter().map(|(_, c)| c).sum();
348
349    println!("{}", "Tool Usage Frequency".bold().cyan());
350    println!("{}", "═".repeat(60));
351
352    for (tool, count) in sorted.iter().take(limit) {
353        let bar_len = (*count as f64 / max_count as f64 * 30.0) as usize;
354        let bar = "█".repeat(bar_len);
355        let pct = *count as f64 / grand_total as f64 * 100.0;
356        println!("  {:20} {:>10}  ({:>5.1}%)  {}", tool.bold(), format_count(*count), pct, bar.cyan());
357    }
358
359    println!("{}", "─".repeat(60));
360    println!("  {} total tool calls", format_count(grand_total));
361
362    Ok(())
363}
364
365/// Message role frequency analysis.
366pub fn print_freq_roles(files: &[SessionFile]) -> Result<()> {
367    let role_counts: Mutex<HashMap<String, u64>> = Mutex::new(HashMap::new());
368    let pb = make_progress_bar(files.len() as u64);
369
370    files.par_iter().for_each(|file| {
371        let mut local: HashMap<String, u64> = HashMap::new();
372        if let Ok(f) = std::fs::File::open(&file.path) {
373            let reader = std::io::BufReader::with_capacity(256 * 1024, f);
374            for line in reader.lines() {
375                let Ok(line) = line else { continue };
376                let Ok(record) = serde_json::from_str::<models::Record>(&line) else { continue };
377                if record.is_message() {
378                    *local.entry(record.role_str().to_string()).or_default() += 1;
379                }
380            }
381        }
382        let mut global = role_counts.lock().unwrap();
383        for (role, count) in local {
384            *global.entry(role).or_default() += count;
385        }
386        pb.inc(1);
387    });
388
389    pb.finish_and_clear();
390
391    let counts = role_counts.into_inner().unwrap();
392    let mut sorted: Vec<_> = counts.into_iter().collect();
393    sorted.sort_by(|a, b| b.1.cmp(&a.1));
394
395    let max_count = sorted.first().map(|(_, c)| *c).unwrap_or(1);
396    let grand_total: u64 = sorted.iter().map(|(_, c)| c).sum();
397
398    println!("{}", "Message Role Frequency".bold().cyan());
399    println!("{}", "═".repeat(60));
400
401    for (role, count) in &sorted {
402        let bar_len = (*count as f64 / max_count as f64 * 40.0) as usize;
403        let bar = "█".repeat(bar_len);
404        let pct = *count as f64 / grand_total as f64 * 100.0;
405        println!("  {:20} {:>10}  ({:>5.1}%)  {}", role.bold(), format_count(*count), pct, bar.cyan());
406    }
407
408    println!("{}", "─".repeat(60));
409    println!("  {} total messages", format_count(grand_total));
410
411    Ok(())
412}