1use std::collections::{HashMap, HashSet};
10use std::io::{BufRead, BufReader};
11use std::path::{Path, PathBuf};
12use std::sync::Arc;
13
14use serde::{Deserialize, Serialize};
15use tokio::sync::Semaphore;
16
/// Tunable knobs for the pattern-discovery scan.
#[derive(Debug, Clone)]
pub struct DiscoverConfig {
    /// Only sessions whose file mtime falls within this many days are scanned.
    pub since_days: u32,
    /// Minimum occurrences (and distinct sessions) for a pattern to be reported.
    pub min_count: usize,
    /// Maximum number of suggestions returned.
    pub top: usize,
    /// NOTE(review): not read anywhere in this module — presumably toggles
    /// scanning all projects vs. only the current one; confirm against caller.
    pub all_projects: bool,
}
29
30impl Default for DiscoverConfig {
31 fn default() -> Self {
32 Self {
33 since_days: 90,
34 min_count: 3,
35 top: 20,
36 all_projects: false,
37 }
38 }
39}
40
/// How a discovered pattern should be captured, chosen from how widely the
/// pattern recurs across sessions (see `discover_patterns`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum SuggestionCategory {
    /// Pattern seen in more than 20% of sessions: an always-on CLAUDE.md rule.
    ClaudeMdRule,
    /// Seen in 5%–20% of sessions: a reusable skill.
    Skill,
    /// Below 5% of sessions: an occasional command.
    Command,
}
51
52impl SuggestionCategory {
53 pub fn as_str(&self) -> &'static str {
54 match self {
55 Self::ClaudeMdRule => "CLAUDE.MD RULE",
56 Self::Skill => "SKILL",
57 Self::Command => "COMMAND",
58 }
59 }
60
61 pub fn icon(&self) -> &'static str {
62 match self {
63 Self::ClaudeMdRule => "📋",
64 Self::Skill => "🧩",
65 Self::Command => "⚡",
66 }
67 }
68}
69
/// One recurring-pattern suggestion produced by `discover_patterns`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiscoverSuggestion {
    /// Representative n-gram of the cluster, joined with spaces.
    pub pattern: String,
    /// Total pooled occurrences across the whole cluster.
    pub count: usize,
    /// Number of distinct sessions the cluster appears in.
    pub session_count: usize,
    /// Number of distinct projects the cluster appears in.
    pub project_count: usize,
    /// True when the pattern spans two or more projects.
    pub cross_project: bool,
    /// Suggested capture mechanism, derived from the session share.
    pub category: SuggestionCategory,
    /// Session share (0..1), ×1.5 cross-project bonus, rounded to 4 decimals.
    pub score: f64,
    /// Up to two sample session ids containing the pattern.
    pub example_sessions: Vec<String>,
}
90
/// Lowercase tokens carrying no pattern signal; filtered out by
/// `normalize_text` via `is_stop_word`. Mix of English function words plus
/// generic prompt verbs/nouns ("add", "file", "create", …).
/// FIX: removed duplicate entries "can" and "also" that appeared twice.
static STOP_WORDS: &[&str] = &[
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
    "from", "is", "it", "its", "be", "as", "was", "are", "were", "been", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "shall",
    "this", "that", "these", "those", "i", "you", "we", "they", "he", "she", "my", "your", "our",
    "their", "his", "her", "me", "us", "them", "so", "if", "then", "than", "when", "what", "how",
    "why", "where", "who", "which", "not", "no", "also", "just", "now", "up", "out", "about",
    "into", "after", "before", "all", "any", "some", "more", "new", "add", "use", "make", "get",
    "go", "run", "see", "here", "there", "need", "want", "please", "ok", "okay", "yes", "yeah",
    "let", "help", "look", "check", "same", "like", "very", "much", "only", "other",
    "each", "file", "code", "create", "update", "change", "think", "know", "give", "take", "put",
    "keep",
];
108
/// Lowercased substrings that mark machine-generated "user" entries
/// (session-continuation preambles, plan-mode notices, agent-team prompts).
/// `is_system_injection` matches them case-insensitively so such messages are
/// excluded from pattern mining.
static SYSTEM_INJECTION_MARKERS: &[&str] = &[
    "this session is being continued",
    "read the full transcript",
    "context summary below covers",
    "exact snippets error messages content",
    "exiting plan mode",
    "task tools haven",
    "teamcreate tool team parallelize",
];
118
119fn is_stop_word(token: &str) -> bool {
120 STOP_WORDS.contains(&token)
121}
122
123fn is_system_injection(text: &str) -> bool {
124 let lower = text.to_lowercase();
125 SYSTEM_INJECTION_MARKERS
126 .iter()
127 .any(|marker| lower.contains(marker))
128}
129
130pub fn normalize_text(text: &str) -> Vec<String> {
137 let lower = text.to_lowercase();
138 let clean: String = lower
140 .chars()
141 .map(|c| {
142 if c.is_alphanumeric() || c == '-' {
143 c
144 } else {
145 ' '
146 }
147 })
148 .collect();
149
150 clean
151 .split_whitespace()
152 .filter(|t| t.len() > 2 && !is_stop_word(t))
153 .map(|t| t.to_string())
154 .collect()
155}
156
/// Returns every contiguous run of `n` tokens, in order of appearance.
/// Yields nothing when `tokens` has fewer than `n` elements.
pub fn extract_ngrams(tokens: &[String], n: usize) -> Vec<Vec<String>> {
    if tokens.len() < n {
        return Vec::new();
    }
    let mut grams = Vec::with_capacity(tokens.len() - n + 1);
    for start in 0..=(tokens.len() - n) {
        grams.push(tokens[start..start + n].to_vec());
    }
    grams
}
166
/// Jaccard similarity (|A∩B| / |A∪B|) between the two token lists, treating
/// each as a set. Returns 0.0 when either list is empty.
///
/// FIX: removed the unreachable `union == 0` branch — after the emptiness
/// guard both sets are non-empty, so the union is at least 1. The union is
/// now computed by inclusion–exclusion, avoiding a second set traversal.
pub fn jaccard_overlap(a: &[String], b: &[String]) -> f64 {
    if a.is_empty() || b.is_empty() {
        return 0.0;
    }
    let set_a: HashSet<&str> = a.iter().map(String::as_str).collect();
    let set_b: HashSet<&str> = b.iter().map(String::as_str).collect();
    let intersection = set_a.intersection(&set_b).count();
    // |A ∪ B| = |A| + |B| − |A ∩ B|; guaranteed ≥ 1 here.
    let union = set_a.len() + set_b.len() - intersection;
    intersection as f64 / union as f64
}
182
/// All user messages extracted from a single session transcript.
#[derive(Debug, Clone)]
pub struct SessionData {
    /// Session id (the `.jsonl` file stem).
    pub session_id: String,
    /// Name of the project directory the session file lives under.
    pub project: String,
    /// Human-authored messages that survived filtering.
    pub messages: Vec<String>,
}
194
195fn extract_user_content(line: &str) -> Option<String> {
197 #[derive(Deserialize)]
198 struct MessageContent {
199 #[serde(rename = "type")]
200 msg_type: Option<String>,
201 content: Option<serde_json::Value>,
202 }
203 #[derive(Deserialize)]
204 struct Entry {
205 #[serde(rename = "type")]
206 entry_type: Option<String>,
207 message: Option<MessageContent>,
208 }
209
210 let entry: Entry = serde_json::from_str(line).ok()?;
211
212 if entry.entry_type.as_deref() != Some("user") {
213 return None;
214 }
215
216 let message = entry.message?;
217 if message.msg_type.as_deref() != Some("human") && message.msg_type.is_some() {
219 }
222
223 let content = message.content?;
224
225 let text = match &content {
227 serde_json::Value::String(s) => s.clone(),
228 _ => return None,
229 };
230
231 let trimmed = text.trim();
232
233 if trimmed.starts_with('<') {
235 return None;
236 }
237
238 let len = trimmed.len();
240 if !(10..=800).contains(&len) {
241 return None;
242 }
243
244 if is_system_injection(trimmed) {
246 return None;
247 }
248
249 Some(trimmed.to_string())
250}
251
252fn extract_all_user_messages(path: &Path) -> Vec<String> {
256 let file = match std::fs::File::open(path) {
257 Ok(f) => f,
258 Err(_) => return vec![],
259 };
260
261 let reader = BufReader::new(file);
262 let mut messages = Vec::new();
263
264 for line in reader.lines() {
265 let line = match line {
266 Ok(l) => l,
267 Err(_) => continue,
268 };
269 if line.trim().is_empty() {
270 continue;
271 }
272 if let Some(msg) = extract_user_content(&line) {
273 messages.push(msg);
274 }
275 }
276
277 messages
278}
279
/// Collects user messages from every recent session file under `projects_dir`.
///
/// Walks each project sub-directory (optionally filtered by a substring match
/// on the directory name), skips agent sub-session files (`agent-*`) and files
/// whose mtime is older than `since_days` days, then parses the surviving
/// `.jsonl` files concurrently — at most 32 in flight — on the blocking
/// thread pool.
///
/// Sessions yielding no usable user messages are dropped. Returns an empty
/// vec when `projects_dir` is missing or unreadable.
pub async fn collect_sessions_data(
    projects_dir: &Path,
    since_days: u32,
    filter_project: Option<&str>,
) -> Vec<SessionData> {
    if !projects_dir.exists() {
        return vec![];
    }

    // One sub-directory per project; apply the optional name filter here.
    let project_dirs: Vec<PathBuf> = match std::fs::read_dir(projects_dir) {
        Ok(entries) => entries
            .filter_map(|e| e.ok())
            .filter(|e| e.path().is_dir())
            .filter(|e| {
                if let Some(proj) = filter_project {
                    e.file_name().to_string_lossy().contains(proj)
                } else {
                    true
                }
            })
            .map(|e| e.path())
            .collect(),
        Err(_) => return vec![],
    };

    // Recency cutoff as seconds since the Unix epoch, matched against mtimes.
    let cutoff = chrono::Utc::now() - chrono::Duration::days(since_days as i64);
    let cutoff_secs = cutoff.timestamp() as u64;

    // Bound concurrent file parses to 32.
    let semaphore = Arc::new(Semaphore::new(32));
    let mut handles = Vec::new();

    for project_dir in project_dirs {
        let project_name = project_dir
            .file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .to_string();

        // Session transcripts are the `.jsonl` files directly in the project dir.
        let jsonl_files: Vec<PathBuf> = match std::fs::read_dir(&project_dir) {
            Ok(entries) => entries
                .filter_map(|e| e.ok())
                .map(|e| e.path())
                .filter(|p| p.extension().map(|ext| ext == "jsonl").unwrap_or(false))
                .collect(),
            Err(_) => continue,
        };

        for filepath in jsonl_files {
            // The file stem is the session id.
            let session_id = filepath
                .file_stem()
                .unwrap_or_default()
                .to_string_lossy()
                .to_string();

            // Skip sub-agent transcripts; only top-level sessions are mined.
            if session_id.starts_with("agent-") {
                continue;
            }

            // Missing/invalid mtime falls back to 0, i.e. treated as too old.
            let mtime = match filepath.metadata() {
                Ok(m) => m
                    .modified()
                    .ok()
                    .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                    .map(|d| d.as_secs())
                    .unwrap_or(0),
                Err(_) => continue,
            };

            if mtime < cutoff_secs {
                continue;
            }

            let sem = Arc::clone(&semaphore);
            let filepath_clone = filepath.clone();
            let project_name_clone = project_name.clone();
            let session_id_clone = session_id.clone();

            let handle = tokio::spawn(async move {
                // Permit caps concurrency; released when the task finishes.
                let _permit = sem.acquire().await.ok()?;
                let path = filepath_clone.clone();
                // File parsing is synchronous I/O — keep it off the async runtime.
                let messages =
                    tokio::task::spawn_blocking(move || extract_all_user_messages(&path))
                        .await
                        .ok()?;

                if messages.is_empty() {
                    return None;
                }

                Some(SessionData {
                    session_id: session_id_clone,
                    project: project_name_clone,
                    messages,
                })
            });

            handles.push(handle);
        }
    }

    // Await all tasks; failed or empty sessions are silently dropped.
    let mut result = Vec::new();
    for handle in handles {
        if let Ok(Some(data)) = handle.await {
            result.push(data);
        }
    }

    result
}
403
/// One sighting of an n-gram: the session and project it occurred in.
#[derive(Debug, Clone)]
struct NgramOccurrence {
    session_id: String,
    project: String,
}
414
415pub fn discover_patterns(
417 sessions_data: &[SessionData],
418 min_count: usize,
419 top: usize,
420) -> Vec<DiscoverSuggestion> {
421 let total_sessions = sessions_data.len();
422 if total_sessions == 0 {
423 return vec![];
424 }
425
426 let mut ngram_index: HashMap<Vec<String>, Vec<NgramOccurrence>> = HashMap::new();
428
429 for sd in sessions_data {
430 for msg in &sd.messages {
431 let tokens = normalize_text(msg);
432 if tokens.len() < 3 {
433 continue;
434 }
435
436 for n in 3..=6usize {
437 for ngram in extract_ngrams(&tokens, n) {
438 ngram_index.entry(ngram).or_default().push(NgramOccurrence {
439 session_id: sd.session_id.clone(),
440 project: sd.project.clone(),
441 });
442 }
443 }
444 }
445 }
446
447 let frequent: HashMap<Vec<String>, Vec<NgramOccurrence>> = ngram_index
449 .into_iter()
450 .filter(|(_, occs)| occs.len() >= min_count)
451 .collect();
452
453 let mut sorted_ngrams: Vec<(Vec<String>, Vec<NgramOccurrence>)> =
455 frequent.into_iter().collect();
456
457 sorted_ngrams.sort_by(|a, b| b.0.len().cmp(&a.0.len()).then(b.1.len().cmp(&a.1.len())));
459
460 let mut kept: Vec<(Vec<String>, Vec<NgramOccurrence>)> = Vec::new();
461 let mut subsumed: HashSet<Vec<String>> = HashSet::new();
462
463 for (ngram, occs) in sorted_ngrams {
464 if subsumed.contains(&ngram) {
465 continue;
466 }
467 for sub_n in 3..ngram.len() {
469 let end = ngram.len() - sub_n + 1;
470 for i in 0..end {
471 let sub = ngram[i..i + sub_n].to_vec();
472 subsumed.insert(sub);
473 }
474 }
475 kept.push((ngram, occs));
476 }
477
478 let mut clusters: Vec<Vec<usize>> = Vec::new();
480 let mut assigned: HashSet<usize> = HashSet::new();
481
482 for i in 0..kept.len() {
483 if assigned.contains(&i) {
484 continue;
485 }
486 let mut cluster = vec![i];
487 for j in (i + 1)..kept.len() {
488 if assigned.contains(&j) {
489 continue;
490 }
491 let overlap = jaccard_overlap(&kept[i].0, &kept[j].0);
492 if overlap > 0.6 {
493 cluster.push(j);
494 assigned.insert(j);
495 }
496 }
497 clusters.push(cluster);
498 assigned.insert(i);
499 }
500
501 let mut suggestions = Vec::new();
503
504 for cluster in &clusters {
505 let best_idx = *cluster
507 .iter()
508 .max_by_key(|&&i| (kept[i].0.len(), kept[i].1.len()))
509 .unwrap();
510
511 let (ref ngram, _) = kept[best_idx];
512
513 let mut all_occurrences: Vec<&NgramOccurrence> = Vec::new();
515 for &idx in cluster {
516 all_occurrences.extend(kept[idx].1.iter());
517 }
518
519 let distinct_sessions: HashSet<&str> = all_occurrences
520 .iter()
521 .map(|o| o.session_id.as_str())
522 .collect();
523 let distinct_projects: HashSet<&str> =
524 all_occurrences.iter().map(|o| o.project.as_str()).collect();
525
526 let count = all_occurrences.len();
527 let session_count = distinct_sessions.len();
528 let project_count = distinct_projects.len();
529 let cross_project = project_count >= 2;
530
531 if session_count < min_count {
532 continue;
533 }
534
535 let session_pct = session_count as f64 / total_sessions as f64;
536
537 let category = if session_pct > 0.20 {
538 SuggestionCategory::ClaudeMdRule
539 } else if session_pct >= 0.05 {
540 SuggestionCategory::Skill
541 } else {
542 SuggestionCategory::Command
543 };
544
545 let score = session_pct * if cross_project { 1.5 } else { 1.0 };
546
547 let mut example_sessions: Vec<String> = distinct_sessions
549 .iter()
550 .take(2)
551 .map(|s| s.to_string())
552 .collect();
553 example_sessions.sort(); let pattern = ngram.join(" ");
556
557 suggestions.push(DiscoverSuggestion {
558 pattern,
559 count,
560 session_count,
561 project_count,
562 cross_project,
563 category,
564 score: (score * 10000.0).round() / 10000.0,
565 example_sessions,
566 });
567 }
568
569 suggestions.sort_by(|a, b| {
571 b.score
572 .partial_cmp(&a.score)
573 .unwrap_or(std::cmp::Ordering::Equal)
574 });
575 suggestions.truncate(top);
576 suggestions
577}
578
579pub async fn run_discover(
587 claude_home: &Path,
588 config: &DiscoverConfig,
589 filter_project: Option<&str>,
590) -> anyhow::Result<(Vec<DiscoverSuggestion>, usize, usize)> {
591 let projects_dir = claude_home.join("projects");
592
593 eprint!("\rScanning sessions...");
594 let sessions_data =
595 collect_sessions_data(&projects_dir, config.since_days, filter_project).await;
596
597 let total_sessions = sessions_data.len();
598 let total_projects: HashSet<&str> = sessions_data.iter().map(|s| s.project.as_str()).collect();
599 let total_projects_count = total_projects.len();
600
601 if total_sessions == 0 {
602 eprintln!();
603 return Ok((vec![], 0, 0));
604 }
605
606 eprint!(
607 "\rAnalyzing {} sessions across {} project(s)... ",
608 total_sessions, total_projects_count
609 );
610
611 let sessions_data_clone = sessions_data;
613 let min_count = config.min_count;
614 let top = config.top;
615
616 let suggestions = tokio::task::spawn_blocking(move || {
617 discover_patterns(&sessions_data_clone, min_count, top)
618 })
619 .await
620 .map_err(|e| anyhow::anyhow!("spawn_blocking failed: {}", e))?;
621
622 eprintln!();
623
624 Ok((suggestions, total_sessions, total_projects_count))
625}
626
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_text() {
        let tokens = normalize_text("Add unit tests for the authentication flow!");
        // Content words survive…
        assert!(tokens.contains(&"authentication".to_string()));
        assert!(tokens.contains(&"flow".to_string()));
        assert!(tokens.contains(&"tests".to_string()));
        // …stop words do not.
        assert!(!tokens.contains(&"the".to_string()));
        assert!(!tokens.contains(&"for".to_string()));
        assert!(!tokens.contains(&"add".to_string()));
    }

    #[test]
    fn test_normalize_strips_punctuation() {
        let tokens = normalize_text("Write tests: before implementation.");
        assert!(tokens.contains(&"write".to_string()));
        assert!(tokens.contains(&"tests".to_string()));
        assert!(tokens.contains(&"implementation".to_string()));
        // "before" is a stop word, so it must be filtered out.
        // FIX: removed a tautological `contains || !contains` assertion.
        assert!(!tokens.contains(&"before".to_string()));
    }

    #[test]
    fn test_ngram_extraction() {
        let tokens: Vec<String> = vec![
            "write".into(),
            "tests".into(),
            "authentication".into(),
            "flow".into(),
            "security".into(),
        ];

        let trigrams = extract_ngrams(&tokens, 3);
        assert_eq!(trigrams.len(), 3);
        assert_eq!(trigrams[0], vec!["write", "tests", "authentication"]);

        let six_grams = extract_ngrams(&tokens, 6);
        assert!(six_grams.is_empty(), "tokens shorter than 6 → no 6-grams");

        let tokens6: Vec<String> = vec![
            "a".into(),
            "b".into(),
            "c".into(),
            "d".into(),
            "e".into(),
            "f".into(),
        ];
        let six_grams2 = extract_ngrams(&tokens6, 6);
        assert_eq!(six_grams2.len(), 1);
    }

    #[test]
    fn test_jaccard_overlap() {
        let a: Vec<String> = vec!["write".into(), "tests".into(), "first".into()];
        let b: Vec<String> = vec!["write".into(), "tests".into(), "auth".into()];
        // |{write,tests}| / |{write,tests,first,auth}| = 2/4.
        let overlap = jaccard_overlap(&a, &b);
        assert!((overlap - 0.5).abs() < 1e-9);

        let identical: Vec<String> = vec!["foo".into(), "bar".into()];
        assert!((jaccard_overlap(&identical, &identical) - 1.0).abs() < 1e-9);

        let empty: Vec<String> = vec![];
        assert_eq!(jaccard_overlap(&empty, &a), 0.0);
    }

    #[test]
    fn test_category_threshold() {
        // 22 of 100 sessions share the pattern → 22% > 20% → ClaudeMdRule.
        let mut sessions_data: Vec<SessionData> = Vec::new();
        for i in 0..100 {
            // FIX: build an owned String in both branches; the previous
            // revision leaked a String (`.leak()`) just to unify branch types.
            let msg = if i < 22 {
                "security review authentication flow handling properly".to_string()
            } else {
                format!("some other message number {}", i)
            };
            sessions_data.push(SessionData {
                session_id: format!("session-{:04}", i),
                project: "proj-a".to_string(),
                messages: vec![msg],
            });
        }

        let suggestions = discover_patterns(&sessions_data, 3, 20);
        let security = suggestions.iter().find(|s| s.pattern.contains("security"));
        if let Some(s) = security {
            assert_eq!(
                s.category,
                SuggestionCategory::ClaudeMdRule,
                "22/100 = 22% > 20% → ClaudeMdRule"
            );
        }
    }

    #[test]
    fn test_cross_project_bonus() {
        // The same message in 10 sessions split across two projects.
        let sessions_data: Vec<SessionData> = (0..10)
            .map(|i| SessionData {
                session_id: format!("s{}", i),
                project: if i < 5 { "proj-a" } else { "proj-b" }.to_string(),
                messages: vec!["deploy staging environment pipeline testing".to_string()],
            })
            .collect();

        let suggestions = discover_patterns(&sessions_data, 3, 20);
        assert!(!suggestions.is_empty(), "should find patterns");

        let first = &suggestions[0];
        assert!(first.cross_project, "pattern appears in 2 projects");
        // The 1.5× bonus must lift the score above the raw session share.
        assert!(first.score > first.session_count as f64 / 10.0);
    }
}