use anyhow::Result;
use regex::Regex;
use rusqlite::Connection;
use serde::Deserialize;
use serde_json::json;
use std::path::Path;
use crate::commands::scrape::database;
/// Subset of a session file's YAML frontmatter that this module reads.
///
/// Produced by `parse_yaml_frontmatter`; every field is optional, so a
/// frontmatter block that omits any of them still deserializes.
#[derive(Debug, Deserialize)]
struct SessionYaml {
/// Session title; `parse_session_file` falls back to the session id when absent.
title: Option<String>,
/// Creation timestamp string; `parse_session_file` uses it as `started_at`.
created: Option<String>,
/// Nested `git` mapping, from which only the branch is read.
git: Option<SessionGitYaml>,
}
/// The `git` mapping inside the YAML frontmatter (see `SessionYaml::git`).
#[derive(Debug, Deserialize)]
struct SessionGitYaml {
/// Branch name; `parse_session_file` stores it as the session's branch.
branch: Option<String>,
}
/// Everything `parse_session_file` extracts from one session file, ready to be
/// written to the database by `insert_session`.
#[derive(Debug)]
struct ParsedSession {
/// Session identifier, taken from the file stem ("unknown" if it is not valid UTF-8).
id: String,
/// Title from the frontmatter or the `# Session: ...` heading; defaults to `id`.
title: String,
/// Start timestamp string (frontmatter `created` or the legacy `**Started**` field).
started_at: Option<String>,
/// End timestamp; `parse_session_file` currently always sets this to `None`.
ended_at: Option<String>,
/// Git branch (frontmatter `git.branch` or the legacy `**Git Branch**` field).
branch: Option<String>,
/// Value following `Work Type:` in the file, if any.
classification: Option<String>,
/// Number following `Files Changed:`; 0 when missing or unparsable.
files_changed: i32,
/// Number following `Commits:`; 0 when missing or unparsable.
commits_made: i32,
/// Checkbox items found under the `## Goals` heading.
goals: Vec<Goal>,
/// Decisions, patterns, work items and context extracted by `extract_observations`.
observations: Vec<Observation>,
}
/// One checkbox item from a session's `## Goals` section.
#[derive(Debug)]
struct Goal {
/// Trimmed text that follows the checkbox.
content: String,
/// `true` when the checkbox is `[x]` or `[X]`, `false` when it is `[ ]`.
completed: bool,
}
/// One observation extracted from a session file by `extract_observations`.
#[derive(Debug)]
struct Observation {
/// Text of the observation.
content: String,
/// Category; `extract_observations` emits "decision", "pattern", "work" or "context".
observation_type: String,
/// Optional timestamp; `extract_observations` always leaves this `None`, and
/// `insert_session` then falls back to the session-level timestamp for the event.
timestamp: Option<String>,
}
/// Creates the `sessions`, `observations` and `goals` tables plus their indexes.
///
/// Every statement uses `IF NOT EXISTS`, so the batch can be executed on a
/// database that already contains the schema.
///
/// # Errors
/// Returns the underlying error if executing the batch fails.
pub fn create_materialized_views(conn: &Connection) -> Result<()> {
    // Kept as one batch so the whole schema is declared in a single place.
    const SCHEMA_SQL: &str = r#"
-- Sessions view (materialized from session.started events)
CREATE TABLE IF NOT EXISTS sessions (
id TEXT PRIMARY KEY,
title TEXT,
started_at TEXT,
ended_at TEXT,
branch TEXT,
classification TEXT,
files_changed INTEGER,
commits_made INTEGER,
file_path TEXT
);
-- Observations extracted from sessions (from session.observation events)
CREATE TABLE IF NOT EXISTS observations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id TEXT,
content TEXT,
observation_type TEXT,
timestamp TEXT,
FOREIGN KEY (session_id) REFERENCES sessions(id)
);
-- Goals per session (from session.goal events)
CREATE TABLE IF NOT EXISTS goals (
id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id TEXT,
content TEXT,
completed INTEGER,
FOREIGN KEY (session_id) REFERENCES sessions(id)
);
-- Indexes
CREATE INDEX IF NOT EXISTS idx_observations_session ON observations(session_id);
CREATE INDEX IF NOT EXISTS idx_observations_type ON observations(observation_type);
CREATE INDEX IF NOT EXISTS idx_goals_session ON goals(session_id);
CREATE INDEX IF NOT EXISTS idx_sessions_branch ON sessions(branch);
"#;

    conn.execute_batch(SCHEMA_SQL)?;
    Ok(())
}
/// Parses the YAML frontmatter at the very start of a session file.
///
/// The content must begin with `---`. The frontmatter is everything between
/// that opener and the first line break that is directly followed by `---`.
///
/// Returns `None` when the opener or the closing fence is missing, or when the
/// enclosed text does not deserialize into [`SessionYaml`].
fn parse_yaml_frontmatter(content: &str) -> Option<SessionYaml> {
    let after_opener = content.strip_prefix("---")?;
    let (yaml_block, _markdown_body) = after_opener.split_once("\n---")?;
    serde_yaml::from_str(yaml_block).ok()
}
/// Reads a session markdown file and extracts its metadata, goals and observations.
///
/// Two layouts are handled:
/// * files with YAML frontmatter, where title, start time and branch come from
///   the frontmatter;
/// * legacy files, where the title comes from a `# Session: ...` line and the
///   start time and branch from the `**Started**` and `**Git Branch**` fields.
///
/// The session id is the file stem ("unknown" if it is not valid UTF-8), and it
/// doubles as the title when no title is found. `ended_at` is always `None`.
///
/// # Errors
/// Fails only if the file cannot be read as UTF-8 text.
fn parse_session_file(path: &Path) -> Result<ParsedSession> {
    let content = std::fs::read_to_string(path)?;
    let id = path
        .file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or("unknown")
        .to_string();

    let (title, started_at, branch) = match parse_yaml_frontmatter(&content) {
        Some(frontmatter) => {
            let branch = frontmatter.git.and_then(|git| git.branch);
            let title = frontmatter.title.unwrap_or_else(|| id.clone());
            (title, frontmatter.created, branch)
        }
        None => {
            // Legacy layout: the title lives on a "# Session: ..." line.
            let heading_re = Regex::new(r"^# Session: (.+)$").unwrap();
            let mut heading_title = None;
            for line in content.lines() {
                if let Some(caps) = heading_re.captures(line) {
                    heading_title = Some(caps[1].to_string());
                    break;
                }
            }
            let started = extract_field(&content, "Started");
            let git_branch = extract_field(&content, "Git Branch");
            (heading_title.unwrap_or_else(|| id.clone()), started, git_branch)
        }
    };

    // The remaining sections are located the same way in both layouts.
    let (files_changed, commits_made) = extract_stats(&content);
    Ok(ParsedSession {
        classification: extract_classification(&content),
        goals: extract_goals(&content),
        observations: extract_observations(&content),
        id,
        title,
        started_at,
        ended_at: None,
        branch,
        files_changed,
        commits_made,
    })
}
/// Returns the value of a legacy `**Field**: value` line.
///
/// `field` is matched literally. The value is the rest of the line that
/// carries the marker, with surrounding whitespace (including a trailing `\r`)
/// removed.
///
/// A marker whose line has no value is skipped rather than borrowing text from
/// the following line. The previous `\s*`-based pattern matched across the
/// line break and returned the *next* field's line as this field's value. If
/// the marker occurs several times, the first occurrence with a non-empty
/// value wins.
///
/// Returns `None` when no occurrence of the marker has a value.
fn extract_field(content: &str, field: &str) -> Option<String> {
    let marker = format!("**{}**:", field);
    content
        .match_indices(marker.as_str())
        .find_map(|(pos, matched)| {
            // Only the remainder of the marker's own line can be the value.
            let value = content[pos + matched.len()..]
                .lines()
                .next()
                .unwrap_or("")
                .trim();
            if value.is_empty() {
                None
            } else {
                Some(value.to_string())
            }
        })
}
/// Returns the token following `Work Type:` (word characters and hyphens).
///
/// Only spaces and tabs may separate the label from its value. The previous
/// `\s*` separator also matched line breaks, so an empty `Work Type:` entry
/// followed by another `- ...` bullet produced the classification `"-"` (the
/// next line's bullet marker). With the separator confined to the same line,
/// an empty entry yields `None`.
fn extract_classification(content: &str) -> Option<String> {
    let re = Regex::new(r"Work Type:[ \t]*([\w-]+)").ok()?;
    re.captures(content).map(|caps| caps[1].to_string())
}
fn extract_stats(content: &str) -> (i32, i32) {
let files_re = Regex::new(r"Files Changed:\s*(\d+)").ok();
let commits_re = Regex::new(r"Commits:\s*(\d+)").ok();
let files = files_re
.and_then(|re| re.captures(content))
.and_then(|c| c[1].parse().ok())
.unwrap_or(0);
let commits = commits_re
.and_then(|re| re.captures(content))
.and_then(|c| c[1].parse().ok())
.unwrap_or(0);
(files, commits)
}
/// Collects the checkbox items listed under the `## Goals` heading.
///
/// The section runs from the first `## Goals` occurrence to the next line that
/// starts with `## ` (or to the end of the file). Each `- [ ] text` /
/// `- [x] text` item becomes a [`Goal`]; anything other than a blank checkbox
/// counts as completed. Returns an empty vector when there is no such section.
fn extract_goals(content: &str) -> Vec<Goal> {
    let section = match content.split("## Goals").nth(1) {
        Some(after_heading) => after_heading.split("\n## ").next().unwrap_or(""),
        None => "",
    };

    let checkbox_re = Regex::new(r"- \[([xX ])\] (.+)").unwrap();
    checkbox_re
        .captures_iter(section)
        .map(|caps| Goal {
            content: caps[2].trim().to_string(),
            completed: &caps[1] != " ",
        })
        .collect()
}
/// Extracts observations from the well-known sections of a session file.
///
/// Results are appended in this order:
/// 1. "Key Decisions": each `-`/`*` bullet becomes a "decision";
/// 2. "Patterns Observed": each `-`/`*` bullet becomes a "pattern";
/// 3. "Work Completed": each `N. text` numbered line becomes a "work" item;
/// 4. "Previous Session Context": the whole trimmed section becomes a single
///    "context" item, unless it is empty or starts with an HTML comment.
///
/// Sections are located with `extract_section`. No timestamps are assigned.
fn extract_observations(content: &str) -> Vec<Observation> {
    // Builds an untimestamped observation of the given kind.
    fn observation(text: &str, kind: &str) -> Observation {
        Observation {
            content: text.to_string(),
            observation_type: kind.to_string(),
            timestamp: None,
        }
    }

    let mut found = Vec::new();

    // Bulleted sections share the same line handling; only the kind differs.
    for &(header, kind) in &[
        ("Key Decisions", "decision"),
        ("Patterns Observed", "pattern"),
    ] {
        if let Some(section) = extract_section(content, header) {
            let bullets = section
                .lines()
                .map(str::trim)
                .filter(|line| line.starts_with('-') || line.starts_with('*'))
                .map(|line| line.trim_start_matches('-').trim_start_matches('*').trim())
                .filter(|text| !text.is_empty());
            found.extend(bullets.map(|text| observation(text, kind)));
        }
    }

    let numbered_re = Regex::new(r"^\d+\.\s+(.+)$").unwrap();
    if let Some(section) = extract_section(content, "Work Completed") {
        found.extend(
            section
                .lines()
                .filter_map(|line| numbered_re.captures(line.trim()))
                .map(|caps| observation(&caps[1], "work")),
        );
    }

    if let Some(section) = extract_section(content, "Previous Session Context") {
        let text = section.trim();
        // A leading HTML comment means the template placeholder was never filled in.
        if !(text.is_empty() || text.starts_with("<!--")) {
            found.push(observation(text, "context"));
        }
    }

    found
}
/// Returns the body of the section introduced by `header`.
///
/// Two header styles are recognised, tried in this order:
/// * bold: `**Header:**`. The body ends at the next line starting with `**`
///   or `## `, whichever comes first;
/// * heading: `## Header`. The body ends at the next line starting with `## `.
///
/// In both styles the body runs to the end of `content` when no terminator is
/// found. Leading whitespace of the body is removed; trailing text is kept
/// as-is. Returns `None` when neither header style occurs.
///
/// Compared with the earlier regex-based version this fixes two boundary bugs:
/// * for bold headers a `\n**` anywhere later in the file used to win over an
///   earlier `\n## `, so the body could run past the next heading;
/// * the whitespace after the header (including the line break) used to be
///   consumed before searching for the terminator, so an empty section
///   directly followed by another header absorbed that next section.
fn extract_section(content: &str, header: &str) -> Option<String> {
    let bold_marker = format!("**{}:**", header);
    let heading_marker = format!("## {}", header);

    let (body, bold_style) = if let Some(pos) = content.find(bold_marker.as_str()) {
        (&content[pos + bold_marker.len()..], true)
    } else if let Some(pos) = content.find(heading_marker.as_str()) {
        (&content[pos + heading_marker.len()..], false)
    } else {
        return None;
    };

    // Search from right after the header so a terminator on the very next
    // line is seen; the earliest applicable terminator closes the section.
    let next_heading = body.find("\n## ");
    let next_bold = if bold_style { body.find("\n**") } else { None };
    let end = match (next_bold, next_heading) {
        (Some(bold), Some(heading)) => bold.min(heading),
        (Some(only), None) | (None, Some(only)) => only,
        (None, None) => body.len(),
    };

    Some(body[..end].trim_start().to_string())
}
/// Replaces everything stored for `session` with freshly parsed data.
///
/// Steps, in order:
/// 1. delete existing `observations`, `goals` and `sessions` rows for this id;
/// 2. record a `session.started` event and insert the `sessions` row;
/// 3. for each goal, record a `session.goal` event and insert a `goals` row;
/// 4. for each observation, record an event named after its type
///    (`session.decision`, `session.pattern`, `session.work`,
///    `session.context`, otherwise `session.observation`) and insert an
///    `observations` row.
///
/// Events are recorded through `database::insert_event`. Their timestamp is
/// the observation's own timestamp when present, otherwise the session's
/// `started_at`, otherwise the session id.
///
/// # Errors
/// Stops at, and returns, the first failing database operation; the steps are
/// not wrapped in a transaction by this function.
fn insert_session(conn: &Connection, session: &ParsedSession, file_path: &str) -> Result<()> {
    // Clear the previous materialization: dependent rows first, session row last.
    for delete_sql in &[
        "DELETE FROM observations WHERE session_id = ?1",
        "DELETE FROM goals WHERE session_id = ?1",
        "DELETE FROM sessions WHERE id = ?1",
    ] {
        conn.execute(*delete_sql, [&session.id])?;
    }

    // Session-level timestamp shared by every event that lacks its own.
    let default_ts = session.started_at.as_deref().unwrap_or(&session.id);

    let started_payload = json!({
        "title": &session.title,
        "started_at": &session.started_at,
        "ended_at": &session.ended_at,
        "branch": &session.branch,
        "classification": &session.classification,
        "files_changed": session.files_changed,
        "commits_made": session.commits_made,
        "file_path": file_path,
    });
    database::insert_event(
        conn,
        "session.started",
        default_ts,
        &session.id,
        Some(file_path),
        &started_payload.to_string(),
    )?;

    conn.execute(
        "INSERT INTO sessions (id, title, started_at, ended_at, branch, classification, files_changed, commits_made, file_path) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
        rusqlite::params![
            &session.id,
            &session.title,
            &session.started_at,
            &session.ended_at,
            &session.branch,
            &session.classification,
            session.files_changed,
            session.commits_made,
            file_path,
        ],
    )?;

    let mut insert_goal =
        conn.prepare("INSERT INTO goals (session_id, content, completed) VALUES (?1, ?2, ?3)")?;
    for goal in session.goals.iter() {
        let payload = json!({
            "session_id": &session.id,
            "content": &goal.content,
            "completed": goal.completed,
        });
        database::insert_event(
            conn,
            "session.goal",
            default_ts,
            &session.id,
            Some(file_path),
            &payload.to_string(),
        )?;
        // The `completed` column is an INTEGER, so the flag is stored as 0/1.
        insert_goal.execute(rusqlite::params![
            &session.id,
            &goal.content,
            i32::from(goal.completed)
        ])?;
    }

    let mut insert_observation = conn.prepare(
        "INSERT INTO observations (session_id, content, observation_type, timestamp) VALUES (?1, ?2, ?3, ?4)",
    )?;
    for observation in session.observations.iter() {
        let event_name = match observation.observation_type.as_str() {
            "decision" => "session.decision",
            "pattern" => "session.pattern",
            "work" => "session.work",
            "context" => "session.context",
            _ => "session.observation",
        };
        let event_ts = observation.timestamp.as_deref().unwrap_or(default_ts);
        let payload = json!({
            "session_id": &session.id,
            "content": &observation.content,
            "observation_type": &observation.observation_type,
        });
        database::insert_event(
            conn,
            event_name,
            event_ts,
            &session.id,
            Some(file_path),
            &payload.to_string(),
        )?;
        insert_observation.execute(rusqlite::params![
            &session.id,
            &observation.content,
            &observation.observation_type,
            &observation.timestamp,
        ])?;
    }

    Ok(())
}
/// Parses the given session files and materializes them into the database.
///
/// Ensures the tables exist, then processes each file in order. Unless `full`
/// is set, files whose stem already appears as an id in `sessions` are
/// skipped. Files that fail to parse or insert are reported on stderr and do
/// not abort the run. Finally the event-log FTS index is refreshed via
/// `database::populate_eventlog_fts5`.
///
/// Returns `(processed, skipped)`: the number of sessions inserted and the
/// number skipped because they were already present.
///
/// # Errors
/// Fails if the schema cannot be created, the existing ids cannot be queried,
/// or the FTS refresh fails.
pub fn scrape_sessions(
    conn: &Connection,
    session_files: &[std::path::PathBuf],
    full: bool,
) -> Result<(usize, usize)> {
    use std::collections::HashSet;

    create_materialized_views(conn)?;

    // In incremental mode, remember which sessions are already materialized.
    let mut already_materialized: HashSet<String> = HashSet::new();
    if !full {
        let mut stmt = conn.prepare("SELECT id FROM sessions")?;
        let ids = stmt.query_map([], |row| row.get::<_, String>(0))?;
        // Rows that cannot be read as text are ignored rather than failing the run.
        already_materialized.extend(ids.filter_map(|row| row.ok()));
    }

    let mut inserted: usize = 0;
    let mut skipped: usize = 0;
    for path in session_files {
        let id = path
            .file_stem()
            .and_then(|stem| stem.to_str())
            .unwrap_or("")
            .to_string();

        if !full && already_materialized.contains(&id) {
            skipped += 1;
            continue;
        }

        let session = match parse_session_file(path) {
            Ok(parsed) => parsed,
            Err(e) => {
                eprintln!(" Warning: failed to parse {}: {}", path.display(), e);
                continue;
            }
        };

        match insert_session(conn, &session, path.to_string_lossy().as_ref()) {
            Ok(()) => inserted += 1,
            Err(e) => eprintln!(" Warning: failed to insert session {}: {}", id, e),
        }
    }

    let fts_count = database::populate_eventlog_fts5(conn)?;
    if fts_count > 0 {
        println!(" {} session events indexed in eventlog_fts", fts_count);
    }

    Ok((inserted, skipped))
}
// Unit tests for the session-file parsing helpers. The file-based tests write
// a fixture into a temporary directory and run `parse_session_file` on it.
#[cfg(test)]
mod tests {
use super::*;
// Each checkbox under "## Goals" becomes one goal; `[x]` marks it completed.
#[test]
fn test_extract_goals() {
let content = r#"## Goals
- [ ] implement feature
- [x] fix bug
- [ ] write tests
"#;
let goals = extract_goals(content);
assert_eq!(goals.len(), 3);
assert!(!goals[0].completed);
assert!(goals[1].completed);
assert_eq!(goals[0].content, "implement feature");
}
// A legacy `**Field**: value` line yields the value from that same line.
#[test]
fn test_extract_field() {
let content = "**Started**: 2025-11-21T16:31:07Z\n**Branch**: main";
assert_eq!(
extract_field(content, "Started"),
Some("2025-11-21T16:31:07Z".to_string())
);
}
// Frontmatter fenced by `---` lines is deserialized; unknown keys are ignored.
#[test]
fn test_parse_yaml_frontmatter() {
let content = r#"---
type: session
id: '20260130-183221'
title: Complete v0.9.2
status: active
llm: claude
created: '2026-01-30T23:32:21Z'
start_timestamp: 1769815941000
git:
branch: patina
starting_commit: 9c61c5e2
start_tag: session-20260130-183221-claude-start
---
## Goals
- [ ] Complete v0.9.2
"#;
let fm = parse_yaml_frontmatter(content).expect("should parse YAML frontmatter");
assert_eq!(fm.title, Some("Complete v0.9.2".to_string()));
assert_eq!(fm.created, Some("2026-01-30T23:32:21Z".to_string()));
let git = fm.git.expect("should have git section");
assert_eq!(git.branch, Some("patina".to_string()));
}
// Legacy files do not start with `---`, so no frontmatter is reported.
#[test]
fn test_parse_yaml_frontmatter_none_for_legacy() {
let content = "# Session: Legacy Session\n**ID**: 20251121-113107\n";
assert!(parse_yaml_frontmatter(content).is_none());
}
// End-to-end parse of a frontmatter-style file: id from the file name,
// metadata from the frontmatter, stats and goals from the markdown body.
#[test]
fn test_parse_session_file_yaml_format() {
let content = r#"---
type: session
id: '20260130-183221'
title: Complete v0.9.2
status: archived
llm: claude
created: '2026-01-30T23:32:21Z'
start_timestamp: 1769815941000
git:
branch: patina
starting_commit: 9c61c5e2
start_tag: session-20260130-183221-claude-start
---
## Goals
- [ ] Complete v0.9.2
- [x] Fix parser bug
## Session Classification
- Work Type: feature
- Files Changed: 5
- Commits: 3
"#;
let temp_dir = tempfile::tempdir().unwrap();
let file_path = temp_dir.path().join("20260130-183221.md");
std::fs::write(&file_path, content).unwrap();
let session = parse_session_file(&file_path).unwrap();
assert_eq!(session.id, "20260130-183221");
assert_eq!(session.title, "Complete v0.9.2");
assert_eq!(session.started_at, Some("2026-01-30T23:32:21Z".to_string()));
assert_eq!(session.branch, Some("patina".to_string()));
assert_eq!(session.classification, Some("feature".to_string()));
assert_eq!(session.files_changed, 5);
assert_eq!(session.commits_made, 3);
assert_eq!(session.goals.len(), 2);
assert!(!session.goals[0].completed);
assert!(session.goals[1].completed);
}
// End-to-end parse of a legacy file: title from the "# Session:" heading,
// start time and branch from the bold `**Field**:` lines.
#[test]
fn test_parse_session_file_legacy_format() {
let content = r#"# Session: Legacy Session Title
**ID**: 20251121-113107
**Started**: 2025-11-21T16:31:07Z
**LLM**: claude
**Git Branch**: work
**Session Tag**: session-20251121-113107-claude-start
**Starting Commit**: abc123
## Goals
- [x] implement feature
## Session Classification
- Work Type: pattern-work
- Files Changed: 8
- Commits: 4
"#;
let temp_dir = tempfile::tempdir().unwrap();
let file_path = temp_dir.path().join("20251121-113107.md");
std::fs::write(&file_path, content).unwrap();
let session = parse_session_file(&file_path).unwrap();
assert_eq!(session.id, "20251121-113107");
assert_eq!(session.title, "Legacy Session Title");
assert_eq!(session.started_at, Some("2025-11-21T16:31:07Z".to_string()));
assert_eq!(session.branch, Some("work".to_string()));
assert_eq!(session.classification, Some("pattern-work".to_string()));
assert_eq!(session.files_changed, 8);
assert_eq!(session.commits_made, 4);
}
}