reasonkit/telemetry/
schema.rs

1//! SQLite Schema for RALL Telemetry
2//!
3//! Local-first telemetry storage with privacy-preserving design.
4
5use crate::telemetry::TELEMETRY_SCHEMA_VERSION;
6
7/// SQLite schema for telemetry database
8pub const SCHEMA_SQL: &str = r#"
9-- ============================================================================
10-- RALL TELEMETRY SCHEMA v1
11-- ReasonKit Adaptive Learning Loop - Local Telemetry Storage
12-- ============================================================================
13
14-- Schema version tracking
15CREATE TABLE IF NOT EXISTS schema_version (
16    version INTEGER PRIMARY KEY,
17    applied_at TEXT NOT NULL DEFAULT (datetime('now')),
18    description TEXT
19);
20
21-- Insert current version if not exists
22INSERT OR IGNORE INTO schema_version (version, description)
23VALUES (1, 'Initial RALL telemetry schema');
24
25-- ============================================================================
26-- CORE TABLES
27-- ============================================================================
28
29-- Sessions table: One entry per CLI session
30CREATE TABLE IF NOT EXISTS sessions (
31    id TEXT PRIMARY KEY,                    -- UUID
32    started_at TEXT NOT NULL,               -- ISO 8601 timestamp
33    ended_at TEXT,                          -- ISO 8601 timestamp (NULL if active)
34    duration_ms INTEGER,                    -- Session duration in milliseconds
35    tool_count INTEGER DEFAULT 0,           -- Number of tools used
36    query_count INTEGER DEFAULT 0,          -- Number of queries
37    feedback_count INTEGER DEFAULT 0,       -- Number of feedback events
38    profile TEXT,                           -- Reasoning profile used (quick/balanced/deep/etc)
39    success_rate REAL,                      -- Overall success rate (0.0-1.0)
40    client_version TEXT,                    -- ReasonKit CLI version
41    os_family TEXT                          -- Operating system (sanitized)
42);
43
44-- Queries table: Individual query events (PII-stripped)
45CREATE TABLE IF NOT EXISTS queries (
46    id TEXT PRIMARY KEY,                    -- UUID
47    session_id TEXT NOT NULL,               -- FK to sessions
48    timestamp TEXT NOT NULL,                -- ISO 8601 timestamp
49
50    -- Query metadata (NO raw query text - privacy)
51    query_hash TEXT NOT NULL,               -- SHA-256 hash of normalized query
52    query_length INTEGER NOT NULL,          -- Character count
53    query_token_count INTEGER,              -- Token count (if available)
54    query_type TEXT,                        -- Classification: search/reason/code/general
55
56    -- Execution metrics
57    latency_ms INTEGER NOT NULL,            -- Total execution time
58    tool_calls INTEGER DEFAULT 0,           -- Number of tool calls
59    retrieval_count INTEGER DEFAULT 0,      -- Number of documents retrieved
60
61    -- Results (anonymized)
62    result_count INTEGER,                   -- Number of results returned
63    result_quality_score REAL,              -- Self-assessed quality (0.0-1.0)
64    error_occurred INTEGER DEFAULT 0,       -- Boolean: did an error occur?
65    error_category TEXT,                    -- Error classification (if error)
66
67    -- Context
68    profile_used TEXT,                      -- Reasoning profile
69    tools_used TEXT,                        -- JSON array of tool names used
70
71    FOREIGN KEY (session_id) REFERENCES sessions(id)
72);
73
74-- Create indexes for common queries
75CREATE INDEX IF NOT EXISTS idx_queries_session ON queries(session_id);
76CREATE INDEX IF NOT EXISTS idx_queries_timestamp ON queries(timestamp);
77CREATE INDEX IF NOT EXISTS idx_queries_type ON queries(query_type);
78CREATE INDEX IF NOT EXISTS idx_queries_hash ON queries(query_hash);
79
80-- Feedback table: User feedback events
81CREATE TABLE IF NOT EXISTS feedback (
82    id TEXT PRIMARY KEY,                    -- UUID
83    session_id TEXT NOT NULL,               -- FK to sessions
84    query_id TEXT,                          -- FK to queries (optional)
85    timestamp TEXT NOT NULL,                -- ISO 8601 timestamp
86
87    -- Feedback data
88    feedback_type TEXT NOT NULL,            -- thumbs_up/thumbs_down/explicit/implicit
89    rating INTEGER,                         -- 1-5 star rating (if explicit)
90    category TEXT,                          -- accuracy/relevance/speed/format/other
91
92    -- Context (anonymized)
93    context_hash TEXT,                      -- Hash of surrounding context
94
95    FOREIGN KEY (session_id) REFERENCES sessions(id),
96    FOREIGN KEY (query_id) REFERENCES queries(id)
97);
98
99CREATE INDEX IF NOT EXISTS idx_feedback_session ON feedback(session_id);
100CREATE INDEX IF NOT EXISTS idx_feedback_query ON feedback(query_id);
101CREATE INDEX IF NOT EXISTS idx_feedback_type ON feedback(feedback_type);
102
103-- Tool usage table: Track tool invocations
104CREATE TABLE IF NOT EXISTS tool_usage (
105    id TEXT PRIMARY KEY,                    -- UUID
106    session_id TEXT NOT NULL,               -- FK to sessions
107    query_id TEXT,                          -- FK to queries
108    timestamp TEXT NOT NULL,                -- ISO 8601 timestamp
109
110    -- Tool metadata
111    tool_name TEXT NOT NULL,                -- Tool identifier
112    tool_category TEXT,                     -- search/file/shell/mcp/reasoning
113
114    -- Execution metrics
115    execution_ms INTEGER NOT NULL,          -- Execution time
116    success INTEGER NOT NULL,               -- Boolean: did it succeed?
117    error_type TEXT,                        -- Error classification (if failed)
118
119    -- Input/output stats (NO content - privacy)
120    input_size_bytes INTEGER,               -- Size of input
121    output_size_bytes INTEGER,              -- Size of output
122
123    FOREIGN KEY (session_id) REFERENCES sessions(id),
124    FOREIGN KEY (query_id) REFERENCES queries(id)
125);
126
127CREATE INDEX IF NOT EXISTS idx_tool_usage_session ON tool_usage(session_id);
128CREATE INDEX IF NOT EXISTS idx_tool_usage_tool ON tool_usage(tool_name);
129CREATE INDEX IF NOT EXISTS idx_tool_usage_category ON tool_usage(tool_category);
130
131-- Reasoning traces table: ThinkTool execution traces
132CREATE TABLE IF NOT EXISTS reasoning_traces (
133    id TEXT PRIMARY KEY,                    -- UUID
134    session_id TEXT NOT NULL,               -- FK to sessions
135    query_id TEXT,                          -- FK to queries
136    timestamp TEXT NOT NULL,                -- ISO 8601 timestamp
137
138    -- Trace metadata
139    thinktool_name TEXT NOT NULL,           -- GigaThink/LaserLogic/etc
140    step_count INTEGER NOT NULL,            -- Number of reasoning steps
141
142    -- Execution metrics
143    total_ms INTEGER NOT NULL,              -- Total execution time
144    avg_step_ms REAL,                       -- Average time per step
145
146    -- Quality metrics (computed, not user-provided)
147    coherence_score REAL,                   -- Self-consistency check (0.0-1.0)
148    depth_score REAL,                       -- Reasoning depth metric
149
150    -- Anonymized structure
151    step_types TEXT,                        -- JSON array of step type names
152
153    FOREIGN KEY (session_id) REFERENCES sessions(id),
154    FOREIGN KEY (query_id) REFERENCES queries(id)
155);
156
157CREATE INDEX IF NOT EXISTS idx_traces_session ON reasoning_traces(session_id);
158CREATE INDEX IF NOT EXISTS idx_traces_thinktool ON reasoning_traces(thinktool_name);
159
160-- ============================================================================
161-- AGGREGATION TABLES (for ML training data)
162-- ============================================================================
163
164-- Daily aggregates: Pre-computed daily statistics
165CREATE TABLE IF NOT EXISTS daily_aggregates (
166    date TEXT PRIMARY KEY,                  -- YYYY-MM-DD
167    computed_at TEXT NOT NULL,              -- When this was computed
168
169    -- Volume metrics
170    session_count INTEGER DEFAULT 0,
171    query_count INTEGER DEFAULT 0,
172    feedback_count INTEGER DEFAULT 0,
173    tool_invocations INTEGER DEFAULT 0,
174
175    -- Performance metrics
176    avg_latency_ms REAL,
177    p50_latency_ms REAL,
178    p95_latency_ms REAL,
179    p99_latency_ms REAL,
180
181    -- Quality metrics
182    avg_success_rate REAL,
183    positive_feedback_ratio REAL,
184    error_rate REAL,
185
186    -- Tool distribution (JSON)
187    tool_distribution TEXT,
188
189    -- Query type distribution (JSON)
190    query_type_distribution TEXT
191);
192
193-- Query clusters: K-means clustering results for pattern detection
194CREATE TABLE IF NOT EXISTS query_clusters (
195    id INTEGER PRIMARY KEY,
196    computed_at TEXT NOT NULL,
197
198    -- Cluster metadata
199    cluster_count INTEGER NOT NULL,         -- K value used
200    silhouette_score REAL,                  -- Clustering quality
201
202    -- Cluster data (JSON)
203    centroids TEXT,                         -- JSON array of centroid embeddings
204    cluster_sizes TEXT,                     -- JSON array of cluster sizes
205    representative_hashes TEXT              -- JSON array of representative query hashes
206);
207
208-- ============================================================================
209-- PRIVACY CONTROLS
210-- ============================================================================
211
212-- Privacy consent tracking
213CREATE TABLE IF NOT EXISTS privacy_consent (
214    id TEXT PRIMARY KEY,
215    timestamp TEXT NOT NULL,
216
217    -- Consent levels
218    local_telemetry INTEGER NOT NULL,       -- Boolean: allow local storage
219    aggregated_sharing INTEGER NOT NULL,    -- Boolean: allow aggregated sharing
220    community_contribution INTEGER NOT NULL, -- Boolean: contribute to community model
221
222    -- Metadata
223    consent_version INTEGER NOT NULL,       -- Consent form version
224    ip_hash TEXT                            -- Hashed IP for legal compliance
225);
226
227-- Redaction log: Track what was redacted (for audit)
228CREATE TABLE IF NOT EXISTS redaction_log (
229    id TEXT PRIMARY KEY,
230    timestamp TEXT NOT NULL,
231
232    -- Redaction metadata
233    source_table TEXT NOT NULL,
234    source_id TEXT NOT NULL,
235    redaction_type TEXT NOT NULL,           -- pii/sensitive/custom
236    pattern_matched TEXT                    -- Regex pattern that triggered redaction
237);
238
239CREATE INDEX IF NOT EXISTS idx_redaction_source ON redaction_log(source_table, source_id);
240
241-- ============================================================================
242-- VIEWS for common queries
243-- ============================================================================
244
245-- Recent session summary
246CREATE VIEW IF NOT EXISTS v_recent_sessions AS
247SELECT
248    s.id,
249    s.started_at,
250    s.duration_ms,
251    s.query_count,
252    s.tool_count,
253    s.success_rate,
254    COUNT(DISTINCT f.id) as feedback_items,
255    AVG(CASE WHEN f.feedback_type = 'thumbs_up' THEN 1.0
256             WHEN f.feedback_type = 'thumbs_down' THEN 0.0
257             ELSE NULL END) as feedback_score
258FROM sessions s
259LEFT JOIN feedback f ON s.id = f.session_id
260WHERE s.started_at > datetime('now', '-30 days')
261GROUP BY s.id
262ORDER BY s.started_at DESC;
263
264-- Tool performance summary
265CREATE VIEW IF NOT EXISTS v_tool_performance AS
266SELECT
267    tool_name,
268    tool_category,
269    COUNT(*) as invocation_count,
270    AVG(execution_ms) as avg_execution_ms,
271    SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(*) as success_rate,
272    SUM(input_size_bytes) as total_input_bytes,
273    SUM(output_size_bytes) as total_output_bytes
274FROM tool_usage
275WHERE timestamp > datetime('now', '-30 days')
276GROUP BY tool_name, tool_category
277ORDER BY invocation_count DESC;
278
279-- ThinkTool effectiveness
280CREATE VIEW IF NOT EXISTS v_thinktool_stats AS
281SELECT
282    thinktool_name,
283    COUNT(*) as usage_count,
284    AVG(step_count) as avg_steps,
285    AVG(total_ms) as avg_execution_ms,
286    AVG(coherence_score) as avg_coherence,
287    AVG(depth_score) as avg_depth
288FROM reasoning_traces
289WHERE timestamp > datetime('now', '-30 days')
290GROUP BY thinktool_name
291ORDER BY usage_count DESC;
292"#;
293
294/// Get the current schema version
295pub fn current_version() -> u32 {
296    TELEMETRY_SCHEMA_VERSION
297}
298
299/// Migration SQL for future schema updates
300pub fn get_migration_sql(_from_version: u32, _to_version: u32) -> Option<&'static str> {
301    None
302}
303
304#[cfg(test)]
305mod tests {
306    use super::*;
307
308    #[test]
309    fn test_schema_version() {
310        assert_eq!(current_version(), 1);
311    }
312
313    #[test]
314    fn test_schema_sql_not_empty() {
315        assert!(!SCHEMA_SQL.is_empty());
316        assert!(SCHEMA_SQL.contains("CREATE TABLE"));
317    }
318}