Skip to main content

chasm/schema/
registry.rs

1// Copyright (c) 2024-2026 Nervosys LLC
2// SPDX-License-Identifier: AGPL-3.0-only
3//! Schema Registry — central catalog of all provider schemas with detection.
4//!
5//! The registry is the primary entry point for the schema subsystem. It:
6//! 1. Holds all known provider schemas
7//! 2. Auto-detects which schema a workspace uses
8//! 3. Provides search/filter APIs for AI agents
9//! 4. Exposes the ontology for cross-provider mapping
10
11use crate::schema::ontology::Ontology;
12use crate::schema::types::*;
13use crate::schema::versions;
14use anyhow::{Context, Result};
15use serde::{Deserialize, Serialize};
16use std::collections::HashMap;
17use std::path::Path;
18
19// ============================================================================
20// Schema Registry
21// ============================================================================
22
23/// Central registry of all known AI chat provider schemas.
24///
25/// ```rust,ignore
26/// let registry = SchemaRegistry::new();
27///
28/// // List all schemas
29/// for schema in registry.list_schemas() {
30///     println!("{}: {} fields", schema.id(), schema.field_count());
31/// }
32///
33/// // Detect schema for a workspace
34/// let detected = registry.detect_schema("/path/to/session.jsonl")?;
35/// ```
36pub struct SchemaRegistry {
37    /// All registered schemas indexed by their version ID
38    schemas: HashMap<String, ProviderSchema>,
39    /// The ontology over all schemas
40    ontology: Ontology,
41}
42
43impl SchemaRegistry {
44    /// Create a new registry pre-loaded with all known provider schemas.
45    pub fn new() -> Self {
46        let all_schemas = versions::build_all_provider_schemas();
47        let mut schemas = HashMap::new();
48
49        for schema in all_schemas {
50            schemas.insert(schema.id(), schema);
51        }
52
53        Self {
54            schemas,
55            ontology: Ontology::build(),
56        }
57    }
58
59    /// Get a reference to the ontology.
60    pub fn ontology(&self) -> &Ontology {
61        &self.ontology
62    }
63
64    /// List all registered schemas.
65    pub fn list_schemas(&self) -> Vec<&ProviderSchema> {
66        let mut schemas: Vec<&ProviderSchema> = self.schemas.values().collect();
67        schemas.sort_by_key(|s| s.id());
68        schemas
69    }
70
71    /// Get a schema by its version ID.
72    pub fn get_schema(&self, id: &str) -> Option<&ProviderSchema> {
73        self.schemas.get(id)
74    }
75
76    /// Get all schemas for a specific provider.
77    pub fn schemas_for_provider(&self, provider: &str) -> Vec<&ProviderSchema> {
78        self.schemas
79            .values()
80            .filter(|s| s.version.provider == provider)
81            .collect()
82    }
83
84    /// Register a custom schema (for plugins / new providers).
85    pub fn register_schema(&mut self, schema: ProviderSchema) {
86        self.schemas.insert(schema.id(), schema);
87    }
88
89    /// Detect which schema a session file uses based on its contents.
90    pub fn detect_schema_from_file(&self, path: &Path) -> Result<DetectedSchema> {
91        let extension = path.extension().and_then(|e| e.to_str()).unwrap_or("");
92
93        let content = std::fs::read_to_string(path)
94            .with_context(|| format!("Failed to read {}", path.display()))?;
95
96        match extension {
97            "jsonl" => self.detect_jsonl_schema(&content, path),
98            "json" => self.detect_json_schema(&content, path),
99            _ => Ok(DetectedSchema {
100                schema_id: "unknown".into(),
101                confidence: 0.0,
102                evidence: vec![format!("Unknown file extension: .{}", extension)],
103                detected_version: None,
104            }),
105        }
106    }
107
108    /// Detect schema from a VS Code workspace storage directory.
109    pub fn detect_schema_from_workspace(&self, workspace_dir: &Path) -> Result<DetectedSchema> {
110        let chat_sessions = workspace_dir.join("chatSessions");
111
112        if !chat_sessions.exists() {
113            return Ok(DetectedSchema {
114                schema_id: "unknown".into(),
115                confidence: 0.0,
116                evidence: vec!["No chatSessions directory found".into()],
117                detected_version: None,
118            });
119        }
120
121        // Check what file types exist
122        let mut has_jsonl = false;
123        let mut has_json = false;
124        let mut jsonl_count = 0;
125        let mut json_count = 0;
126
127        if let Ok(entries) = std::fs::read_dir(&chat_sessions) {
128            for entry in entries.flatten() {
129                let path = entry.path();
130                match path.extension().and_then(|e| e.to_str()) {
131                    Some("jsonl") => {
132                        has_jsonl = true;
133                        jsonl_count += 1;
134                    }
135                    Some("json") => {
136                        // Exclude backup files
137                        let name = path.file_name().unwrap_or_default().to_string_lossy();
138                        if !name.contains(".bak") && !name.contains(".pre-") {
139                            has_json = true;
140                            json_count += 1;
141                        }
142                    }
143                    _ => {}
144                }
145            }
146        }
147
148        let mut evidence = Vec::new();
149
150        if has_jsonl && !has_json {
151            evidence.push(format!(
152                "Found {} .jsonl files, no .json files → JSONL format",
153                jsonl_count
154            ));
155            return Ok(DetectedSchema {
156                schema_id: "copilot-jsonl-v1".into(),
157                confidence: 0.95,
158                evidence,
159                detected_version: None,
160            });
161        }
162
163        if has_json && !has_jsonl {
164            evidence.push(format!(
165                "Found {} .json files, no .jsonl files → JSON format",
166                json_count
167            ));
168            return Ok(DetectedSchema {
169                schema_id: "copilot-json-v3".into(),
170                confidence: 0.95,
171                evidence,
172                detected_version: None,
173            });
174        }
175
176        if has_jsonl && has_json {
177            evidence.push(format!(
178                "Found both .jsonl ({}) and .json ({}) files → mixed / transitional",
179                jsonl_count, json_count
180            ));
181
182            // Newer format takes precedence
183            let schema_id = if jsonl_count >= json_count {
184                "copilot-jsonl-v1"
185            } else {
186                "copilot-json-v3"
187            };
188
189            return Ok(DetectedSchema {
190                schema_id: schema_id.into(),
191                confidence: 0.7,
192                evidence,
193                detected_version: None,
194            });
195        }
196
197        Ok(DetectedSchema {
198            schema_id: "unknown".into(),
199            confidence: 0.0,
200            evidence: vec!["No session files found".into()],
201            detected_version: None,
202        })
203    }
204
205    /// Export the full registry as JSON (for AI agent consumption).
206    pub fn to_json(&self) -> Result<String> {
207        let export = RegistryExport {
208            version: "2.0.0".into(),
209            schema_count: self.schemas.len(),
210            schemas: self.list_schemas().into_iter().cloned().collect(),
211            ontology: self.ontology.clone(),
212        };
213
214        serde_json::to_string_pretty(&export).map_err(Into::into)
215    }
216
217    /// Export the registry as a compact JSON for embedding in documents.
218    pub fn to_json_compact(&self) -> Result<String> {
219        let export = RegistryExport {
220            version: "2.0.0".into(),
221            schema_count: self.schemas.len(),
222            schemas: self.list_schemas().into_iter().cloned().collect(),
223            ontology: self.ontology.clone(),
224        };
225
226        serde_json::to_string(&export).map_err(Into::into)
227    }
228
229    // ========================================================================
230    // Internal detection helpers
231    // ========================================================================
232
233    fn detect_jsonl_schema(&self, content: &str, _path: &Path) -> Result<DetectedSchema> {
234        let first_line = content.lines().next().unwrap_or("");
235        let mut evidence = Vec::new();
236
237        // Try to parse as Copilot JSONL (kind:0 envelope)
238        if let Ok(val) = serde_json::from_str::<serde_json::Value>(first_line) {
239            if val.get("kind").is_some() {
240                evidence.push("First line has 'kind' field → Copilot JSONL event format".into());
241
242                let kind = val.get("kind").and_then(|k| k.as_u64()).unwrap_or(99);
243                if kind == 0 {
244                    evidence.push("kind=0 → full session snapshot (expected first line)".into());
245                }
246
247                // Check for data.version
248                if let Some(data) = val.get("data") {
249                    if let Some(version) = data.get("version").and_then(|v| v.as_u64()) {
250                        evidence.push(format!(
251                            "data.version = {} → session format version",
252                            version
253                        ));
254                    }
255                }
256
257                // Check for extensionVersion in data
258                let ext_version = val
259                    .get("data")
260                    .and_then(|d| d.get("requests"))
261                    .and_then(|r| r.as_array())
262                    .and_then(|arr| arr.first())
263                    .and_then(|req| req.get("result"))
264                    .and_then(|res| res.get("metadata"))
265                    .and_then(|meta| meta.get("extensionVersion"))
266                    .and_then(|v| v.as_str())
267                    .map(String::from);
268
269                return Ok(DetectedSchema {
270                    schema_id: "copilot-jsonl-v1".into(),
271                    confidence: 0.95,
272                    evidence,
273                    detected_version: ext_version,
274                });
275            }
276
277            // Check for Claude Code format (type field)
278            if val.get("type").is_some() && val.get("message").is_some() {
279                evidence.push("Has 'type' and 'message' fields → Claude Code format".into());
280                return Ok(DetectedSchema {
281                    schema_id: "claude-code-jsonl-v1".into(),
282                    confidence: 0.9,
283                    evidence,
284                    detected_version: None,
285                });
286            }
287
288            // Check for Codex CLI format (role/content)
289            if val.get("role").is_some() && val.get("content").is_some() {
290                evidence.push("Has 'role' and 'content' fields → Codex CLI / OpenAI format".into());
291                return Ok(DetectedSchema {
292                    schema_id: "codex-cli-jsonl-v1".into(),
293                    confidence: 0.8,
294                    evidence,
295                    detected_version: None,
296                });
297            }
298        }
299
300        evidence.push("Could not identify JSONL format from first line".into());
301        Ok(DetectedSchema {
302            schema_id: "unknown".into(),
303            confidence: 0.0,
304            evidence,
305            detected_version: None,
306        })
307    }
308
309    fn detect_json_schema(&self, content: &str, _path: &Path) -> Result<DetectedSchema> {
310        let mut evidence = Vec::new();
311
312        if let Ok(val) = serde_json::from_str::<serde_json::Value>(content) {
313            // Check for Copilot Chat JSON v3
314            if val.get("requests").is_some() {
315                evidence.push("Has 'requests' field → Copilot Chat format".into());
316
317                if let Some(version) = val.get("version").and_then(|v| v.as_u64()) {
318                    evidence.push(format!(
319                        "version = {} → session format v{}",
320                        version, version
321                    ));
322                }
323
324                if val.get("creationDate").is_some() {
325                    evidence.push("Has 'creationDate' → Copilot JSON v3".into());
326                }
327
328                return Ok(DetectedSchema {
329                    schema_id: "copilot-json-v3".into(),
330                    confidence: 0.95,
331                    evidence,
332                    detected_version: None,
333                });
334            }
335
336            // Check for Continue.dev format
337            if val.get("history").is_some() && val.get("dateCreated").is_some() {
338                evidence.push("Has 'history' and 'dateCreated' → Continue.dev format".into());
339                return Ok(DetectedSchema {
340                    schema_id: "continue-dev-json-v1".into(),
341                    confidence: 0.9,
342                    evidence,
343                    detected_version: None,
344                });
345            }
346
347            // Check for Gemini CLI format (contents with parts)
348            if val.get("contents").is_some() {
349                evidence.push("Has 'contents' field → Gemini format".into());
350                return Ok(DetectedSchema {
351                    schema_id: "gemini-cli-json-v1".into(),
352                    confidence: 0.85,
353                    evidence,
354                    detected_version: None,
355                });
356            }
357
358            // Check for OpenAI API format
359            if val.get("messages").is_some() && val.get("model").is_some() {
360                evidence.push("Has 'messages' and 'model' → OpenAI API format".into());
361                return Ok(DetectedSchema {
362                    schema_id: "openai-api-openai-api-v1".into(),
363                    confidence: 0.9,
364                    evidence,
365                    detected_version: None,
366                });
367            }
368        } else {
369            evidence.push("Failed to parse as JSON".into());
370        }
371
372        Ok(DetectedSchema {
373            schema_id: "unknown".into(),
374            confidence: 0.0,
375            evidence,
376            detected_version: None,
377        })
378    }
379}
380
381impl Default for SchemaRegistry {
382    fn default() -> Self {
383        Self::new()
384    }
385}
386
387// ============================================================================
388// Detection Result
389// ============================================================================
390
391/// Result of schema auto-detection
392#[derive(Debug, Clone, Serialize, Deserialize)]
393pub struct DetectedSchema {
394    /// Best-matching schema version ID
395    pub schema_id: String,
396    /// Confidence score (0.0 – 1.0)
397    pub confidence: f64,
398    /// Evidence that led to this conclusion
399    pub evidence: Vec<String>,
400    /// Detected extension version (if extractable)
401    pub detected_version: Option<String>,
402}
403
404// ============================================================================
405// Registry Export (for serialization)
406// ============================================================================
407
408#[derive(Serialize, Deserialize)]
409struct RegistryExport {
410    version: String,
411    schema_count: usize,
412    schemas: Vec<ProviderSchema>,
413    ontology: Ontology,
414}