use std::collections::HashMap;
use std::path::Path;

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};

use crate::schema::ontology::Ontology;
use crate::schema::types::*;
use crate::schema::versions;
18
19pub struct SchemaRegistry {
37 schemas: HashMap<String, ProviderSchema>,
39 ontology: Ontology,
41}
42
43impl SchemaRegistry {
44 pub fn new() -> Self {
46 let all_schemas = versions::build_all_provider_schemas();
47 let mut schemas = HashMap::new();
48
49 for schema in all_schemas {
50 schemas.insert(schema.id(), schema);
51 }
52
53 Self {
54 schemas,
55 ontology: Ontology::build(),
56 }
57 }
58
59 pub fn ontology(&self) -> &Ontology {
61 &self.ontology
62 }
63
64 pub fn list_schemas(&self) -> Vec<&ProviderSchema> {
66 let mut schemas: Vec<&ProviderSchema> = self.schemas.values().collect();
67 schemas.sort_by_key(|s| s.id());
68 schemas
69 }
70
71 pub fn get_schema(&self, id: &str) -> Option<&ProviderSchema> {
73 self.schemas.get(id)
74 }
75
76 pub fn schemas_for_provider(&self, provider: &str) -> Vec<&ProviderSchema> {
78 self.schemas
79 .values()
80 .filter(|s| s.version.provider == provider)
81 .collect()
82 }
83
84 pub fn register_schema(&mut self, schema: ProviderSchema) {
86 self.schemas.insert(schema.id(), schema);
87 }
88
89 pub fn detect_schema_from_file(&self, path: &Path) -> Result<DetectedSchema> {
91 let extension = path.extension().and_then(|e| e.to_str()).unwrap_or("");
92
93 let content = std::fs::read_to_string(path)
94 .with_context(|| format!("Failed to read {}", path.display()))?;
95
96 match extension {
97 "jsonl" => self.detect_jsonl_schema(&content, path),
98 "json" => self.detect_json_schema(&content, path),
99 _ => Ok(DetectedSchema {
100 schema_id: "unknown".into(),
101 confidence: 0.0,
102 evidence: vec![format!("Unknown file extension: .{}", extension)],
103 detected_version: None,
104 }),
105 }
106 }
107
108 pub fn detect_schema_from_workspace(&self, workspace_dir: &Path) -> Result<DetectedSchema> {
110 let chat_sessions = workspace_dir.join("chatSessions");
111
112 if !chat_sessions.exists() {
113 return Ok(DetectedSchema {
114 schema_id: "unknown".into(),
115 confidence: 0.0,
116 evidence: vec!["No chatSessions directory found".into()],
117 detected_version: None,
118 });
119 }
120
121 let mut has_jsonl = false;
123 let mut has_json = false;
124 let mut jsonl_count = 0;
125 let mut json_count = 0;
126
127 if let Ok(entries) = std::fs::read_dir(&chat_sessions) {
128 for entry in entries.flatten() {
129 let path = entry.path();
130 match path.extension().and_then(|e| e.to_str()) {
131 Some("jsonl") => {
132 has_jsonl = true;
133 jsonl_count += 1;
134 }
135 Some("json") => {
136 let name = path.file_name().unwrap_or_default().to_string_lossy();
138 if !name.contains(".bak") && !name.contains(".pre-") {
139 has_json = true;
140 json_count += 1;
141 }
142 }
143 _ => {}
144 }
145 }
146 }
147
148 let mut evidence = Vec::new();
149
150 if has_jsonl && !has_json {
151 evidence.push(format!(
152 "Found {} .jsonl files, no .json files → JSONL format",
153 jsonl_count
154 ));
155 return Ok(DetectedSchema {
156 schema_id: "copilot-jsonl-v1".into(),
157 confidence: 0.95,
158 evidence,
159 detected_version: None,
160 });
161 }
162
163 if has_json && !has_jsonl {
164 evidence.push(format!(
165 "Found {} .json files, no .jsonl files → JSON format",
166 json_count
167 ));
168 return Ok(DetectedSchema {
169 schema_id: "copilot-json-v3".into(),
170 confidence: 0.95,
171 evidence,
172 detected_version: None,
173 });
174 }
175
176 if has_jsonl && has_json {
177 evidence.push(format!(
178 "Found both .jsonl ({}) and .json ({}) files → mixed / transitional",
179 jsonl_count, json_count
180 ));
181
182 let schema_id = if jsonl_count >= json_count {
184 "copilot-jsonl-v1"
185 } else {
186 "copilot-json-v3"
187 };
188
189 return Ok(DetectedSchema {
190 schema_id: schema_id.into(),
191 confidence: 0.7,
192 evidence,
193 detected_version: None,
194 });
195 }
196
197 Ok(DetectedSchema {
198 schema_id: "unknown".into(),
199 confidence: 0.0,
200 evidence: vec!["No session files found".into()],
201 detected_version: None,
202 })
203 }
204
205 pub fn to_json(&self) -> Result<String> {
207 let export = RegistryExport {
208 version: "2.0.0".into(),
209 schema_count: self.schemas.len(),
210 schemas: self.list_schemas().into_iter().cloned().collect(),
211 ontology: self.ontology.clone(),
212 };
213
214 serde_json::to_string_pretty(&export).map_err(Into::into)
215 }
216
217 pub fn to_json_compact(&self) -> Result<String> {
219 let export = RegistryExport {
220 version: "2.0.0".into(),
221 schema_count: self.schemas.len(),
222 schemas: self.list_schemas().into_iter().cloned().collect(),
223 ontology: self.ontology.clone(),
224 };
225
226 serde_json::to_string(&export).map_err(Into::into)
227 }
228
229 fn detect_jsonl_schema(&self, content: &str, _path: &Path) -> Result<DetectedSchema> {
234 let first_line = content.lines().next().unwrap_or("");
235 let mut evidence = Vec::new();
236
237 if let Ok(val) = serde_json::from_str::<serde_json::Value>(first_line) {
239 if val.get("kind").is_some() {
240 evidence.push("First line has 'kind' field → Copilot JSONL event format".into());
241
242 let kind = val.get("kind").and_then(|k| k.as_u64()).unwrap_or(99);
243 if kind == 0 {
244 evidence.push("kind=0 → full session snapshot (expected first line)".into());
245 }
246
247 if let Some(data) = val.get("data") {
249 if let Some(version) = data.get("version").and_then(|v| v.as_u64()) {
250 evidence.push(format!(
251 "data.version = {} → session format version",
252 version
253 ));
254 }
255 }
256
257 let ext_version = val
259 .get("data")
260 .and_then(|d| d.get("requests"))
261 .and_then(|r| r.as_array())
262 .and_then(|arr| arr.first())
263 .and_then(|req| req.get("result"))
264 .and_then(|res| res.get("metadata"))
265 .and_then(|meta| meta.get("extensionVersion"))
266 .and_then(|v| v.as_str())
267 .map(String::from);
268
269 return Ok(DetectedSchema {
270 schema_id: "copilot-jsonl-v1".into(),
271 confidence: 0.95,
272 evidence,
273 detected_version: ext_version,
274 });
275 }
276
277 if val.get("type").is_some() && val.get("message").is_some() {
279 evidence.push("Has 'type' and 'message' fields → Claude Code format".into());
280 return Ok(DetectedSchema {
281 schema_id: "claude-code-jsonl-v1".into(),
282 confidence: 0.9,
283 evidence,
284 detected_version: None,
285 });
286 }
287
288 if val.get("role").is_some() && val.get("content").is_some() {
290 evidence.push("Has 'role' and 'content' fields → Codex CLI / OpenAI format".into());
291 return Ok(DetectedSchema {
292 schema_id: "codex-cli-jsonl-v1".into(),
293 confidence: 0.8,
294 evidence,
295 detected_version: None,
296 });
297 }
298 }
299
300 evidence.push("Could not identify JSONL format from first line".into());
301 Ok(DetectedSchema {
302 schema_id: "unknown".into(),
303 confidence: 0.0,
304 evidence,
305 detected_version: None,
306 })
307 }
308
309 fn detect_json_schema(&self, content: &str, _path: &Path) -> Result<DetectedSchema> {
310 let mut evidence = Vec::new();
311
312 if let Ok(val) = serde_json::from_str::<serde_json::Value>(content) {
313 if val.get("requests").is_some() {
315 evidence.push("Has 'requests' field → Copilot Chat format".into());
316
317 if let Some(version) = val.get("version").and_then(|v| v.as_u64()) {
318 evidence.push(format!(
319 "version = {} → session format v{}",
320 version, version
321 ));
322 }
323
324 if val.get("creationDate").is_some() {
325 evidence.push("Has 'creationDate' → Copilot JSON v3".into());
326 }
327
328 return Ok(DetectedSchema {
329 schema_id: "copilot-json-v3".into(),
330 confidence: 0.95,
331 evidence,
332 detected_version: None,
333 });
334 }
335
336 if val.get("history").is_some() && val.get("dateCreated").is_some() {
338 evidence.push("Has 'history' and 'dateCreated' → Continue.dev format".into());
339 return Ok(DetectedSchema {
340 schema_id: "continue-dev-json-v1".into(),
341 confidence: 0.9,
342 evidence,
343 detected_version: None,
344 });
345 }
346
347 if val.get("contents").is_some() {
349 evidence.push("Has 'contents' field → Gemini format".into());
350 return Ok(DetectedSchema {
351 schema_id: "gemini-cli-json-v1".into(),
352 confidence: 0.85,
353 evidence,
354 detected_version: None,
355 });
356 }
357
358 if val.get("messages").is_some() && val.get("model").is_some() {
360 evidence.push("Has 'messages' and 'model' → OpenAI API format".into());
361 return Ok(DetectedSchema {
362 schema_id: "openai-api-openai-api-v1".into(),
363 confidence: 0.9,
364 evidence,
365 detected_version: None,
366 });
367 }
368 } else {
369 evidence.push("Failed to parse as JSON".into());
370 }
371
372 Ok(DetectedSchema {
373 schema_id: "unknown".into(),
374 confidence: 0.0,
375 evidence,
376 detected_version: None,
377 })
378 }
379}
380
381impl Default for SchemaRegistry {
382 fn default() -> Self {
383 Self::new()
384 }
385}
386
387#[derive(Debug, Clone, Serialize, Deserialize)]
393pub struct DetectedSchema {
394 pub schema_id: String,
396 pub confidence: f64,
398 pub evidence: Vec<String>,
400 pub detected_version: Option<String>,
402}
403
404#[derive(Serialize, Deserialize)]
409struct RegistryExport {
410 version: String,
411 schema_count: usize,
412 schemas: Vec<ProviderSchema>,
413 ontology: Ontology,
414}