1use crate::compiler::models::*;
8use serde::{Deserialize, Serialize};
9use std::collections::{BTreeSet, HashMap};
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct UnifiedSchema {
14 pub models: Vec<UnifiedModel>,
16 pub domains: Vec<String>,
18 pub total_instances: usize,
20}
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct UnifiedModel {
25 pub name: String,
27 pub schema_org_type: String,
29 pub fields: Vec<UnifiedField>,
31 pub sources: Vec<ModelSource>,
33 pub total_instances: usize,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct UnifiedField {
40 pub canonical_name: String,
42 pub field_type: FieldType,
44 pub present_in: Vec<String>,
46 pub coverage: f32,
48}
49
50#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct ModelSource {
53 pub domain: String,
55 pub instance_count: usize,
57 pub field_coverage: f32,
59}
60
61pub fn unify_schemas(schemas: &[CompiledSchema]) -> UnifiedSchema {
66 if schemas.is_empty() {
67 return UnifiedSchema {
68 models: Vec::new(),
69 domains: Vec::new(),
70 total_instances: 0,
71 };
72 }
73
74 let domains: Vec<String> = schemas.iter().map(|s| s.domain.clone()).collect();
75 let domain_count = domains.len();
76
77 let mut type_groups: HashMap<String, Vec<(&str, &DataModel)>> = HashMap::new();
79 for schema in schemas {
80 for model in &schema.models {
81 type_groups
82 .entry(model.schema_org_type.clone())
83 .or_default()
84 .push((&schema.domain, model));
85 }
86 }
87
88 let mut unified_models: Vec<UnifiedModel> = Vec::new();
89
90 for (schema_type, models) in &type_groups {
91 let mut field_map: HashMap<String, (FieldType, BTreeSet<String>)> = HashMap::new();
93
94 for (domain, model) in models {
95 for field in &model.fields {
96 let canonical = canonicalize_field_name(&field.name);
97 let entry = field_map
98 .entry(canonical.clone())
99 .or_insert_with(|| (field.field_type.clone(), BTreeSet::new()));
100 entry.1.insert(domain.to_string());
101 }
102 }
103
104 let unified_fields: Vec<UnifiedField> = field_map
105 .into_iter()
106 .map(|(name, (field_type, present_in))| {
107 let coverage = present_in.len() as f32 / domain_count as f32;
108 UnifiedField {
109 canonical_name: name,
110 field_type,
111 present_in: present_in.into_iter().collect(),
112 coverage,
113 }
114 })
115 .collect();
116
117 let unified_field_count = unified_fields.len();
118
119 let sources: Vec<ModelSource> = models
121 .iter()
122 .map(|(domain, model)| {
123 let field_coverage = if unified_field_count > 0 {
124 model.fields.len() as f32 / unified_field_count as f32
125 } else {
126 0.0
127 };
128 ModelSource {
129 domain: domain.to_string(),
130 instance_count: model.instance_count,
131 field_coverage,
132 }
133 })
134 .collect();
135
136 let total_instances: usize = models.iter().map(|(_, m)| m.instance_count).sum();
137
138 let name = models[0].1.name.clone();
140
141 unified_models.push(UnifiedModel {
142 name,
143 schema_org_type: schema_type.clone(),
144 fields: unified_fields,
145 sources,
146 total_instances,
147 });
148 }
149
150 unified_models.sort_by(|a, b| b.total_instances.cmp(&a.total_instances));
152
153 let total_instances: usize = unified_models.iter().map(|m| m.total_instances).sum();
154
155 UnifiedSchema {
156 models: unified_models,
157 domains,
158 total_instances,
159 }
160}
161
/// Maps a site-specific field name onto the shared cross-site vocabulary.
///
/// Matching is exact (case-sensitive). A name that is not a known synonym is
/// returned unchanged.
fn canonicalize_field_name(name: &str) -> String {
    // (canonical name, every spelling that collapses onto it)
    const SYNONYMS: &[(&str, &[&str])] = &[
        ("price", &["price", "cost", "amount"]),
        ("rating", &["rating", "score", "stars"]),
        ("name", &["name", "title", "label"]),
        ("url", &["url", "link", "href"]),
        ("node_id", &["node_id", "id", "nodeId"]),
        ("image_url", &["image_url", "image", "thumbnail", "picture"]),
        ("description", &["description", "desc", "summary", "blurb"]),
        ("category", &["category", "type", "kind"]),
        ("brand", &["brand", "manufacturer", "maker"]),
        ("availability", &["availability", "in_stock", "stock"]),
    ];

    SYNONYMS
        .iter()
        .find(|(_, aliases)| aliases.contains(&name))
        .map_or(name, |(canonical, _)| *canonical)
        .to_owned()
}
179
180pub fn generate_universal_python(unified: &UnifiedSchema) -> String {
182 let mut out = String::new();
183
184 out.push_str("\"\"\"Universal Cortex client — queries all compiled sites\"\"\"\n");
185 out.push_str("# Generated by Cortex Web Compiler — do not edit manually\n\n");
186 out.push_str("from __future__ import annotations\n");
187 out.push_str("from dataclasses import dataclass, field\n");
188 out.push_str("from typing import Optional, List, Any\n");
189 out.push_str("import importlib\n\n");
190
191 out.push_str("_COMPILED_DOMAINS = [\n");
193 for domain in &unified.domains {
194 let module = domain.replace(['.', '-'], "_");
195 out.push_str(&format!(
196 " (\"{domain}\", \"cortex.compiled.{module}\"),\n"
197 ));
198 }
199 out.push_str("]\n\n");
200
201 out.push_str(
202 r#"def _get_compiled_sites(domains=None):
203 """Load compiled site modules."""
204 sites = []
205 for domain, module_name in _COMPILED_DOMAINS:
206 if domains and domain not in domains:
207 continue
208 try:
209 mod = importlib.import_module(module_name)
210 sites.append((domain, mod))
211 except ImportError:
212 continue
213 return sites
214
215"#,
216 );
217
218 for model in &unified.models {
220 out.push_str(&format!("\n@dataclass\nclass {}:\n", model.name));
221 out.push_str(&format!(
222 " \"\"\"Unified {} from {} sites\"\"\"\n",
223 model.name,
224 model.sources.len()
225 ));
226 out.push_str(" url: str\n");
227 out.push_str(" source_domain: str\n");
228
229 for field in &model.fields {
230 if field.canonical_name == "url" || field.canonical_name == "node_id" {
231 continue;
232 }
233 let py_type = field.field_type.to_python_type();
234 if field.coverage < 1.0 {
235 out.push_str(&format!(
236 " {}: Optional[{}] = None\n",
237 field.canonical_name, py_type
238 ));
239 } else {
240 out.push_str(&format!(
241 " {}: {} = \"\"\n",
242 field.canonical_name, py_type
243 ));
244 }
245 }
246
247 out.push_str(&format!(
249 r#"
250 @staticmethod
251 def search(query: str, domains: List[str] = None, **filters) -> List[{name}]:
252 """Search {name}s across all compiled sites."""
253 results = []
254 for domain, site_module in _get_compiled_sites(domains):
255 try:
256 if hasattr(site_module, '{name}'):
257 site_cls = getattr(site_module, '{name}')
258 site_results = site_cls.search(query, **filters)
259 results.extend([
260 {name}(url=r.url, source_domain=domain, **{{
261 k: getattr(r, k, None) for k in {name}.__dataclass_fields__
262 if k not in ('url', 'source_domain')
263 }})
264 for r in site_results
265 ])
266 except Exception:
267 continue
268 return results
269
270"#,
271 name = model.name
272 ));
273 }
274
275 out
276}
277
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Wraps `models` in a `CompiledSchema` for `domain` with zeroed stats.
    fn make_schema(domain: &str, models: Vec<DataModel>) -> CompiledSchema {
        let stats = SchemaStats {
            total_models: 0,
            total_fields: 0,
            total_instances: 0,
            avg_confidence: 0.0,
        };
        CompiledSchema {
            domain: domain.to_owned(),
            compiled_at: Utc::now(),
            models,
            actions: Vec::new(),
            relationships: Vec::new(),
            stats,
        }
    }

    /// A `Product` model exposing `url`, `name` and `price`.
    fn product_model(instance_count: usize) -> DataModel {
        let url = ModelField {
            name: "url".to_owned(),
            field_type: FieldType::Url,
            source: FieldSource::Inferred,
            confidence: 1.0,
            nullable: false,
            example_values: Vec::new(),
            feature_dim: None,
        };
        let name = ModelField {
            name: "name".to_owned(),
            field_type: FieldType::String,
            source: FieldSource::JsonLd,
            confidence: 0.99,
            nullable: false,
            example_values: Vec::new(),
            feature_dim: None,
        };
        let price = ModelField {
            name: "price".to_owned(),
            field_type: FieldType::Float,
            source: FieldSource::JsonLd,
            confidence: 0.99,
            nullable: true,
            example_values: Vec::new(),
            feature_dim: Some(48),
        };

        DataModel {
            name: "Product".to_owned(),
            schema_org_type: "Product".to_owned(),
            fields: vec![url, name, price],
            instance_count,
            example_urls: Vec::new(),
            search_action: None,
            list_url: None,
        }
    }

    /// An `Article` model with a single `title` field (confidence 0.95).
    fn article_model(instance_count: usize) -> DataModel {
        let title = ModelField {
            name: "title".to_owned(),
            field_type: FieldType::String,
            source: FieldSource::JsonLd,
            confidence: 0.95,
            nullable: false,
            example_values: Vec::new(),
            feature_dim: None,
        };

        DataModel {
            name: "Article".to_owned(),
            schema_org_type: "Article".to_owned(),
            fields: vec![title],
            instance_count,
            example_urls: Vec::new(),
            search_action: None,
            list_url: None,
        }
    }

    #[test]
    fn test_unify_two_schemas() {
        let schemas = vec![
            make_schema("amazon.com", vec![product_model(50000)]),
            make_schema("bestbuy.com", vec![product_model(20000)]),
        ];

        let unified = unify_schemas(&schemas);
        assert_eq!(unified.domains.len(), 2);
        assert_eq!(unified.models.len(), 1);

        let product = &unified.models[0];
        assert_eq!(product.name, "Product");
        assert_eq!(product.total_instances, 70000);
        assert_eq!(product.sources.len(), 2);
    }

    #[test]
    fn test_unify_empty() {
        let unified = unify_schemas(&[]);
        assert!(unified.models.is_empty());
        assert_eq!(unified.total_instances, 0);
    }

    #[test]
    fn test_canonicalize_field_name() {
        let expectations = [
            ("price", "price"),
            ("cost", "price"),
            ("rating", "rating"),
            ("stars", "rating"),
            ("title", "name"),
        ];
        for (raw, canonical) in expectations {
            assert_eq!(canonicalize_field_name(raw), canonical);
        }
    }

    #[test]
    fn test_unified_field_coverage() {
        let schemas = vec![
            make_schema("a.com", vec![product_model(100)]),
            make_schema("b.com", vec![product_model(200)]),
        ];

        let unified = unify_schemas(&schemas);
        let product = &unified.models[0];

        // Both sites expose the same three fields, so each is fully covered.
        for field in &product.fields {
            assert_eq!(
                field.coverage, 1.0,
                "field {} should have full coverage",
                field.canonical_name
            );
        }
    }

    #[test]
    fn test_generate_universal_python() {
        let schemas = vec![
            make_schema("amazon.com", vec![product_model(50000)]),
            make_schema("bestbuy.com", vec![product_model(20000)]),
        ];
        let code = generate_universal_python(&unify_schemas(&schemas));

        for needle in [
            "class Product:",
            "source_domain: str",
            "def search(",
            "_COMPILED_DOMAINS",
        ] {
            assert!(code.contains(needle));
        }
    }

    #[test]
    fn test_v4_unify_many_domains() {
        let mut schemas: Vec<CompiledSchema> = Vec::new();
        for i in 0..10 {
            let domain = format!("site{i}.com");
            schemas.push(make_schema(&domain, vec![product_model(100 * (i + 1))]));
        }

        let unified = unify_schemas(&schemas);
        assert_eq!(unified.domains.len(), 10);
        assert_eq!(unified.models.len(), 1);

        let product = &unified.models[0];
        assert_eq!(product.sources.len(), 10);
        // 100 + 200 + ... + 1000
        assert_eq!(product.total_instances, 5500);
    }

    #[test]
    fn test_v4_unify_different_model_types() {
        let schemas = vec![
            make_schema("amazon.com", vec![product_model(5000)]),
            make_schema("bbc.com", vec![article_model(1000)]),
            make_schema("bestbuy.com", vec![product_model(3000)]),
            make_schema("cnn.com", vec![article_model(1000)]),
        ];

        let unified = unify_schemas(&schemas);
        assert_eq!(unified.domains.len(), 4);
        assert_eq!(unified.models.len(), 2, "should have Product and Article");

        let product = unified.models.iter().find(|m| m.name == "Product").unwrap();
        assert_eq!(product.sources.len(), 2);
        assert_eq!(product.total_instances, 8000);

        let article = unified.models.iter().find(|m| m.name == "Article").unwrap();
        assert_eq!(article.sources.len(), 2);
        assert_eq!(article.total_instances, 2000);
    }

    #[test]
    fn test_v4_universal_python_multi_type() {
        let mut article = article_model(500);
        article.fields[0].confidence = 0.9;

        let schemas = vec![
            make_schema("amazon.com", vec![product_model(5000)]),
            make_schema("bbc.com", vec![article]),
        ];

        let unified = unify_schemas(&schemas);
        let code = generate_universal_python(&unified);

        assert!(code.contains("class Product:"), "Product class");
        assert!(code.contains("class Article:"), "Article class");
        assert!(code.contains("_COMPILED_DOMAINS"), "domain registry");
    }
}