Skip to main content

cortex_runtime/compiler/
unifier.rs

1//! Cross-site schema unification via Schema.org types.
2//!
3//! Schema.org types are universal — Product on Amazon = Product on Best Buy.
4//! The unifier merges compiled schemas from multiple sites into a unified schema
5//! where queries can span all compiled domains.
6
7use crate::compiler::models::*;
8use serde::{Deserialize, Serialize};
9use std::collections::{BTreeSet, HashMap};
10
/// A unified schema spanning multiple compiled sites.
///
/// Produced by `unify_schemas`; serializable via serde so it can be stored or
/// handed to code generators such as `generate_universal_python`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UnifiedSchema {
    /// Unified models (one per Schema.org type, combining all domains),
    /// ordered by `total_instances`, largest first.
    pub models: Vec<UnifiedModel>,
    /// All domains included in this unified schema: one entry per input
    /// schema, in input order.
    pub domains: Vec<String>,
    /// Total node instances across all domains (sum of every model's
    /// `total_instances`).
    pub total_instances: usize,
}
21
/// A unified model combining instances from multiple domains.
///
/// All per-domain models that share one Schema.org type are merged into a
/// single `UnifiedModel`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UnifiedModel {
    /// Model name (e.g., "Product"), taken from the contributing per-domain
    /// models. `generate_universal_python` uses it as the Python class name.
    pub name: String,
    /// Schema.org type; this is the key the per-domain models were grouped by.
    pub schema_org_type: String,
    /// Union of all fields across all domains, keyed by canonical field name.
    pub fields: Vec<UnifiedField>,
    /// Which domains contribute to this model (one entry per contributing
    /// per-domain model).
    pub sources: Vec<ModelSource>,
    /// Total instances across all domains (sum of the sources'
    /// `instance_count`).
    pub total_instances: usize,
}
36
/// A field in a unified model — tracks which domains have it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UnifiedField {
    /// Canonical Schema.org property name (the output of
    /// `canonicalize_field_name`, so aliases such as "cost" appear as "price").
    pub canonical_name: String,
    /// Inferred type, taken from the first per-domain field that mapped to
    /// this canonical name.
    pub field_type: FieldType,
    /// Which domains have this field: deduplicated and in ascending order.
    pub present_in: Vec<String>,
    /// Share of domains that have this field, expressed as a fraction
    /// (0.0-1.0) rather than a literal percentage. The denominator is chosen
    /// by `unify_schemas`. `generate_universal_python` emits any field whose
    /// coverage is below 1.0 as `Optional[...]`.
    pub coverage: f32,
}
49
/// Info about a domain's contribution to a unified model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSource {
    /// Domain name, copied from the compiled schema the model came from.
    pub domain: String,
    /// Number of instances of this model on this domain.
    pub instance_count: usize,
    /// Share of the unified model's fields that this domain provides,
    /// expressed as a fraction rather than a literal percentage; 0.0 when the
    /// unified model has no fields at all.
    pub field_coverage: f32,
}
60
61/// Unify multiple compiled schemas into a single unified schema.
62///
63/// Groups models by Schema.org type, unions all fields, normalizes names,
64/// and records which domains contribute what.
65pub fn unify_schemas(schemas: &[CompiledSchema]) -> UnifiedSchema {
66    if schemas.is_empty() {
67        return UnifiedSchema {
68            models: Vec::new(),
69            domains: Vec::new(),
70            total_instances: 0,
71        };
72    }
73
74    let domains: Vec<String> = schemas.iter().map(|s| s.domain.clone()).collect();
75    let domain_count = domains.len();
76
77    // Group all models by their schema_org_type
78    let mut type_groups: HashMap<String, Vec<(&str, &DataModel)>> = HashMap::new();
79    for schema in schemas {
80        for model in &schema.models {
81            type_groups
82                .entry(model.schema_org_type.clone())
83                .or_default()
84                .push((&schema.domain, model));
85        }
86    }
87
88    let mut unified_models: Vec<UnifiedModel> = Vec::new();
89
90    for (schema_type, models) in &type_groups {
91        // Union all fields
92        let mut field_map: HashMap<String, (FieldType, BTreeSet<String>)> = HashMap::new();
93
94        for (domain, model) in models {
95            for field in &model.fields {
96                let canonical = canonicalize_field_name(&field.name);
97                let entry = field_map
98                    .entry(canonical.clone())
99                    .or_insert_with(|| (field.field_type.clone(), BTreeSet::new()));
100                entry.1.insert(domain.to_string());
101            }
102        }
103
104        let unified_fields: Vec<UnifiedField> = field_map
105            .into_iter()
106            .map(|(name, (field_type, present_in))| {
107                let coverage = present_in.len() as f32 / domain_count as f32;
108                UnifiedField {
109                    canonical_name: name,
110                    field_type,
111                    present_in: present_in.into_iter().collect(),
112                    coverage,
113                }
114            })
115            .collect();
116
117        let unified_field_count = unified_fields.len();
118
119        // Build sources
120        let sources: Vec<ModelSource> = models
121            .iter()
122            .map(|(domain, model)| {
123                let field_coverage = if unified_field_count > 0 {
124                    model.fields.len() as f32 / unified_field_count as f32
125                } else {
126                    0.0
127                };
128                ModelSource {
129                    domain: domain.to_string(),
130                    instance_count: model.instance_count,
131                    field_coverage,
132                }
133            })
134            .collect();
135
136        let total_instances: usize = models.iter().map(|(_, m)| m.instance_count).sum();
137
138        // Use the most common model name
139        let name = models[0].1.name.clone();
140
141        unified_models.push(UnifiedModel {
142            name,
143            schema_org_type: schema_type.clone(),
144            fields: unified_fields,
145            sources,
146            total_instances,
147        });
148    }
149
150    // Sort by total instances (most significant first)
151    unified_models.sort_by(|a, b| b.total_instances.cmp(&a.total_instances));
152
153    let total_instances: usize = unified_models.iter().map(|m| m.total_instances).sum();
154
155    UnifiedSchema {
156        models: unified_models,
157        domains,
158        total_instances,
159    }
160}
161
/// Map a site-specific field name onto its canonical Schema.org-style name.
///
/// Matching is exact and case-sensitive. A name that belongs to no alias group
/// is returned unchanged.
fn canonicalize_field_name(name: &str) -> String {
    // Each entry pairs a canonical name with every spelling that maps to it
    // (the canonical spelling itself included).
    const ALIAS_GROUPS: &[(&str, &[&str])] = &[
        ("price", &["price", "cost", "amount"]),
        ("rating", &["rating", "score", "stars"]),
        ("name", &["name", "title", "label"]),
        ("url", &["url", "link", "href"]),
        ("node_id", &["node_id", "id", "nodeId"]),
        ("image_url", &["image_url", "image", "thumbnail", "picture"]),
        ("description", &["description", "desc", "summary", "blurb"]),
        ("category", &["category", "type", "kind"]),
        ("brand", &["brand", "manufacturer", "maker"]),
        ("availability", &["availability", "in_stock", "stock"]),
    ];

    ALIAS_GROUPS
        .iter()
        .find(|(_, aliases)| aliases.iter().any(|&alias| alias == name))
        .map_or(name, |&(canonical, _)| canonical)
        .to_string()
}
179
180/// Generate a universal Python client that queries all compiled sites.
181pub fn generate_universal_python(unified: &UnifiedSchema) -> String {
182    let mut out = String::new();
183
184    out.push_str("\"\"\"Universal Cortex client — queries all compiled sites\"\"\"\n");
185    out.push_str("# Generated by Cortex Web Compiler — do not edit manually\n\n");
186    out.push_str("from __future__ import annotations\n");
187    out.push_str("from dataclasses import dataclass, field\n");
188    out.push_str("from typing import Optional, List, Any\n");
189    out.push_str("import importlib\n\n");
190
191    // Domain registry
192    out.push_str("_COMPILED_DOMAINS = [\n");
193    for domain in &unified.domains {
194        let module = domain.replace(['.', '-'], "_");
195        out.push_str(&format!(
196            "    (\"{domain}\", \"cortex.compiled.{module}\"),\n"
197        ));
198    }
199    out.push_str("]\n\n");
200
201    out.push_str(
202        r#"def _get_compiled_sites(domains=None):
203    """Load compiled site modules."""
204    sites = []
205    for domain, module_name in _COMPILED_DOMAINS:
206        if domains and domain not in domains:
207            continue
208        try:
209            mod = importlib.import_module(module_name)
210            sites.append((domain, mod))
211        except ImportError:
212            continue
213    return sites
214
215"#,
216    );
217
218    // Generate unified dataclasses
219    for model in &unified.models {
220        out.push_str(&format!("\n@dataclass\nclass {}:\n", model.name));
221        out.push_str(&format!(
222            "    \"\"\"Unified {} from {} sites\"\"\"\n",
223            model.name,
224            model.sources.len()
225        ));
226        out.push_str("    url: str\n");
227        out.push_str("    source_domain: str\n");
228
229        for field in &model.fields {
230            if field.canonical_name == "url" || field.canonical_name == "node_id" {
231                continue;
232            }
233            let py_type = field.field_type.to_python_type();
234            if field.coverage < 1.0 {
235                out.push_str(&format!(
236                    "    {}: Optional[{}] = None\n",
237                    field.canonical_name, py_type
238                ));
239            } else {
240                out.push_str(&format!(
241                    "    {}: {} = \"\"\n",
242                    field.canonical_name, py_type
243                ));
244            }
245        }
246
247        // search method
248        out.push_str(&format!(
249            r#"
250    @staticmethod
251    def search(query: str, domains: List[str] = None, **filters) -> List[{name}]:
252        """Search {name}s across all compiled sites."""
253        results = []
254        for domain, site_module in _get_compiled_sites(domains):
255            try:
256                if hasattr(site_module, '{name}'):
257                    site_cls = getattr(site_module, '{name}')
258                    site_results = site_cls.search(query, **filters)
259                    results.extend([
260                        {name}(url=r.url, source_domain=domain, **{{
261                            k: getattr(r, k, None) for k in {name}.__dataclass_fields__
262                            if k not in ('url', 'source_domain')
263                        }})
264                        for r in site_results
265                    ])
266            except Exception:
267                continue
268        return results
269
270"#,
271            name = model.name
272        ));
273    }
274
275    out
276}
277
// Unit tests for schema unification, field-name canonicalization and the
// generated universal Python client.
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Build a `CompiledSchema` for `domain` holding `models`.
    ///
    /// Actions and relationships are left empty and the stats are zeroed; the
    /// tests below only inspect the domain and its models.
    fn make_schema(domain: &str, models: Vec<DataModel>) -> CompiledSchema {
        CompiledSchema {
            domain: domain.to_string(),
            compiled_at: Utc::now(),
            models,
            actions: Vec::new(),
            relationships: Vec::new(),
            stats: SchemaStats {
                total_models: 0,
                total_fields: 0,
                total_instances: 0,
                avg_confidence: 0.0,
            },
        }
    }

    /// Build a "Product" model with `url`, `name` and `price` fields and the
    /// given number of instances.
    fn product_model(instance_count: usize) -> DataModel {
        DataModel {
            name: "Product".to_string(),
            schema_org_type: "Product".to_string(),
            fields: vec![
                ModelField {
                    name: "url".to_string(),
                    field_type: FieldType::Url,
                    source: FieldSource::Inferred,
                    confidence: 1.0,
                    nullable: false,
                    example_values: vec![],
                    feature_dim: None,
                },
                ModelField {
                    name: "name".to_string(),
                    field_type: FieldType::String,
                    source: FieldSource::JsonLd,
                    confidence: 0.99,
                    nullable: false,
                    example_values: vec![],
                    feature_dim: None,
                },
                ModelField {
                    name: "price".to_string(),
                    field_type: FieldType::Float,
                    source: FieldSource::JsonLd,
                    confidence: 0.99,
                    nullable: true,
                    example_values: vec![],
                    feature_dim: Some(48),
                },
            ],
            instance_count,
            example_urls: vec![],
            search_action: None,
            list_url: None,
        }
    }

    /// Two domains exposing the same Product model collapse into one unified
    /// model whose instance count is the sum of both.
    #[test]
    fn test_unify_two_schemas() {
        let schemas = vec![
            make_schema("amazon.com", vec![product_model(50000)]),
            make_schema("bestbuy.com", vec![product_model(20000)]),
        ];

        let unified = unify_schemas(&schemas);
        assert_eq!(unified.domains.len(), 2);
        assert_eq!(unified.models.len(), 1);

        let product = &unified.models[0];
        assert_eq!(product.name, "Product");
        assert_eq!(product.total_instances, 70000);
        assert_eq!(product.sources.len(), 2);
    }

    /// An empty input produces an empty unified schema.
    #[test]
    fn test_unify_empty() {
        let unified = unify_schemas(&[]);
        assert!(unified.models.is_empty());
        assert_eq!(unified.total_instances, 0);
    }

    /// Canonical names map to themselves and known aliases map to their
    /// canonical name.
    #[test]
    fn test_canonicalize_field_name() {
        assert_eq!(canonicalize_field_name("price"), "price");
        assert_eq!(canonicalize_field_name("cost"), "price");
        assert_eq!(canonicalize_field_name("rating"), "rating");
        assert_eq!(canonicalize_field_name("stars"), "rating");
        assert_eq!(canonicalize_field_name("title"), "name");
    }

    /// When every domain provides every field, each unified field reports
    /// full coverage.
    #[test]
    fn test_unified_field_coverage() {
        let schemas = vec![
            make_schema("a.com", vec![product_model(100)]),
            make_schema("b.com", vec![product_model(200)]),
        ];

        let unified = unify_schemas(&schemas);
        let product = &unified.models[0];

        // All fields present in both sites → coverage = 1.0
        for field in &product.fields {
            assert_eq!(
                field.coverage, 1.0,
                "field {} should have full coverage",
                field.canonical_name
            );
        }
    }

    /// The generated Python contains the unified class, the `source_domain`
    /// field, a `search` method and the domain registry.
    #[test]
    fn test_generate_universal_python() {
        let schemas = vec![
            make_schema("amazon.com", vec![product_model(50000)]),
            make_schema("bestbuy.com", vec![product_model(20000)]),
        ];
        let unified = unify_schemas(&schemas);
        let code = generate_universal_python(&unified);

        assert!(code.contains("class Product:"));
        assert!(code.contains("source_domain: str"));
        assert!(code.contains("def search("));
        assert!(code.contains("_COMPILED_DOMAINS"));
    }

    // ── v4 Test Suite: Phase 1C — Cross-Site Unification ──

    /// Ten domains with the same model type still unify into a single model
    /// with ten sources.
    #[test]
    fn test_v4_unify_many_domains() {
        let schemas: Vec<CompiledSchema> = (0..10)
            .map(|i| make_schema(&format!("site{i}.com"), vec![product_model(100 * (i + 1))]))
            .collect();

        let unified = unify_schemas(&schemas);
        assert_eq!(unified.domains.len(), 10);
        assert_eq!(unified.models.len(), 1);

        let product = &unified.models[0];
        assert_eq!(product.sources.len(), 10);
        assert_eq!(product.total_instances, 5500); // sum of 100, 200, ..., 1000
    }

    /// Models of different Schema.org types stay separate; each unified model
    /// only aggregates the domains that actually expose that type.
    #[test]
    fn test_v4_unify_different_model_types() {
        let article_model = DataModel {
            name: "Article".to_string(),
            schema_org_type: "Article".to_string(),
            fields: vec![ModelField {
                name: "title".to_string(),
                field_type: FieldType::String,
                source: FieldSource::JsonLd,
                confidence: 0.95,
                nullable: false,
                example_values: vec![],
                feature_dim: None,
            }],
            instance_count: 1000,
            example_urls: vec![],
            search_action: None,
            list_url: None,
        };

        let schemas = vec![
            make_schema("amazon.com", vec![product_model(5000)]),
            make_schema("bbc.com", vec![article_model.clone()]),
            make_schema("bestbuy.com", vec![product_model(3000)]),
            make_schema("cnn.com", vec![article_model]),
        ];

        let unified = unify_schemas(&schemas);
        assert_eq!(unified.domains.len(), 4);
        assert_eq!(unified.models.len(), 2, "should have Product and Article");

        // Look models up by name: the order of `unified.models` is not what
        // this test is about.
        let product = unified.models.iter().find(|m| m.name == "Product").unwrap();
        assert_eq!(product.sources.len(), 2);
        assert_eq!(product.total_instances, 8000);

        let article = unified.models.iter().find(|m| m.name == "Article").unwrap();
        assert_eq!(article.sources.len(), 2);
        assert_eq!(article.total_instances, 2000);
    }

    /// The generated Python client declares one class per unified model type.
    #[test]
    fn test_v4_universal_python_multi_type() {
        let article_model = DataModel {
            name: "Article".to_string(),
            schema_org_type: "Article".to_string(),
            fields: vec![ModelField {
                name: "title".to_string(),
                field_type: FieldType::String,
                source: FieldSource::JsonLd,
                confidence: 0.9,
                nullable: false,
                example_values: vec![],
                feature_dim: None,
            }],
            instance_count: 500,
            example_urls: vec![],
            search_action: None,
            list_url: None,
        };

        let schemas = vec![
            make_schema("amazon.com", vec![product_model(5000)]),
            make_schema("bbc.com", vec![article_model]),
        ];

        let unified = unify_schemas(&schemas);
        let code = generate_universal_python(&unified);

        assert!(code.contains("class Product:"), "Product class");
        assert!(code.contains("class Article:"), "Article class");
        assert!(code.contains("_COMPILED_DOMAINS"), "domain registry");
    }
}