Skip to main content

cortex_runtime/compiler/
schema.rs

1//! Schema inference engine — discovers typed data models from SiteMap structured data.
2//!
3//! Walks every node in the SiteMap, groups by Schema.org type, infers field types,
4//! calculates nullability, and produces a `CompiledSchema` with all discovered models.
5
6use crate::compiler::actions::compile_actions;
7use crate::compiler::models::*;
8use crate::compiler::relationships::infer_relationships;
9use crate::map::types::*;
10use chrono::Utc;
11use std::collections::{BTreeMap, BTreeSet, HashMap};
12
13/// Page type to Schema.org type name mapping.
14fn page_type_to_schema_org(pt: PageType) -> Option<&'static str> {
15    match pt {
16        PageType::ProductDetail => Some("Product"),
17        PageType::ProductListing => Some("ProductListing"),
18        PageType::Article => Some("Article"),
19        PageType::ReviewList => Some("Review"),
20        PageType::Faq => Some("FAQPage"),
21        PageType::AboutPage => Some("Organization"),
22        PageType::ContactPage => Some("ContactPoint"),
23        PageType::PricingPage => Some("Offer"),
24        PageType::Documentation => Some("TechArticle"),
25        PageType::MediaPage => Some("MediaObject"),
26        PageType::Forum => Some("DiscussionForumPosting"),
27        PageType::SocialFeed => Some("SocialMediaPosting"),
28        PageType::Calendar => Some("Event"),
29        PageType::Cart => Some("Cart"),
30        PageType::Checkout => Some("CheckoutPage"),
31        PageType::Account => Some("Account"),
32        PageType::Login => Some("LoginPage"),
33        PageType::Home => Some("WebSite"),
34        PageType::SearchResults => Some("SearchResultsPage"),
35        PageType::Dashboard => Some("Dashboard"),
36        _ => None,
37    }
38}
39
40/// Infer field name and type from a feature dimension for a given page type.
41fn feature_dim_to_field(dim: usize, value: f32, pt: PageType) -> Option<(String, FieldType, f32)> {
42    // Only return fields relevant to this page type with meaningful values
43    if value == 0.0 {
44        return None;
45    }
46
47    match dim {
48        // Commerce fields — primarily for product pages
49        FEAT_PRICE if pt == PageType::ProductDetail || pt == PageType::PricingPage => {
50            Some(("price".to_string(), FieldType::Float, value))
51        }
52        FEAT_PRICE_ORIGINAL if pt == PageType::ProductDetail => {
53            Some(("original_price".to_string(), FieldType::Float, value))
54        }
55        FEAT_DISCOUNT_PCT if pt == PageType::ProductDetail => {
56            Some(("discount_percent".to_string(), FieldType::Float, value))
57        }
58        FEAT_AVAILABILITY if pt == PageType::ProductDetail => Some((
59            "availability".to_string(),
60            FieldType::Enum(vec![
61                "in_stock".to_string(),
62                "out_of_stock".to_string(),
63                "preorder".to_string(),
64            ]),
65            value,
66        )),
67        FEAT_RATING
68            if pt == PageType::ProductDetail
69                || pt == PageType::ReviewList
70                || pt == PageType::Article =>
71        {
72            Some(("rating".to_string(), FieldType::Float, value))
73        }
74        FEAT_REVIEW_COUNT_LOG if pt == PageType::ProductDetail || pt == PageType::ReviewList => {
75            Some(("review_count".to_string(), FieldType::Integer, value))
76        }
77        FEAT_REVIEW_SENTIMENT if pt == PageType::ProductDetail || pt == PageType::ReviewList => {
78            Some(("review_sentiment".to_string(), FieldType::Float, value))
79        }
80        FEAT_SHIPPING_FREE if pt == PageType::ProductDetail => {
81            Some(("free_shipping".to_string(), FieldType::Bool, value))
82        }
83        FEAT_SHIPPING_SPEED if pt == PageType::ProductDetail => {
84            Some(("shipping_speed_days".to_string(), FieldType::Integer, value))
85        }
86        FEAT_SELLER_REPUTATION if pt == PageType::ProductDetail => {
87            Some(("seller_reputation".to_string(), FieldType::Float, value))
88        }
89        FEAT_VARIANT_COUNT if pt == PageType::ProductDetail => {
90            Some(("variant_count".to_string(), FieldType::Integer, value))
91        }
92        FEAT_DEAL_SCORE if pt == PageType::ProductDetail => {
93            Some(("deal_score".to_string(), FieldType::Float, value))
94        }
95        FEAT_CATEGORY_PRICE_PERCENTILE if pt == PageType::ProductDetail => Some((
96            "category_price_percentile".to_string(),
97            FieldType::Float,
98            value,
99        )),
100        // Content fields — for articles/docs
101        FEAT_TEXT_LENGTH_LOG if pt == PageType::Article || pt == PageType::Documentation => {
102            Some(("word_count".to_string(), FieldType::Integer, value))
103        }
104        FEAT_READING_LEVEL if pt == PageType::Article || pt == PageType::Documentation => {
105            Some(("reading_level".to_string(), FieldType::Float, value))
106        }
107        FEAT_SENTIMENT if pt == PageType::Article => {
108            Some(("sentiment".to_string(), FieldType::Float, value))
109        }
110        FEAT_IMAGE_COUNT => Some(("image_count".to_string(), FieldType::Integer, value)),
111        FEAT_VIDEO_PRESENT if value > 0.5 => {
112            Some(("has_video".to_string(), FieldType::Bool, value))
113        }
114        // Navigation
115        FEAT_BREADCRUMB_DEPTH if value > 0.0 => {
116            Some(("breadcrumb_depth".to_string(), FieldType::Integer, value))
117        }
118        _ => None,
119    }
120}
121
122/// Infer a `CompiledSchema` from a SiteMap.
123///
124/// Walks every node, groups by page type / Schema.org type, infers fields,
125/// discovers relationships and actions, and returns the complete compiled schema.
126pub fn infer_schema(site_map: &SiteMap, domain: &str) -> CompiledSchema {
127    // Group nodes by their Schema.org type
128    let mut type_groups: HashMap<String, Vec<usize>> = HashMap::new();
129
130    for (idx, node) in site_map.nodes.iter().enumerate() {
131        // Only consider nodes with reasonable confidence
132        let confidence = node.confidence as f32 / 255.0;
133        if confidence < 0.3 {
134            continue;
135        }
136
137        if let Some(schema_type) = page_type_to_schema_org(node.page_type) {
138            type_groups
139                .entry(schema_type.to_string())
140                .or_default()
141                .push(idx);
142        }
143    }
144
145    // Build models from grouped nodes
146    let mut models: Vec<DataModel> = Vec::new();
147
148    for (schema_type, node_indices) in &type_groups {
149        if node_indices.is_empty() {
150            continue;
151        }
152
153        // Skip types with too few instances (likely noise) unless it's a singleton type
154        let singleton_types = [
155            "Cart",
156            "CheckoutPage",
157            "Account",
158            "LoginPage",
159            "WebSite",
160            "SearchResultsPage",
161            "Dashboard",
162        ];
163        if node_indices.len() < 2 && !singleton_types.contains(&schema_type.as_str()) {
164            continue;
165        }
166
167        // Collect field data across all instances
168        let mut field_occurrences: BTreeMap<String, Vec<(FieldType, f32, String)>> =
169            BTreeMap::new();
170        let mut example_urls: Vec<String> = Vec::new();
171        let mut list_url: Option<String> = None;
172
173        // Determine the representative PageType for this group
174        let representative_pt = site_map.nodes[node_indices[0]].page_type;
175
176        for &idx in node_indices {
177            // Collect example URLs (first 5)
178            if example_urls.len() < 5 && idx < site_map.urls.len() {
179                example_urls.push(site_map.urls[idx].clone());
180            }
181
182            // Always include url and node_id fields
183            if !field_occurrences.contains_key("url") {
184                field_occurrences.insert(
185                    "url".to_string(),
186                    vec![(FieldType::Url, 1.0, String::new())],
187                );
188            }
189
190            // Extract fields from feature vector
191            if idx < site_map.features.len() {
192                let features = &site_map.features[idx];
193                for (dim, &value) in features.iter().enumerate() {
194                    if let Some((field_name, field_type, val)) =
195                        feature_dim_to_field(dim, value, representative_pt)
196                    {
197                        let example = format_feature_value(dim, val);
198                        field_occurrences.entry(field_name).or_default().push((
199                            field_type,
200                            FieldSource::Inferred.default_confidence(),
201                            example,
202                        ));
203                    }
204                }
205
206                // Check for structured data richness → implies JSON-LD fields
207                if features[FEAT_HAS_STRUCTURED_DATA] > 0.5 {
208                    // Add standard Schema.org fields based on type
209                    add_schema_org_fields(schema_type, &mut field_occurrences);
210                }
211            }
212        }
213
214        // Detect listing page for this type
215        for (idx, node) in site_map.nodes.iter().enumerate() {
216            if node.page_type == PageType::ProductListing
217                && schema_type == "Product"
218                && idx < site_map.urls.len()
219            {
220                list_url = Some(site_map.urls[idx].clone());
221                break;
222            }
223        }
224
225        // Build model fields from occurrences
226        let total_instances = node_indices.len();
227        let mut fields: Vec<ModelField> = Vec::new();
228
229        // Always add url and node_id as first fields
230        fields.push(ModelField {
231            name: "url".to_string(),
232            field_type: FieldType::Url,
233            source: FieldSource::Inferred,
234            confidence: 1.0,
235            nullable: false,
236            example_values: example_urls.iter().take(3).cloned().collect(),
237            feature_dim: None,
238        });
239
240        fields.push(ModelField {
241            name: "node_id".to_string(),
242            field_type: FieldType::Integer,
243            source: FieldSource::Inferred,
244            confidence: 1.0,
245            nullable: false,
246            example_values: node_indices.iter().take(3).map(|i| i.to_string()).collect(),
247            feature_dim: None,
248        });
249
250        // Add "name" field for all types (inferred from structured data or title)
251        fields.push(ModelField {
252            name: "name".to_string(),
253            field_type: FieldType::String,
254            source: FieldSource::JsonLd,
255            confidence: 0.95,
256            nullable: false,
257            example_values: Vec::new(),
258            feature_dim: None,
259        });
260
261        for (field_name, occurrences) in &field_occurrences {
262            if field_name == "url" {
263                continue; // Already added
264            }
265
266            // Use the most common type
267            let field_type = occurrences[0].0.clone();
268
269            // Calculate confidence (average)
270            let avg_confidence =
271                occurrences.iter().map(|o| o.1).sum::<f32>() / occurrences.len() as f32;
272
273            // Calculate nullability
274            let nullable = occurrences.len() < total_instances;
275
276            // Collect unique example values (first 5)
277            let mut example_values: Vec<String> = Vec::new();
278            let mut seen: BTreeSet<String> = BTreeSet::new();
279            for o in occurrences {
280                if !o.2.is_empty() && seen.insert(o.2.clone()) {
281                    example_values.push(o.2.clone());
282                    if example_values.len() >= 5 {
283                        break;
284                    }
285                }
286            }
287
288            // Map field name to feature dimension
289            let feature_dim = field_name_to_dim(field_name);
290
291            fields.push(ModelField {
292                name: field_name.clone(),
293                field_type,
294                source: FieldSource::Inferred,
295                confidence: avg_confidence,
296                nullable,
297                example_values,
298                feature_dim,
299            });
300        }
301
302        let model_name = simplify_model_name(schema_type);
303
304        models.push(DataModel {
305            name: model_name,
306            schema_org_type: schema_type.clone(),
307            fields,
308            instance_count: total_instances,
309            example_urls,
310            search_action: None,
311            list_url,
312        });
313    }
314
315    // Sort models by instance count (descending) for better UX
316    models.sort_by(|a, b| b.instance_count.cmp(&a.instance_count));
317
318    // Infer relationships between models
319    let relationships = infer_relationships(site_map, &models);
320
321    // Compile actions
322    let actions = compile_actions(site_map, &models);
323
324    // Attach search actions to models
325    for action in &actions {
326        if action.name == "search" || action.name.ends_with("_search") {
327            for model in &mut models {
328                if action.belongs_to == model.name && model.search_action.is_none() {
329                    model.search_action = Some(action.clone());
330                }
331            }
332        }
333    }
334
335    // Compute stats
336    let total_fields: usize = models.iter().map(|m| m.fields.len()).sum();
337    let total_instances: usize = models.iter().map(|m| m.instance_count).sum();
338    let avg_confidence = if total_fields > 0 {
339        models
340            .iter()
341            .flat_map(|m| m.fields.iter().map(|f| f.confidence))
342            .sum::<f32>()
343            / total_fields as f32
344    } else {
345        0.0
346    };
347
348    CompiledSchema {
349        domain: domain.to_string(),
350        compiled_at: Utc::now(),
351        models: models.clone(),
352        actions,
353        relationships,
354        stats: SchemaStats {
355            total_models: models.len(),
356            total_fields,
357            total_instances,
358            avg_confidence,
359        },
360    }
361}
362
/// Turn a Schema.org type name into the shorter model name exposed to users.
///
/// Types listed in the rename table get their short form; any other input is returned
/// unchanged as an owned `String`.
fn simplify_model_name(schema_type: &str) -> String {
    // (Schema.org type, model name) pairs.
    const RENAMES: [(&str, &str); 11] = [
        ("FAQPage", "FAQ"),
        ("TechArticle", "Article"),
        ("MediaObject", "Media"),
        ("DiscussionForumPosting", "ForumPost"),
        ("SocialMediaPosting", "SocialPost"),
        ("ContactPoint", "Contact"),
        ("CheckoutPage", "Checkout"),
        ("LoginPage", "Auth"),
        ("WebSite", "Site"),
        ("SearchResultsPage", "SearchResults"),
        ("ProductListing", "Category"),
    ];

    RENAMES
        .iter()
        .find(|(from, _)| *from == schema_type)
        .map_or(schema_type, |&(_, to)| to)
        .to_string()
}
380
381/// Map field names back to feature vector dimensions.
382fn field_name_to_dim(name: &str) -> Option<usize> {
383    match name {
384        "price" => Some(FEAT_PRICE),
385        "original_price" => Some(FEAT_PRICE_ORIGINAL),
386        "discount_percent" => Some(FEAT_DISCOUNT_PCT),
387        "availability" => Some(FEAT_AVAILABILITY),
388        "rating" => Some(FEAT_RATING),
389        "review_count" => Some(FEAT_REVIEW_COUNT_LOG),
390        "review_sentiment" => Some(FEAT_REVIEW_SENTIMENT),
391        "free_shipping" => Some(FEAT_SHIPPING_FREE),
392        "shipping_speed_days" => Some(FEAT_SHIPPING_SPEED),
393        "seller_reputation" => Some(FEAT_SELLER_REPUTATION),
394        "variant_count" => Some(FEAT_VARIANT_COUNT),
395        "deal_score" => Some(FEAT_DEAL_SCORE),
396        "image_count" => Some(FEAT_IMAGE_COUNT),
397        _ => None,
398    }
399}
400
401/// Format a feature value as a human-readable example string.
402fn format_feature_value(dim: usize, value: f32) -> String {
403    match dim {
404        FEAT_PRICE | FEAT_PRICE_ORIGINAL => format!("{value:.2}"),
405        FEAT_DISCOUNT_PCT => format!("{:.0}%", value * 100.0),
406        FEAT_RATING => format!("{value:.1}"),
407        FEAT_REVIEW_COUNT_LOG => format!("{}", (10.0f32.powf(value)) as u64),
408        FEAT_AVAILABILITY => {
409            if value > 0.5 {
410                "in_stock".to_string()
411            } else {
412                "out_of_stock".to_string()
413            }
414        }
415        _ => format!("{value:.2}"),
416    }
417}
418
419/// Add standard Schema.org fields for known types when structured data is present.
420fn add_schema_org_fields(
421    schema_type: &str,
422    fields: &mut BTreeMap<String, Vec<(FieldType, f32, String)>>,
423) {
424    let schema_fields: &[(&str, FieldType)] = match schema_type {
425        "Product" => &[
426            ("brand", FieldType::String),
427            ("category", FieldType::String),
428            ("sku", FieldType::String),
429            ("image_url", FieldType::Url),
430            ("description", FieldType::String),
431            ("currency", FieldType::String),
432        ],
433        "Article" => &[
434            ("author", FieldType::String),
435            ("published_date", FieldType::DateTime),
436            ("category", FieldType::String),
437            ("image_url", FieldType::Url),
438            ("description", FieldType::String),
439        ],
440        "Organization" => &[
441            ("description", FieldType::String),
442            ("logo_url", FieldType::Url),
443            ("address", FieldType::String),
444            ("phone", FieldType::String),
445            ("email", FieldType::String),
446        ],
447        "Event" => &[
448            ("start_date", FieldType::DateTime),
449            ("end_date", FieldType::DateTime),
450            ("location", FieldType::String),
451            ("organizer", FieldType::String),
452            ("description", FieldType::String),
453        ],
454        "Review" => &[
455            ("author", FieldType::String),
456            ("body", FieldType::String),
457            ("date_published", FieldType::DateTime),
458        ],
459        "Offer" => &[
460            ("description", FieldType::String),
461            ("valid_from", FieldType::DateTime),
462            ("valid_through", FieldType::DateTime),
463        ],
464        _ => &[],
465    };
466
467    for (name, ftype) in schema_fields {
468        fields.entry(name.to_string()).or_default().push((
469            ftype.clone(),
470            FieldSource::JsonLd.default_confidence(),
471            String::new(),
472        ));
473    }
474}
475
476#[cfg(test)]
477mod tests {
478    use super::*;
479    use crate::map::builder::SiteMapBuilder;
480
481    fn build_test_sitemap() -> SiteMap {
482        let mut builder = SiteMapBuilder::new("shop.example.com");
483
484        // Home page
485        let mut home_feats = [0.0f32; FEATURE_DIM];
486        home_feats[FEAT_HAS_STRUCTURED_DATA] = 1.0;
487        home_feats[FEAT_PAGE_TYPE_CONFIDENCE] = 0.9;
488        builder.add_node("https://shop.example.com/", PageType::Home, home_feats, 240);
489
490        // Product listing
491        let mut listing_feats = [0.0f32; FEATURE_DIM];
492        listing_feats[FEAT_HAS_STRUCTURED_DATA] = 0.5;
493        builder.add_node(
494            "https://shop.example.com/category/electronics",
495            PageType::ProductListing,
496            listing_feats,
497            200,
498        );
499
500        // Several products
501        for i in 0..10 {
502            let mut feats = [0.0f32; FEATURE_DIM];
503            feats[FEAT_PRICE] = 100.0 + (i as f32 * 25.0);
504            feats[FEAT_PRICE_ORIGINAL] = 150.0 + (i as f32 * 25.0);
505            feats[FEAT_DISCOUNT_PCT] = 0.3;
506            feats[FEAT_AVAILABILITY] = if i % 3 == 0 { 0.0 } else { 1.0 };
507            feats[FEAT_RATING] = 3.5 + (i as f32 * 0.15);
508            feats[FEAT_REVIEW_COUNT_LOG] = 2.0 + (i as f32 * 0.1);
509            feats[FEAT_HAS_STRUCTURED_DATA] = 1.0;
510            feats[FEAT_SELLER_REPUTATION] = 0.8;
511            feats[FEAT_VARIANT_COUNT] = (2 + i % 5) as f32;
512            feats[FEAT_IMAGE_COUNT] = (3 + i % 4) as f32;
513
514            builder.add_node(
515                &format!("https://shop.example.com/product/{i}"),
516                PageType::ProductDetail,
517                feats,
518                220,
519            );
520
521            // Edges: listing → product
522            builder.add_edge(
523                1, // listing
524                2 + i as u32,
525                EdgeType::ContentLink,
526                1,
527                EdgeFlags::default(),
528            );
529        }
530
531        // Articles
532        for i in 0..5 {
533            let mut feats = [0.0f32; FEATURE_DIM];
534            feats[FEAT_TEXT_LENGTH_LOG] = 3.0 + i as f32 * 0.2;
535            feats[FEAT_READING_LEVEL] = 0.6;
536            feats[FEAT_RATING] = 4.0;
537            feats[FEAT_HAS_STRUCTURED_DATA] = 0.8;
538            feats[FEAT_IMAGE_COUNT] = 2.0;
539
540            builder.add_node(
541                &format!("https://shop.example.com/blog/{i}"),
542                PageType::Article,
543                feats,
544                180,
545            );
546        }
547
548        // Cart page
549        let cart_feats = [0.0f32; FEATURE_DIM];
550        builder.add_node(
551            "https://shop.example.com/cart",
552            PageType::Cart,
553            cart_feats,
554            200,
555        );
556
557        // Add some inter-type edges (product → product "related")
558        for i in 2..8 {
559            builder.add_edge(i, i + 1, EdgeType::Related, 2, EdgeFlags::default());
560        }
561
562        // Add actions on product pages
563        for i in 2..12 {
564            builder.add_action(i, OpCode::new(0x02, 0x00), -1, 0, 1); // add_to_cart
565        }
566
567        builder.build()
568    }
569
570    #[test]
571    fn test_infer_schema_discovers_models() {
572        let map = build_test_sitemap();
573        let schema = infer_schema(&map, "shop.example.com");
574
575        assert_eq!(schema.domain, "shop.example.com");
576        assert!(
577            schema.stats.total_models >= 2,
578            "should find Product and Article at minimum"
579        );
580
581        // Find the Product model
582        let product = schema.models.iter().find(|m| m.name == "Product");
583        assert!(product.is_some(), "should discover Product model");
584
585        let product = product.unwrap();
586        assert_eq!(product.instance_count, 10);
587        assert!(
588            product.fields.len() >= 5,
589            "Product should have several fields"
590        );
591
592        // Check specific fields exist
593        let field_names: Vec<&str> = product.fields.iter().map(|f| f.name.as_str()).collect();
594        assert!(field_names.contains(&"price"), "Product should have price");
595        assert!(
596            field_names.contains(&"rating"),
597            "Product should have rating"
598        );
599        assert!(field_names.contains(&"url"), "Product should have url");
600        assert!(
601            field_names.contains(&"node_id"),
602            "Product should have node_id"
603        );
604    }
605
606    #[test]
607    fn test_infer_schema_handles_empty_map() {
608        let builder = SiteMapBuilder::new("empty.com");
609        let map = builder.build();
610        let schema = infer_schema(&map, "empty.com");
611
612        assert_eq!(schema.stats.total_models, 0);
613        assert!(schema.models.is_empty());
614    }
615
616    #[test]
617    fn test_schema_field_nullability() {
618        let map = build_test_sitemap();
619        let schema = infer_schema(&map, "shop.example.com");
620
621        let product = schema.models.iter().find(|m| m.name == "Product").unwrap();
622
623        // url and node_id are not nullable
624        let url_field = product.fields.iter().find(|f| f.name == "url").unwrap();
625        assert!(!url_field.nullable);
626    }
627
628    #[test]
629    fn test_schema_stats() {
630        let map = build_test_sitemap();
631        let schema = infer_schema(&map, "shop.example.com");
632
633        assert!(schema.stats.total_models > 0);
634        assert!(schema.stats.total_fields > 0);
635        assert!(schema.stats.total_instances > 0);
636        assert!(schema.stats.avg_confidence > 0.0);
637        assert!(schema.stats.avg_confidence <= 1.0);
638    }
639
640    #[test]
641    fn test_page_type_to_schema_org_mapping() {
642        assert_eq!(
643            page_type_to_schema_org(PageType::ProductDetail),
644            Some("Product")
645        );
646        assert_eq!(page_type_to_schema_org(PageType::Article), Some("Article"));
647        assert_eq!(page_type_to_schema_org(PageType::Cart), Some("Cart"));
648        assert_eq!(page_type_to_schema_org(PageType::Unknown), None);
649        assert_eq!(page_type_to_schema_org(PageType::ErrorPage), None);
650    }
651
652    #[test]
653    fn test_simplify_model_name() {
654        assert_eq!(simplify_model_name("FAQPage"), "FAQ");
655        assert_eq!(simplify_model_name("Product"), "Product");
656        assert_eq!(simplify_model_name("TechArticle"), "Article");
657        assert_eq!(simplify_model_name("WebSite"), "Site");
658    }
659
660    // ── v4 Test Suite: Phase 1A — Schema Inference ──
661
662    /// Build a realistic e-commerce sitemap simulating amazon.com-like structure.
663    fn build_ecommerce_sitemap(domain: &str, product_count: usize) -> SiteMap {
664        let mut builder = SiteMapBuilder::new(domain);
665
666        // Home
667        let mut hf = [0.0f32; FEATURE_DIM];
668        hf[FEAT_HAS_STRUCTURED_DATA] = 1.0;
669        hf[FEAT_SEARCH_AVAILABLE] = 1.0;
670        builder.add_node(&format!("https://{domain}/"), PageType::Home, hf, 250);
671
672        // Search results
673        let mut sf = [0.0f32; FEATURE_DIM];
674        sf[FEAT_PAGINATION_PRESENT] = 1.0;
675        builder.add_node(
676            &format!("https://{domain}/search"),
677            PageType::SearchResults,
678            sf,
679            200,
680        );
681
682        // Category listing pages
683        for i in 0..3 {
684            let mut cf = [0.0f32; FEATURE_DIM];
685            cf[FEAT_PAGINATION_PRESENT] = 1.0;
686            cf[FEAT_FILTER_COUNT] = 5.0;
687            cf[FEAT_HAS_STRUCTURED_DATA] = 0.8;
688            builder.add_node(
689                &format!("https://{domain}/category/{i}"),
690                PageType::ProductListing,
691                cf,
692                200,
693            );
694        }
695
696        // Products
697        let cat_base = 2; // first category node index
698        for i in 0..product_count {
699            let mut pf = [0.0f32; FEATURE_DIM];
700            pf[FEAT_PRICE] = 10.0 + (i as f32 * 15.0);
701            pf[FEAT_PRICE_ORIGINAL] = 15.0 + (i as f32 * 15.0);
702            pf[FEAT_DISCOUNT_PCT] = 0.2;
703            pf[FEAT_AVAILABILITY] = if i % 5 == 0 { 0.0 } else { 1.0 };
704            pf[FEAT_RATING] = 2.5 + (i as f32 * 0.1).min(2.5);
705            pf[FEAT_REVIEW_COUNT_LOG] = 1.0 + (i as f32 * 0.05);
706            pf[FEAT_HAS_STRUCTURED_DATA] = 1.0;
707            pf[FEAT_SELLER_REPUTATION] = 0.75;
708            pf[FEAT_VARIANT_COUNT] = (1 + i % 6) as f32;
709            pf[FEAT_IMAGE_COUNT] = (2 + i % 5) as f32;
710            pf[FEAT_DEAL_SCORE] = (i as f32 * 0.05).min(1.0);
711            pf[FEAT_CATEGORY_PRICE_PERCENTILE] = i as f32 / product_count as f32;
712
713            let node = builder.add_node(
714                &format!("https://{domain}/product/{i}"),
715                PageType::ProductDetail,
716                pf,
717                210,
718            );
719
720            // Category → product edge
721            let cat = cat_base + (i % 3) as u32;
722            builder.add_edge(cat, node, EdgeType::ContentLink, 1, EdgeFlags::default());
723
724            // Add cart action
725            builder.add_action(node, OpCode::new(0x02, 0x00), -1, 0, 1);
726        }
727
728        // Cart
729        let cart_feats = [0.0f32; FEATURE_DIM];
730        builder.add_node(
731            &format!("https://{domain}/cart"),
732            PageType::Cart,
733            cart_feats,
734            200,
735        );
736
737        // Login
738        let login_feats = [0.0f32; FEATURE_DIM];
739        builder.add_node(
740            &format!("https://{domain}/login"),
741            PageType::Login,
742            login_feats,
743            200,
744        );
745
746        builder.build()
747    }
748
749    /// Build a news/article sitemap simulating bbc.com-like structure.
750    fn build_news_sitemap(domain: &str, article_count: usize) -> SiteMap {
751        let mut builder = SiteMapBuilder::new(domain);
752
753        let mut hf = [0.0f32; FEATURE_DIM];
754        hf[FEAT_HAS_STRUCTURED_DATA] = 1.0;
755        builder.add_node(&format!("https://{domain}/"), PageType::Home, hf, 250);
756
757        for i in 0..article_count {
758            let mut af = [0.0f32; FEATURE_DIM];
759            af[FEAT_TEXT_LENGTH_LOG] = 3.0 + (i as f32 * 0.1);
760            af[FEAT_READING_LEVEL] = 0.5 + (i as f32 * 0.02);
761            af[FEAT_SENTIMENT] = 0.3 + (i as f32 * 0.05);
762            af[FEAT_HAS_STRUCTURED_DATA] = 1.0;
763            af[FEAT_IMAGE_COUNT] = (1 + i % 5) as f32;
764
765            builder.add_node(
766                &format!("https://{domain}/article/{i}"),
767                PageType::Article,
768                af,
769                200,
770            );
771        }
772
773        builder.build()
774    }
775
776    #[test]
777    fn test_v4_schema_inference_ecommerce() {
778        let map = build_ecommerce_sitemap("amazon.example.com", 20);
779        let schema = infer_schema(&map, "amazon.example.com");
780
781        // Must find Product model
782        let product = schema.models.iter().find(|m| m.name == "Product");
783        assert!(product.is_some(), "Must discover Product model");
784        let product = product.unwrap();
785        assert_eq!(product.instance_count, 20);
786
787        // Product must have key fields
788        let field_names: Vec<&str> = product.fields.iter().map(|f| f.name.as_str()).collect();
789        assert!(field_names.contains(&"price"), "Product needs price field");
790        assert!(
791            field_names.contains(&"rating"),
792            "Product needs rating field"
793        );
794        assert!(
795            field_names.contains(&"availability"),
796            "Product needs availability"
797        );
798
799        // Price field should be Float type with feature_dim
800        let price = product.fields.iter().find(|f| f.name == "price").unwrap();
801        assert_eq!(price.field_type, FieldType::Float);
802        assert_eq!(price.feature_dim, Some(FEAT_PRICE));
803
804        // Must also find Site/Home model
805        let site = schema
806            .models
807            .iter()
808            .find(|m| m.schema_org_type == "WebSite");
809        assert!(
810            site.is_some(),
811            "Should discover WebSite model from Home page"
812        );
813    }
814
815    #[test]
816    fn test_v4_schema_inference_news() {
817        let map = build_news_sitemap("bbc.example.com", 15);
818        let schema = infer_schema(&map, "bbc.example.com");
819
820        let article = schema.models.iter().find(|m| m.name == "Article");
821        assert!(article.is_some(), "Must discover Article model");
822        let article = article.unwrap();
823        assert_eq!(article.instance_count, 15);
824
825        let field_names: Vec<&str> = article.fields.iter().map(|f| f.name.as_str()).collect();
826        assert!(
827            field_names.contains(&"word_count"),
828            "Article needs word_count"
829        );
830        assert!(
831            field_names.contains(&"reading_level"),
832            "Article needs reading_level"
833        );
834    }
835
836    #[test]
837    fn test_v4_schema_inference_multi_site() {
838        // Test that schema inference works across diverse site types
839        let sites: Vec<(&str, PageType, usize)> = vec![
840            ("recipes.example.com", PageType::Article, 10),
841            ("events.example.com", PageType::Calendar, 8),
842            ("docs.example.com", PageType::Documentation, 12),
843        ];
844
845        for (domain, page_type, count) in &sites {
846            let mut builder = SiteMapBuilder::new(domain);
847            let hf = [0.0f32; FEATURE_DIM];
848            builder.add_node(&format!("https://{domain}/"), PageType::Home, hf, 250);
849
850            for i in 0..*count {
851                let mut feats = [0.0f32; FEATURE_DIM];
852                feats[FEAT_HAS_STRUCTURED_DATA] = 0.9;
853                feats[FEAT_TEXT_LENGTH_LOG] = 2.0 + i as f32 * 0.1;
854                builder.add_node(
855                    &format!("https://{domain}/item/{i}"),
856                    *page_type,
857                    feats,
858                    200,
859                );
860            }
861
862            let map = builder.build();
863            let schema = infer_schema(&map, domain);
864            assert!(
865                schema.stats.total_models >= 1,
866                "{domain} should have at least 1 model"
867            );
868        }
869    }
870
871    #[test]
872    fn test_v4_schema_field_types_correct() {
873        let map = build_ecommerce_sitemap("typed.example.com", 10);
874        let schema = infer_schema(&map, "typed.example.com");
875
876        let product = schema.models.iter().find(|m| m.name == "Product").unwrap();
877
878        // Verify field type correctness
879        for field in &product.fields {
880            match field.name.as_str() {
881                "price"
882                | "original_price"
883                | "discount"
884                | "rating"
885                | "deal_score"
886                | "category_price_percentile" => {
887                    assert_eq!(
888                        field.field_type,
889                        FieldType::Float,
890                        "{} should be Float",
891                        field.name
892                    );
893                }
894                "image_count" | "review_count" | "variant_count" => {
895                    assert_eq!(
896                        field.field_type,
897                        FieldType::Integer,
898                        "{} should be Integer",
899                        field.name
900                    );
901                }
902                "url" => {
903                    assert_eq!(field.field_type, FieldType::Url, "url should be Url");
904                }
905                _ => {} // other fields are fine
906            }
907        }
908    }
909
910    #[test]
911    fn test_v4_schema_actions_discovered() {
912        let map = build_ecommerce_sitemap("actions.example.com", 10);
913        let schema = infer_schema(&map, "actions.example.com");
914
915        assert!(
916            !schema.actions.is_empty(),
917            "Should discover actions from opcodes"
918        );
919
920        // Should have add_to_cart action
921        let atc = schema.actions.iter().find(|a| a.name == "add_to_cart");
922        assert!(atc.is_some(), "Should find add_to_cart action");
923    }
924
925    #[test]
926    fn test_v4_schema_confidence_ranges() {
927        let map = build_ecommerce_sitemap("conf.example.com", 10);
928        let schema = infer_schema(&map, "conf.example.com");
929
930        for model in &schema.models {
931            for field in &model.fields {
932                assert!(
933                    field.confidence > 0.0 && field.confidence <= 1.0,
934                    "Field {} confidence should be in (0,1], got {}",
935                    field.name,
936                    field.confidence
937                );
938            }
939        }
940
941        assert!(
942            schema.stats.avg_confidence > 0.0 && schema.stats.avg_confidence <= 1.0,
943            "Avg confidence should be in (0,1]"
944        );
945    }
946}