1use crate::compiler::actions::compile_actions;
7use crate::compiler::models::*;
8use crate::compiler::relationships::infer_relationships;
9use crate::map::types::*;
10use chrono::Utc;
11use std::collections::{BTreeMap, BTreeSet, HashMap};
12
13fn page_type_to_schema_org(pt: PageType) -> Option<&'static str> {
15 match pt {
16 PageType::ProductDetail => Some("Product"),
17 PageType::ProductListing => Some("ProductListing"),
18 PageType::Article => Some("Article"),
19 PageType::ReviewList => Some("Review"),
20 PageType::Faq => Some("FAQPage"),
21 PageType::AboutPage => Some("Organization"),
22 PageType::ContactPage => Some("ContactPoint"),
23 PageType::PricingPage => Some("Offer"),
24 PageType::Documentation => Some("TechArticle"),
25 PageType::MediaPage => Some("MediaObject"),
26 PageType::Forum => Some("DiscussionForumPosting"),
27 PageType::SocialFeed => Some("SocialMediaPosting"),
28 PageType::Calendar => Some("Event"),
29 PageType::Cart => Some("Cart"),
30 PageType::Checkout => Some("CheckoutPage"),
31 PageType::Account => Some("Account"),
32 PageType::Login => Some("LoginPage"),
33 PageType::Home => Some("WebSite"),
34 PageType::SearchResults => Some("SearchResultsPage"),
35 PageType::Dashboard => Some("Dashboard"),
36 _ => None,
37 }
38}
39
40fn feature_dim_to_field(dim: usize, value: f32, pt: PageType) -> Option<(String, FieldType, f32)> {
42 if value == 0.0 {
44 return None;
45 }
46
47 match dim {
48 FEAT_PRICE if pt == PageType::ProductDetail || pt == PageType::PricingPage => {
50 Some(("price".to_string(), FieldType::Float, value))
51 }
52 FEAT_PRICE_ORIGINAL if pt == PageType::ProductDetail => {
53 Some(("original_price".to_string(), FieldType::Float, value))
54 }
55 FEAT_DISCOUNT_PCT if pt == PageType::ProductDetail => {
56 Some(("discount_percent".to_string(), FieldType::Float, value))
57 }
58 FEAT_AVAILABILITY if pt == PageType::ProductDetail => Some((
59 "availability".to_string(),
60 FieldType::Enum(vec![
61 "in_stock".to_string(),
62 "out_of_stock".to_string(),
63 "preorder".to_string(),
64 ]),
65 value,
66 )),
67 FEAT_RATING
68 if pt == PageType::ProductDetail
69 || pt == PageType::ReviewList
70 || pt == PageType::Article =>
71 {
72 Some(("rating".to_string(), FieldType::Float, value))
73 }
74 FEAT_REVIEW_COUNT_LOG if pt == PageType::ProductDetail || pt == PageType::ReviewList => {
75 Some(("review_count".to_string(), FieldType::Integer, value))
76 }
77 FEAT_REVIEW_SENTIMENT if pt == PageType::ProductDetail || pt == PageType::ReviewList => {
78 Some(("review_sentiment".to_string(), FieldType::Float, value))
79 }
80 FEAT_SHIPPING_FREE if pt == PageType::ProductDetail => {
81 Some(("free_shipping".to_string(), FieldType::Bool, value))
82 }
83 FEAT_SHIPPING_SPEED if pt == PageType::ProductDetail => {
84 Some(("shipping_speed_days".to_string(), FieldType::Integer, value))
85 }
86 FEAT_SELLER_REPUTATION if pt == PageType::ProductDetail => {
87 Some(("seller_reputation".to_string(), FieldType::Float, value))
88 }
89 FEAT_VARIANT_COUNT if pt == PageType::ProductDetail => {
90 Some(("variant_count".to_string(), FieldType::Integer, value))
91 }
92 FEAT_DEAL_SCORE if pt == PageType::ProductDetail => {
93 Some(("deal_score".to_string(), FieldType::Float, value))
94 }
95 FEAT_CATEGORY_PRICE_PERCENTILE if pt == PageType::ProductDetail => Some((
96 "category_price_percentile".to_string(),
97 FieldType::Float,
98 value,
99 )),
100 FEAT_TEXT_LENGTH_LOG if pt == PageType::Article || pt == PageType::Documentation => {
102 Some(("word_count".to_string(), FieldType::Integer, value))
103 }
104 FEAT_READING_LEVEL if pt == PageType::Article || pt == PageType::Documentation => {
105 Some(("reading_level".to_string(), FieldType::Float, value))
106 }
107 FEAT_SENTIMENT if pt == PageType::Article => {
108 Some(("sentiment".to_string(), FieldType::Float, value))
109 }
110 FEAT_IMAGE_COUNT => Some(("image_count".to_string(), FieldType::Integer, value)),
111 FEAT_VIDEO_PRESENT if value > 0.5 => {
112 Some(("has_video".to_string(), FieldType::Bool, value))
113 }
114 FEAT_BREADCRUMB_DEPTH if value > 0.0 => {
116 Some(("breadcrumb_depth".to_string(), FieldType::Integer, value))
117 }
118 _ => None,
119 }
120}
121
122pub fn infer_schema(site_map: &SiteMap, domain: &str) -> CompiledSchema {
127 let mut type_groups: HashMap<String, Vec<usize>> = HashMap::new();
129
130 for (idx, node) in site_map.nodes.iter().enumerate() {
131 let confidence = node.confidence as f32 / 255.0;
133 if confidence < 0.3 {
134 continue;
135 }
136
137 if let Some(schema_type) = page_type_to_schema_org(node.page_type) {
138 type_groups
139 .entry(schema_type.to_string())
140 .or_default()
141 .push(idx);
142 }
143 }
144
145 let mut models: Vec<DataModel> = Vec::new();
147
148 for (schema_type, node_indices) in &type_groups {
149 if node_indices.is_empty() {
150 continue;
151 }
152
153 let singleton_types = [
155 "Cart",
156 "CheckoutPage",
157 "Account",
158 "LoginPage",
159 "WebSite",
160 "SearchResultsPage",
161 "Dashboard",
162 ];
163 if node_indices.len() < 2 && !singleton_types.contains(&schema_type.as_str()) {
164 continue;
165 }
166
167 let mut field_occurrences: BTreeMap<String, Vec<(FieldType, f32, String)>> =
169 BTreeMap::new();
170 let mut example_urls: Vec<String> = Vec::new();
171 let mut list_url: Option<String> = None;
172
173 let representative_pt = site_map.nodes[node_indices[0]].page_type;
175
176 for &idx in node_indices {
177 if example_urls.len() < 5 && idx < site_map.urls.len() {
179 example_urls.push(site_map.urls[idx].clone());
180 }
181
182 if !field_occurrences.contains_key("url") {
184 field_occurrences.insert(
185 "url".to_string(),
186 vec![(FieldType::Url, 1.0, String::new())],
187 );
188 }
189
190 if idx < site_map.features.len() {
192 let features = &site_map.features[idx];
193 for (dim, &value) in features.iter().enumerate() {
194 if let Some((field_name, field_type, val)) =
195 feature_dim_to_field(dim, value, representative_pt)
196 {
197 let example = format_feature_value(dim, val);
198 field_occurrences.entry(field_name).or_default().push((
199 field_type,
200 FieldSource::Inferred.default_confidence(),
201 example,
202 ));
203 }
204 }
205
206 if features[FEAT_HAS_STRUCTURED_DATA] > 0.5 {
208 add_schema_org_fields(schema_type, &mut field_occurrences);
210 }
211 }
212 }
213
214 for (idx, node) in site_map.nodes.iter().enumerate() {
216 if node.page_type == PageType::ProductListing
217 && schema_type == "Product"
218 && idx < site_map.urls.len()
219 {
220 list_url = Some(site_map.urls[idx].clone());
221 break;
222 }
223 }
224
225 let total_instances = node_indices.len();
227 let mut fields: Vec<ModelField> = Vec::new();
228
229 fields.push(ModelField {
231 name: "url".to_string(),
232 field_type: FieldType::Url,
233 source: FieldSource::Inferred,
234 confidence: 1.0,
235 nullable: false,
236 example_values: example_urls.iter().take(3).cloned().collect(),
237 feature_dim: None,
238 });
239
240 fields.push(ModelField {
241 name: "node_id".to_string(),
242 field_type: FieldType::Integer,
243 source: FieldSource::Inferred,
244 confidence: 1.0,
245 nullable: false,
246 example_values: node_indices.iter().take(3).map(|i| i.to_string()).collect(),
247 feature_dim: None,
248 });
249
250 fields.push(ModelField {
252 name: "name".to_string(),
253 field_type: FieldType::String,
254 source: FieldSource::JsonLd,
255 confidence: 0.95,
256 nullable: false,
257 example_values: Vec::new(),
258 feature_dim: None,
259 });
260
261 for (field_name, occurrences) in &field_occurrences {
262 if field_name == "url" {
263 continue; }
265
266 let field_type = occurrences[0].0.clone();
268
269 let avg_confidence =
271 occurrences.iter().map(|o| o.1).sum::<f32>() / occurrences.len() as f32;
272
273 let nullable = occurrences.len() < total_instances;
275
276 let mut example_values: Vec<String> = Vec::new();
278 let mut seen: BTreeSet<String> = BTreeSet::new();
279 for o in occurrences {
280 if !o.2.is_empty() && seen.insert(o.2.clone()) {
281 example_values.push(o.2.clone());
282 if example_values.len() >= 5 {
283 break;
284 }
285 }
286 }
287
288 let feature_dim = field_name_to_dim(field_name);
290
291 fields.push(ModelField {
292 name: field_name.clone(),
293 field_type,
294 source: FieldSource::Inferred,
295 confidence: avg_confidence,
296 nullable,
297 example_values,
298 feature_dim,
299 });
300 }
301
302 let model_name = simplify_model_name(schema_type);
303
304 models.push(DataModel {
305 name: model_name,
306 schema_org_type: schema_type.clone(),
307 fields,
308 instance_count: total_instances,
309 example_urls,
310 search_action: None,
311 list_url,
312 });
313 }
314
315 models.sort_by(|a, b| b.instance_count.cmp(&a.instance_count));
317
318 let relationships = infer_relationships(site_map, &models);
320
321 let actions = compile_actions(site_map, &models);
323
324 for action in &actions {
326 if action.name == "search" || action.name.ends_with("_search") {
327 for model in &mut models {
328 if action.belongs_to == model.name && model.search_action.is_none() {
329 model.search_action = Some(action.clone());
330 }
331 }
332 }
333 }
334
335 let total_fields: usize = models.iter().map(|m| m.fields.len()).sum();
337 let total_instances: usize = models.iter().map(|m| m.instance_count).sum();
338 let avg_confidence = if total_fields > 0 {
339 models
340 .iter()
341 .flat_map(|m| m.fields.iter().map(|f| f.confidence))
342 .sum::<f32>()
343 / total_fields as f32
344 } else {
345 0.0
346 };
347
348 CompiledSchema {
349 domain: domain.to_string(),
350 compiled_at: Utc::now(),
351 models: models.clone(),
352 actions,
353 relationships,
354 stats: SchemaStats {
355 total_models: models.len(),
356 total_fields,
357 total_instances,
358 avg_confidence,
359 },
360 }
361}
362
/// Shortens a schema-type label into the model name exposed to users.
///
/// Labels listed in the rename table are replaced by their short form; any
/// other label is returned unchanged. Matching is exact and case-sensitive.
fn simplify_model_name(schema_type: &str) -> String {
    // (schema-type label, model name)
    const RENAMES: [(&str, &str); 11] = [
        ("FAQPage", "FAQ"),
        ("TechArticle", "Article"),
        ("MediaObject", "Media"),
        ("DiscussionForumPosting", "ForumPost"),
        ("SocialMediaPosting", "SocialPost"),
        ("ContactPoint", "Contact"),
        ("CheckoutPage", "Checkout"),
        ("LoginPage", "Auth"),
        ("WebSite", "Site"),
        ("SearchResultsPage", "SearchResults"),
        ("ProductListing", "Category"),
    ];

    for (label, short) in RENAMES.iter() {
        if *label == schema_type {
            return (*short).to_string();
        }
    }

    schema_type.to_string()
}
380
381fn field_name_to_dim(name: &str) -> Option<usize> {
383 match name {
384 "price" => Some(FEAT_PRICE),
385 "original_price" => Some(FEAT_PRICE_ORIGINAL),
386 "discount_percent" => Some(FEAT_DISCOUNT_PCT),
387 "availability" => Some(FEAT_AVAILABILITY),
388 "rating" => Some(FEAT_RATING),
389 "review_count" => Some(FEAT_REVIEW_COUNT_LOG),
390 "review_sentiment" => Some(FEAT_REVIEW_SENTIMENT),
391 "free_shipping" => Some(FEAT_SHIPPING_FREE),
392 "shipping_speed_days" => Some(FEAT_SHIPPING_SPEED),
393 "seller_reputation" => Some(FEAT_SELLER_REPUTATION),
394 "variant_count" => Some(FEAT_VARIANT_COUNT),
395 "deal_score" => Some(FEAT_DEAL_SCORE),
396 "image_count" => Some(FEAT_IMAGE_COUNT),
397 _ => None,
398 }
399}
400
401fn format_feature_value(dim: usize, value: f32) -> String {
403 match dim {
404 FEAT_PRICE | FEAT_PRICE_ORIGINAL => format!("{value:.2}"),
405 FEAT_DISCOUNT_PCT => format!("{:.0}%", value * 100.0),
406 FEAT_RATING => format!("{value:.1}"),
407 FEAT_REVIEW_COUNT_LOG => format!("{}", (10.0f32.powf(value)) as u64),
408 FEAT_AVAILABILITY => {
409 if value > 0.5 {
410 "in_stock".to_string()
411 } else {
412 "out_of_stock".to_string()
413 }
414 }
415 _ => format!("{value:.2}"),
416 }
417}
418
419fn add_schema_org_fields(
421 schema_type: &str,
422 fields: &mut BTreeMap<String, Vec<(FieldType, f32, String)>>,
423) {
424 let schema_fields: &[(&str, FieldType)] = match schema_type {
425 "Product" => &[
426 ("brand", FieldType::String),
427 ("category", FieldType::String),
428 ("sku", FieldType::String),
429 ("image_url", FieldType::Url),
430 ("description", FieldType::String),
431 ("currency", FieldType::String),
432 ],
433 "Article" => &[
434 ("author", FieldType::String),
435 ("published_date", FieldType::DateTime),
436 ("category", FieldType::String),
437 ("image_url", FieldType::Url),
438 ("description", FieldType::String),
439 ],
440 "Organization" => &[
441 ("description", FieldType::String),
442 ("logo_url", FieldType::Url),
443 ("address", FieldType::String),
444 ("phone", FieldType::String),
445 ("email", FieldType::String),
446 ],
447 "Event" => &[
448 ("start_date", FieldType::DateTime),
449 ("end_date", FieldType::DateTime),
450 ("location", FieldType::String),
451 ("organizer", FieldType::String),
452 ("description", FieldType::String),
453 ],
454 "Review" => &[
455 ("author", FieldType::String),
456 ("body", FieldType::String),
457 ("date_published", FieldType::DateTime),
458 ],
459 "Offer" => &[
460 ("description", FieldType::String),
461 ("valid_from", FieldType::DateTime),
462 ("valid_through", FieldType::DateTime),
463 ],
464 _ => &[],
465 };
466
467 for (name, ftype) in schema_fields {
468 fields.entry(name.to_string()).or_default().push((
469 ftype.clone(),
470 FieldSource::JsonLd.default_confidence(),
471 String::new(),
472 ));
473 }
474}
475
#[cfg(test)]
mod tests {
    use super::*;
    use crate::map::builder::SiteMapBuilder;

    /// Builds a small shop map: a home page, one product listing, ten product
    /// pages, five blog articles and a cart.
    ///
    /// The hard-coded node ids used for edges and actions assume the builder
    /// numbers nodes sequentially from 0 in insertion order (home = 0,
    /// listing = 1, products = 2..=11).
    fn build_test_sitemap() -> SiteMap {
        let mut builder = SiteMapBuilder::new("shop.example.com");

        // Home page.
        let mut home_feats = [0.0f32; FEATURE_DIM];
        home_feats[FEAT_HAS_STRUCTURED_DATA] = 1.0;
        home_feats[FEAT_PAGE_TYPE_CONFIDENCE] = 0.9;
        builder.add_node("https://shop.example.com/", PageType::Home, home_feats, 240);

        // Single product listing page.
        let mut listing_feats = [0.0f32; FEATURE_DIM];
        listing_feats[FEAT_HAS_STRUCTURED_DATA] = 0.5;
        builder.add_node(
            "https://shop.example.com/category/electronics",
            PageType::ProductListing,
            listing_feats,
            200,
        );

        // Ten product pages with varying prices, ratings and availability.
        for i in 0..10 {
            let mut feats = [0.0f32; FEATURE_DIM];
            feats[FEAT_PRICE] = 100.0 + (i as f32 * 25.0);
            feats[FEAT_PRICE_ORIGINAL] = 150.0 + (i as f32 * 25.0);
            feats[FEAT_DISCOUNT_PCT] = 0.3;
            feats[FEAT_AVAILABILITY] = if i % 3 == 0 { 0.0 } else { 1.0 };
            feats[FEAT_RATING] = 3.5 + (i as f32 * 0.15);
            feats[FEAT_REVIEW_COUNT_LOG] = 2.0 + (i as f32 * 0.1);
            feats[FEAT_HAS_STRUCTURED_DATA] = 1.0;
            feats[FEAT_SELLER_REPUTATION] = 0.8;
            feats[FEAT_VARIANT_COUNT] = (2 + i % 5) as f32;
            feats[FEAT_IMAGE_COUNT] = (3 + i % 4) as f32;

            builder.add_node(
                &format!("https://shop.example.com/product/{i}"),
                PageType::ProductDetail,
                feats,
                220,
            );

            // Link the listing page (node 1) to this product.
            builder.add_edge(
                1,
                2 + i as u32,
                EdgeType::ContentLink,
                1,
                EdgeFlags::default(),
            );
        }

        // Five blog articles.
        for i in 0..5 {
            let mut feats = [0.0f32; FEATURE_DIM];
            feats[FEAT_TEXT_LENGTH_LOG] = 3.0 + i as f32 * 0.2;
            feats[FEAT_READING_LEVEL] = 0.6;
            feats[FEAT_RATING] = 4.0;
            feats[FEAT_HAS_STRUCTURED_DATA] = 0.8;
            feats[FEAT_IMAGE_COUNT] = 2.0;

            builder.add_node(
                &format!("https://shop.example.com/blog/{i}"),
                PageType::Article,
                feats,
                180,
            );
        }

        // Cart page with no features set.
        let cart_feats = [0.0f32; FEATURE_DIM];
        builder.add_node(
            "https://shop.example.com/cart",
            PageType::Cart,
            cart_feats,
            200,
        );

        // "Related" edges chaining some of the products together.
        for i in 2..8 {
            builder.add_edge(i, i + 1, EdgeType::Related, 2, EdgeFlags::default());
        }

        // One action per product node.
        for i in 2..12 {
            builder.add_action(i, OpCode::new(0x02, 0x00), -1, 0, 1);
        }

        builder.build()
    }

    #[test]
    fn test_infer_schema_discovers_models() {
        let map = build_test_sitemap();
        let schema = infer_schema(&map, "shop.example.com");

        assert_eq!(schema.domain, "shop.example.com");
        assert!(
            schema.stats.total_models >= 2,
            "should find Product and Article at minimum"
        );

        let product = schema.models.iter().find(|m| m.name == "Product");
        assert!(product.is_some(), "should discover Product model");

        let product = product.unwrap();
        assert_eq!(product.instance_count, 10);
        assert!(
            product.fields.len() >= 5,
            "Product should have several fields"
        );

        let field_names: Vec<&str> = product.fields.iter().map(|f| f.name.as_str()).collect();
        assert!(field_names.contains(&"price"), "Product should have price");
        assert!(
            field_names.contains(&"rating"),
            "Product should have rating"
        );
        assert!(field_names.contains(&"url"), "Product should have url");
        assert!(
            field_names.contains(&"node_id"),
            "Product should have node_id"
        );
    }

    #[test]
    fn test_infer_schema_handles_empty_map() {
        let builder = SiteMapBuilder::new("empty.com");
        let map = builder.build();
        let schema = infer_schema(&map, "empty.com");

        assert_eq!(schema.stats.total_models, 0);
        assert!(schema.models.is_empty());
    }

    #[test]
    fn test_schema_field_nullability() {
        let map = build_test_sitemap();
        let schema = infer_schema(&map, "shop.example.com");

        let product = schema.models.iter().find(|m| m.name == "Product").unwrap();

        // The url field is always emitted as non-nullable.
        let url_field = product.fields.iter().find(|f| f.name == "url").unwrap();
        assert!(!url_field.nullable);
    }

    #[test]
    fn test_schema_stats() {
        let map = build_test_sitemap();
        let schema = infer_schema(&map, "shop.example.com");

        assert!(schema.stats.total_models > 0);
        assert!(schema.stats.total_fields > 0);
        assert!(schema.stats.total_instances > 0);
        assert!(schema.stats.avg_confidence > 0.0);
        assert!(schema.stats.avg_confidence <= 1.0);
    }

    #[test]
    fn test_page_type_to_schema_org_mapping() {
        assert_eq!(
            page_type_to_schema_org(PageType::ProductDetail),
            Some("Product")
        );
        assert_eq!(page_type_to_schema_org(PageType::Article), Some("Article"));
        assert_eq!(page_type_to_schema_org(PageType::Cart), Some("Cart"));
        assert_eq!(page_type_to_schema_org(PageType::Unknown), None);
        assert_eq!(page_type_to_schema_org(PageType::ErrorPage), None);
    }

    #[test]
    fn test_simplify_model_name() {
        assert_eq!(simplify_model_name("FAQPage"), "FAQ");
        assert_eq!(simplify_model_name("Product"), "Product");
        assert_eq!(simplify_model_name("TechArticle"), "Article");
        assert_eq!(simplify_model_name("WebSite"), "Site");
    }

    /// Builds an e-commerce map for `domain`: home, search results, three
    /// category listings, `product_count` products, a cart and a login page.
    ///
    /// `cat_base` assumes the builder numbers nodes sequentially from 0, so
    /// the three categories are nodes 2, 3 and 4.
    fn build_ecommerce_sitemap(domain: &str, product_count: usize) -> SiteMap {
        let mut builder = SiteMapBuilder::new(domain);

        // Home page.
        let mut hf = [0.0f32; FEATURE_DIM];
        hf[FEAT_HAS_STRUCTURED_DATA] = 1.0;
        hf[FEAT_SEARCH_AVAILABLE] = 1.0;
        builder.add_node(&format!("https://{domain}/"), PageType::Home, hf, 250);

        // Search results page.
        let mut sf = [0.0f32; FEATURE_DIM];
        sf[FEAT_PAGINATION_PRESENT] = 1.0;
        builder.add_node(
            &format!("https://{domain}/search"),
            PageType::SearchResults,
            sf,
            200,
        );

        // Three category listing pages.
        for i in 0..3 {
            let mut cf = [0.0f32; FEATURE_DIM];
            cf[FEAT_PAGINATION_PRESENT] = 1.0;
            cf[FEAT_FILTER_COUNT] = 5.0;
            cf[FEAT_HAS_STRUCTURED_DATA] = 0.8;
            builder.add_node(
                &format!("https://{domain}/category/{i}"),
                PageType::ProductListing,
                cf,
                200,
            );
        }

        // Product pages, spread round-robin over the three categories.
        let cat_base = 2;
        for i in 0..product_count {
            let mut pf = [0.0f32; FEATURE_DIM];
            pf[FEAT_PRICE] = 10.0 + (i as f32 * 15.0);
            pf[FEAT_PRICE_ORIGINAL] = 15.0 + (i as f32 * 15.0);
            pf[FEAT_DISCOUNT_PCT] = 0.2;
            pf[FEAT_AVAILABILITY] = if i % 5 == 0 { 0.0 } else { 1.0 };
            pf[FEAT_RATING] = 2.5 + (i as f32 * 0.1).min(2.5);
            pf[FEAT_REVIEW_COUNT_LOG] = 1.0 + (i as f32 * 0.05);
            pf[FEAT_HAS_STRUCTURED_DATA] = 1.0;
            pf[FEAT_SELLER_REPUTATION] = 0.75;
            pf[FEAT_VARIANT_COUNT] = (1 + i % 6) as f32;
            pf[FEAT_IMAGE_COUNT] = (2 + i % 5) as f32;
            pf[FEAT_DEAL_SCORE] = (i as f32 * 0.05).min(1.0);
            pf[FEAT_CATEGORY_PRICE_PERCENTILE] = i as f32 / product_count as f32;

            let node = builder.add_node(
                &format!("https://{domain}/product/{i}"),
                PageType::ProductDetail,
                pf,
                210,
            );

            let cat = cat_base + (i % 3) as u32;
            builder.add_edge(cat, node, EdgeType::ContentLink, 1, EdgeFlags::default());

            // One action per product; the actions test expects it to surface
            // as `add_to_cart`.
            builder.add_action(node, OpCode::new(0x02, 0x00), -1, 0, 1);
        }

        // Cart page.
        let cart_feats = [0.0f32; FEATURE_DIM];
        builder.add_node(
            &format!("https://{domain}/cart"),
            PageType::Cart,
            cart_feats,
            200,
        );

        // Login page.
        let login_feats = [0.0f32; FEATURE_DIM];
        builder.add_node(
            &format!("https://{domain}/login"),
            PageType::Login,
            login_feats,
            200,
        );

        builder.build()
    }

    /// Builds a news map for `domain`: a home page plus `article_count`
    /// articles carrying text-length, reading-level and sentiment features.
    fn build_news_sitemap(domain: &str, article_count: usize) -> SiteMap {
        let mut builder = SiteMapBuilder::new(domain);

        let mut hf = [0.0f32; FEATURE_DIM];
        hf[FEAT_HAS_STRUCTURED_DATA] = 1.0;
        builder.add_node(&format!("https://{domain}/"), PageType::Home, hf, 250);

        for i in 0..article_count {
            let mut af = [0.0f32; FEATURE_DIM];
            af[FEAT_TEXT_LENGTH_LOG] = 3.0 + (i as f32 * 0.1);
            af[FEAT_READING_LEVEL] = 0.5 + (i as f32 * 0.02);
            af[FEAT_SENTIMENT] = 0.3 + (i as f32 * 0.05);
            af[FEAT_HAS_STRUCTURED_DATA] = 1.0;
            af[FEAT_IMAGE_COUNT] = (1 + i % 5) as f32;

            builder.add_node(
                &format!("https://{domain}/article/{i}"),
                PageType::Article,
                af,
                200,
            );
        }

        builder.build()
    }

    #[test]
    fn test_v4_schema_inference_ecommerce() {
        let map = build_ecommerce_sitemap("amazon.example.com", 20);
        let schema = infer_schema(&map, "amazon.example.com");

        let product = schema.models.iter().find(|m| m.name == "Product");
        assert!(product.is_some(), "Must discover Product model");
        let product = product.unwrap();
        assert_eq!(product.instance_count, 20);

        let field_names: Vec<&str> = product.fields.iter().map(|f| f.name.as_str()).collect();
        assert!(field_names.contains(&"price"), "Product needs price field");
        assert!(
            field_names.contains(&"rating"),
            "Product needs rating field"
        );
        assert!(
            field_names.contains(&"availability"),
            "Product needs availability"
        );

        let price = product.fields.iter().find(|f| f.name == "price").unwrap();
        assert_eq!(price.field_type, FieldType::Float);
        assert_eq!(price.feature_dim, Some(FEAT_PRICE));

        // The single Home page is kept as a singleton WebSite model.
        let site = schema
            .models
            .iter()
            .find(|m| m.schema_org_type == "WebSite");
        assert!(
            site.is_some(),
            "Should discover WebSite model from Home page"
        );
    }

    #[test]
    fn test_v4_schema_inference_news() {
        let map = build_news_sitemap("bbc.example.com", 15);
        let schema = infer_schema(&map, "bbc.example.com");

        let article = schema.models.iter().find(|m| m.name == "Article");
        assert!(article.is_some(), "Must discover Article model");
        let article = article.unwrap();
        assert_eq!(article.instance_count, 15);

        let field_names: Vec<&str> = article.fields.iter().map(|f| f.name.as_str()).collect();
        assert!(
            field_names.contains(&"word_count"),
            "Article needs word_count"
        );
        assert!(
            field_names.contains(&"reading_level"),
            "Article needs reading_level"
        );
    }

    #[test]
    fn test_v4_schema_inference_multi_site() {
        // (domain, page type of the content pages, number of content pages)
        let sites: Vec<(&str, PageType, usize)> = vec![
            ("recipes.example.com", PageType::Article, 10),
            ("events.example.com", PageType::Calendar, 8),
            ("docs.example.com", PageType::Documentation, 12),
        ];

        for (domain, page_type, count) in &sites {
            let mut builder = SiteMapBuilder::new(domain);
            let hf = [0.0f32; FEATURE_DIM];
            builder.add_node(&format!("https://{domain}/"), PageType::Home, hf, 250);

            for i in 0..*count {
                let mut feats = [0.0f32; FEATURE_DIM];
                feats[FEAT_HAS_STRUCTURED_DATA] = 0.9;
                feats[FEAT_TEXT_LENGTH_LOG] = 2.0 + i as f32 * 0.1;
                builder.add_node(
                    &format!("https://{domain}/item/{i}"),
                    *page_type,
                    feats,
                    200,
                );
            }

            let map = builder.build();
            let schema = infer_schema(&map, domain);
            assert!(
                schema.stats.total_models >= 1,
                "{domain} should have at least 1 model"
            );
            // The Home page alone satisfies the check above, so also require
            // a model covering all of the content pages.
            assert!(
                schema.models.iter().any(|m| m.instance_count == *count),
                "{domain} should have a model with {count} instances"
            );
        }
    }

    #[test]
    fn test_v4_schema_field_types_correct() {
        let map = build_ecommerce_sitemap("typed.example.com", 10);
        let schema = infer_schema(&map, "typed.example.com");

        let product = schema.models.iter().find(|m| m.name == "Product").unwrap();

        for field in &product.fields {
            match field.name.as_str() {
                // The discount field is emitted as `discount_percent`.
                "price"
                | "original_price"
                | "discount_percent"
                | "rating"
                | "deal_score"
                | "category_price_percentile" => {
                    assert_eq!(
                        field.field_type,
                        FieldType::Float,
                        "{} should be Float",
                        field.name
                    );
                }
                "image_count" | "review_count" | "variant_count" => {
                    assert_eq!(
                        field.field_type,
                        FieldType::Integer,
                        "{} should be Integer",
                        field.name
                    );
                }
                "url" => {
                    assert_eq!(field.field_type, FieldType::Url, "url should be Url");
                }
                // Remaining fields are not type-checked by this test.
                _ => {}
            }
        }
    }

    #[test]
    fn test_v4_schema_actions_discovered() {
        let map = build_ecommerce_sitemap("actions.example.com", 10);
        let schema = infer_schema(&map, "actions.example.com");

        assert!(
            !schema.actions.is_empty(),
            "Should discover actions from opcodes"
        );

        let atc = schema.actions.iter().find(|a| a.name == "add_to_cart");
        assert!(atc.is_some(), "Should find add_to_cart action");
    }

    #[test]
    fn test_v4_schema_confidence_ranges() {
        let map = build_ecommerce_sitemap("conf.example.com", 10);
        let schema = infer_schema(&map, "conf.example.com");

        for model in &schema.models {
            for field in &model.fields {
                assert!(
                    field.confidence > 0.0 && field.confidence <= 1.0,
                    "Field {} confidence should be in (0,1], got {}",
                    field.name,
                    field.confidence
                );
            }
        }

        assert!(
            schema.stats.avg_confidence > 0.0 && schema.stats.avg_confidence <= 1.0,
            "Avg confidence should be in (0,1]"
        );
    }
}