use crate::error::Result;
use crate::extract;
use crate::fetch::{self, PageContent};
use crate::page_facts::PageFacts;
use crate::platform::Strategy;
use crate::transforms::Intent;
use crate::watch::{Engine, Extraction};
use std::collections::HashMap;
use std::time::Instant;
/// Outcome of validating a single extraction attempt.
///
/// Instances are normally built with [`ValidationResult::success`] or
/// [`ValidationResult::failure`]; both start with an empty `warnings` list.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// `true` when built by `success`, `false` when built by `failure`.
    pub success: bool,
    /// The extracted text; `None` for failures.
    pub extracted_content: Option<String>,
    /// Quality assessment; failures carry `DataQuality::default()`.
    pub data_quality: DataQuality,
    /// Failure message; `None` for successes.
    pub error: Option<String>,
    /// Elapsed time in milliseconds, as supplied by the caller.
    pub runtime_ms: u64,
    /// Non-fatal notes appended through `with_warning`.
    pub warnings: Vec<String>,
}

impl ValidationResult {
    /// Builds a successful result holding `content` and its `quality`.
    pub fn success(content: String, quality: DataQuality, runtime_ms: u64) -> Self {
        Self {
            runtime_ms,
            success: true,
            error: None,
            extracted_content: Some(content),
            data_quality: quality,
            warnings: Vec::default(),
        }
    }

    /// Builds a failed result carrying `error` and a default `DataQuality`.
    pub fn failure(error: String, runtime_ms: u64) -> Self {
        Self {
            runtime_ms,
            success: false,
            error: Some(error),
            extracted_content: None,
            data_quality: DataQuality::default(),
            warnings: Vec::default(),
        }
    }

    /// Appends `warning` and hands the result back, builder style.
    pub fn with_warning(self, warning: String) -> Self {
        let mut updated = self;
        updated.warnings.push(warning);
        updated
    }

    /// Returns the quality score for successes and `0.0` for failures.
    pub fn confidence(&self) -> f32 {
        if self.success {
            self.data_quality.score()
        } else {
            0.0
        }
    }
}
/// Heuristic quality signals for a piece of extracted content.
///
/// The four boolean checks and `stability_hint` feed [`DataQuality::score`];
/// the booleans also drive the text produced by [`DataQuality::summary`].
#[derive(Debug, Clone, Default)]
pub struct DataQuality {
    /// Content passed the minimum-length check (worth 0.4 of the score).
    pub not_empty: bool,
    /// Content matched what the intent expects (worth 0.2 of the score).
    pub has_expected_type: bool,
    /// Content does not look like a template/placeholder (worth 0.2).
    pub not_template: bool,
    /// The selector appears to have matched something (worth 0.1).
    pub selector_found: bool,
    /// Stability estimate for the extraction method; contributes up to 0.1.
    /// Values outside `0.0..=1.0` (or NaN) are sanitised by `score`.
    pub stability_hint: f32,
    /// Free-form remarks collected during assessment.
    pub notes: Vec<String>,
}

impl DataQuality {
    /// Returns a weighted score in `0.0..=1.0`.
    ///
    /// Each passed check adds its weight (0.4, 0.2, 0.2, 0.1) and
    /// `stability_hint` contributes up to 0.1. The weights sum to 1.0, so no
    /// further normalisation is needed.
    pub fn score(&self) -> f32 {
        const STABILITY_WEIGHT: f32 = 0.1;

        // `stability_hint` is a public field, so guard against NaN and
        // out-of-range values that would push the score outside 0..=1.
        let hint = if self.stability_hint.is_nan() {
            0.0
        } else {
            self.stability_hint.clamp(0.0, 1.0)
        };

        let weighted_checks: [(bool, f32); 4] = [
            (self.not_empty, 0.4),
            (self.has_expected_type, 0.2),
            (self.not_template, 0.2),
            (self.selector_found, 0.1),
        ];
        let passed = weighted_checks
            .iter()
            .filter(|check| check.0)
            .fold(0.0_f32, |total, check| total + check.1);

        // Final clamp absorbs any floating-point rounding overshoot.
        (passed + hint * STABILITY_WEIGHT).clamp(0.0, 1.0)
    }

    /// Returns a one-line description: either the list of failed checks or a
    /// "Good quality" message that includes the stability percentage.
    pub fn summary(&self) -> String {
        let checks: [(bool, &str); 4] = [
            (self.not_empty, "content too short"),
            (self.has_expected_type, "content type mismatch"),
            (self.not_template, "may be template/placeholder"),
            (self.selector_found, "selector not found"),
        ];
        let issues: Vec<&str> = checks
            .iter()
            .filter(|check| !check.0)
            .map(|check| check.1)
            .collect();

        if issues.is_empty() {
            format!("Good quality (stability: {:.0}%)", self.stability_hint * 100.0)
        } else {
            format!("Issues: {}", issues.join(", "))
        }
    }
}
/// Runs `strategy` against already-fetched page data and reports the outcome.
///
/// For the Playwright engine the JS-rendered HTML in `facts` is used when
/// present; every other engine (and Playwright without rendered HTML) uses
/// `facts.html`. The result is a failure when extraction errors out, when the
/// content is shorter than the length check allows, or when it looks like a
/// template. Type-mismatch and low-stability warnings are attached to the
/// result in either case.
pub fn validate_strategy(
    url: &str,
    strategy: &Strategy,
    intent: Intent,
    facts: &PageFacts,
) -> ValidationResult {
    let timer = Instant::now();
    let elapsed_ms = || timer.elapsed().as_millis() as u64;

    let source_html: &str = if matches!(&strategy.engine, Engine::Playwright) {
        facts.js_rendered_html.as_deref().unwrap_or(&facts.html)
    } else {
        &facts.html
    };

    let extracted = match extract_for_validation(source_html, &strategy.extraction, url) {
        Ok(text) => text,
        Err(err) => {
            return ValidationResult::failure(format!("Extraction failed: {}", err), elapsed_ms());
        }
    };

    let quality = assess_quality(&extracted, &strategy.extraction, intent);
    // Measured once so success and failure report the same duration.
    let runtime_ms = elapsed_ms();

    // Read the flags before `quality` is moved into the result.
    let type_mismatch = !quality.has_expected_type;
    let low_stability = quality.stability_hint < 0.5;

    let failure_reason = if !quality.not_empty {
        Some("Extracted content too short (< 100 chars)")
    } else if !quality.not_template {
        Some("Content appears to be template/placeholder")
    } else {
        None
    };

    let mut outcome = match failure_reason {
        Some(reason) => ValidationResult::failure(reason.to_string(), runtime_ms),
        None => ValidationResult::success(extracted, quality, runtime_ms),
    };

    if type_mismatch {
        outcome =
            outcome.with_warning("Content may not match expected type for intent".to_string());
    }
    if low_stability {
        outcome = outcome.with_warning("Extraction method may be unstable".to_string());
    }
    outcome
}
/// Wraps raw `html` in a `PageContent` (no title, no text) and runs the
/// regular extractor on it.
fn extract_for_validation(html: &str, extraction: &Extraction, url: &str) -> Result<String> {
    let page = PageContent {
        html: html.to_owned(),
        url: url.to_owned(),
        title: None,
        text: None,
    };
    extract::extract(&page, extraction)
}
/// Builds the `DataQuality` record for extracted `content`.
///
/// `selector_found` starts out `true` and is only cleared for selector
/// extractions whose trimmed content is under 10 bytes. Notes are added for a
/// probably-missing selector, for template syntax, and for short content that
/// mentions "loading".
fn assess_quality(content: &str, extraction: &Extraction, intent: Intent) -> DataQuality {
    let mut notes: Vec<String> = Vec::new();
    let mut selector_found = true;

    if let Extraction::Selector { selector } = extraction {
        // Empty content trims to length 0, so this also covers the empty case.
        if content.trim().len() < 10 {
            selector_found = false;
            notes.push(format!("Selector '{}' may not exist", selector));
        }
    }

    let has_template_syntax = content.contains("{{") || content.contains("${");
    if has_template_syntax {
        notes.push("Contains template syntax".to_string());
    }

    let looks_like_loader = content.len() < 500 && content.to_lowercase().contains("loading");
    if looks_like_loader {
        notes.push("May be loading placeholder".to_string());
    }

    DataQuality {
        not_empty: content.len() >= 100,
        has_expected_type: check_expected_type(content, intent),
        not_template: !is_template_content(content),
        selector_found,
        stability_hint: stability_for_extraction(extraction),
        notes,
    }
}
/// Heuristically checks whether `content` looks like what `intent` expects.
///
/// Keyword checks are case-insensitive. `News` only requires at least 200
/// bytes of content and `Generic` always passes.
fn check_expected_type(content: &str, intent: Intent) -> bool {
    /// True when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|&needle| haystack.contains(needle))
    }

    /// True when `text` contains an ASCII digit, a dot, and another ASCII
    /// digit in a row (e.g. the `2.0` in "v2.0").
    fn has_dotted_number(text: &str) -> bool {
        text.as_bytes()
            .windows(3)
            .any(|w| w[0].is_ascii_digit() && w[1] == b'.' && w[2].is_ascii_digit())
    }

    match intent {
        Intent::Price => has_price_pattern(content),
        Intent::Stock => contains_any(
            &content.to_lowercase(),
            // "in stock" / "out of stock" are already covered by "stock".
            &[
                "stock",
                "available",
                "sold out",
                "add to cart",
                "buy now",
                "notify me",
            ],
        ),
        Intent::Release => {
            // A bare "." used to count as a version hint, which let almost any
            // sentence through; require a dotted number instead.
            contains_any(
                &content.to_lowercase(),
                &["version", "release", "changelog", "v1", "v2"],
            ) || has_dotted_number(content)
        }
        Intent::Jobs => contains_any(
            &content.to_lowercase(),
            &["job", "position", "apply", "hiring", "career", "salary"],
        ),
        Intent::News => content.len() >= 200,
        Intent::Generic => true,
    }
}
fn has_price_pattern(content: &str) -> bool {
let patterns = [
r"\$\d", r"€\d", r"£\d", r"¥\d",
r"\d+\.\d{2}", r"USD", r"EUR", r"GBP",
];
for pattern in patterns {
if let Ok(re) = regex::Regex::new(pattern) {
if re.is_match(content) {
return true;
}
}
}
let lower = content.to_lowercase();
lower.contains("price") || lower.contains("cost") || lower.contains("$")
}
/// Returns `true` when `content` looks like an unrendered template or a
/// placeholder rather than real page data.
///
/// Checks, in order: `{{ }}` / `${ }` template syntax, double-underscore
/// markers combined with the words "placeholder"/"template", a zero price
/// (`$0`, `$0.00`, `price: 0`), short "loading"/"please wait" text, well-known
/// placeholder phrases, and the stand-alone words "TBA"/"TBD".
fn is_template_content(content: &str) -> bool {
    /// True when some occurrence of `marker` in `text` is directly followed by
    /// an amount that starts with `0` and contains no non-zero digit, so
    /// `$0` and `$0.00` match but `$0.99` does not.
    fn has_zero_amount_after(text: &str, marker: &str) -> bool {
        text.match_indices(marker).any(|(start, _)| {
            let tail = &text[start + marker.len()..];
            let end = tail
                .find(|c: char| !(c.is_ascii_digit() || c == '.' || c == ','))
                .unwrap_or(tail.len());
            let amount = &tail[..end];
            amount.starts_with('0') && !amount.bytes().any(|b| (b'1'..=b'9').contains(&b))
        })
    }

    const PLACEHOLDER_PHRASES: [&str; 5] = [
        "lorem ipsum",
        "sample text",
        "placeholder",
        "coming soon",
        "to be announced",
    ];

    let lower = content.to_lowercase();

    if content.contains("{{") && content.contains("}}") {
        return true;
    }
    if content.contains("${") && content.contains('}') {
        return true;
    }
    if content.matches("__").count() >= 2
        && (lower.contains("placeholder") || lower.contains("template"))
    {
        return true;
    }

    // Plain substring checks for "$0" / "price: 0" also flagged genuine
    // sub-dollar prices such as "$0.99"; only an all-zero amount counts.
    if has_zero_amount_after(&lower, "$") || has_zero_amount_after(&lower, "price: ") {
        return true;
    }

    if content.len() < 300 && (lower.contains("loading") || lower.contains("please wait")) {
        return true;
    }

    if PLACEHOLDER_PHRASES
        .iter()
        .any(|&phrase| lower.contains(phrase))
    {
        return true;
    }

    // Match the abbreviations as whole words so that "football" or
    // "basketball" (which contain "tba") are not treated as placeholders.
    lower
        .split(|c: char| !c.is_alphanumeric())
        .any(|word| matches!(word, "tba" | "tbd"))
}
/// Maps an extraction method to its stability estimate.
///
/// Fixed values are used for every method except selectors, whose estimate is
/// derived from the selector text by `selector_stability`.
fn stability_for_extraction(extraction: &Extraction) -> f32 {
    match extraction {
        Extraction::Selector { selector } => selector_stability(selector),
        Extraction::Rss => 0.95,
        Extraction::JsonLd { .. } => 0.9,
        Extraction::Meta { .. } => 0.85,
        Extraction::Auto => 0.7,
        Extraction::Full => 0.5,
    }
}
/// Estimates how stable a CSS selector is, as a value in `0.0..=1.0`.
///
/// Starts from 0.7, rewards `#`, `[data-` and `.` in the selector, and
/// penalises selectors longer than 50 bytes and selectors that contain none
/// of `#`, `.` or `[`.
fn selector_stability(selector: &str) -> f32 {
    let has_id = selector.contains('#');
    let has_class = selector.contains('.');
    let has_data_attr = selector.contains("[data-");
    let has_any_attr = selector.contains('[');

    // (condition, adjustment) pairs, applied in this order.
    let adjustments: [(bool, f32); 5] = [
        (has_id, 0.15),
        (has_data_attr, 0.1),
        (has_class, 0.05),
        (selector.len() > 50, -0.1),
        (!(has_id || has_class || has_any_attr), -0.2),
    ];

    let base: f32 = 0.7;
    adjustments
        .iter()
        .filter(|adjustment| adjustment.0)
        .fold(base, |score, adjustment| score + adjustment.1)
        .clamp(0.0, 1.0)
}
/// Validates `strategies` in order and returns the best one with its result.
///
/// Among successful strategies the one with the highest confidence wins, and
/// the search stops early once a confidence of at least 0.85 is seen. If no
/// strategy succeeds, the first failing strategy and its result are returned.
/// With an empty `strategies` list a default HTTP/auto strategy is returned
/// together with a "No strategies available" failure.
pub fn try_strategies_with_fallback(
    url: &str,
    strategies: Vec<Strategy>,
    intent: Intent,
    facts: &PageFacts,
) -> (Strategy, ValidationResult) {
    // Confidence at which further strategies are no longer tried.
    const GOOD_ENOUGH: f32 = 0.85;

    // Strategy and result always travel together, so keep them in one slot.
    let mut best: Option<(Strategy, ValidationResult)> = None;
    let mut best_score: f32 = -1.0;

    for strategy in strategies {
        let result = validate_strategy(url, &strategy, intent, facts);
        if result.success {
            let score = result.confidence();
            if score > best_score {
                best_score = score;
                // `strategy` is owned by the loop, so it can be moved rather
                // than cloned.
                best = Some((strategy, result));
            }
            if score >= GOOD_ENOUGH {
                break;
            }
        } else if best.is_none() {
            // Remember the first failure in case nothing succeeds.
            best = Some((strategy, result));
        }
    }

    best.unwrap_or_else(|| {
        (
            Strategy {
                engine: Engine::Http,
                extraction: Extraction::Auto,
                reason: "Default fallback".to_string(),
                confidence: 0.3,
                notes: None,
            },
            ValidationResult::failure("No strategies available".to_string(), 0),
        )
    })
}
/// Fetches `url` with `engine`, extracts content and validates it.
///
/// Every return path yields `Ok`: fetch and extraction errors are reported as
/// a failed `ValidationResult` rather than as `Err`. The result carries the
/// same type-mismatch and low-stability warnings that `validate_strategy`
/// attaches, so both entry points describe an assessment the same way.
pub fn validate_url_config(
    url: &str,
    engine: Engine,
    extraction: Extraction,
    intent: Intent,
) -> Result<ValidationResult> {
    let start = Instant::now();
    let elapsed_ms = || start.elapsed().as_millis() as u64;

    let headers = HashMap::new();
    // `engine` is owned and not used again, so it is passed on without a clone.
    let content = match fetch::fetch(url, engine, &headers) {
        Ok(page) => page,
        Err(e) => {
            return Ok(ValidationResult::failure(
                format!("Fetch failed: {}", e),
                elapsed_ms(),
            ));
        }
    };

    let extracted = match extract::extract(&content, &extraction) {
        Ok(text) => text,
        Err(e) => {
            return Ok(ValidationResult::failure(
                format!("Extraction failed: {}", e),
                elapsed_ms(),
            ));
        }
    };

    let quality = assess_quality(&extracted, &extraction, intent);
    let runtime_ms = elapsed_ms();

    // Read the flags before `quality` is moved into the result.
    let type_warning = !quality.has_expected_type;
    let stability_warning = quality.stability_hint < 0.5;

    let mut result = if quality.not_empty && quality.not_template {
        ValidationResult::success(extracted, quality, runtime_ms)
    } else {
        let error = if !quality.not_empty {
            "Extracted content too short"
        } else {
            "Content appears to be template/placeholder"
        };
        ValidationResult::failure(error.to_string(), runtime_ms)
    };

    if type_warning {
        result = result.with_warning("Content may not match expected type for intent".to_string());
    }
    if stability_warning {
        result = result.with_warning("Extraction method may be unstable".to_string());
    }
    Ok(result)
}
/// Assesses `content` (treated as HTML) without fetching anything.
///
/// The content is wrapped in a `PageContent` with an empty URL and run
/// through the extractor; an extraction error is treated as empty output.
pub fn quick_validate(content: &str, extraction: &Extraction, intent: Intent) -> DataQuality {
    let synthetic_page = PageContent {
        html: content.to_owned(),
        url: String::new(),
        title: None,
        text: None,
    };
    let extracted = match extract::extract(&synthetic_page, extraction) {
        Ok(text) => text,
        // Best effort: a failed extraction is assessed as empty content.
        Err(_) => String::new(),
    };
    assess_quality(&extracted, extraction, intent)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Currency symbols, codes and keywords are recognised; plain text is not.
    #[test]
    fn test_price_detection() {
        for priced in ["$99.99", "Price: $50", "€49.00", "USD 100"] {
            assert!(has_price_pattern(priced), "expected a price in {:?}", priced);
        }
        assert!(!has_price_pattern("Hello world"));
    }

    /// Template syntax and placeholder text are flagged; real content is not.
    #[test]
    fn test_template_detection() {
        let templates = [
            "Price: {{price}}",
            "${product.name}",
            "$0.00",
            "Loading...",
            "Lorem ipsum dolor sit amet",
        ];
        for sample in templates {
            assert!(is_template_content(sample), "expected template: {:?}", sample);
        }
        assert!(!is_template_content("Nike Air Max - $149.99 - In Stock"));
    }

    /// Each intent accepts a representative piece of matching content.
    #[test]
    fn test_intent_type_checking() {
        let cases = [
            ("$99.99", Intent::Price),
            ("In Stock - Add to Cart", Intent::Stock),
            ("Version 2.0 released", Intent::Release),
            ("Software Engineer - Apply Now", Intent::Jobs),
        ];
        for (text, intent) in cases {
            assert!(check_expected_type(text, intent), "type check failed for {:?}", text);
        }
    }

    /// Specific selectors score higher than bare element selectors.
    #[test]
    fn test_selector_stability() {
        let id_score = selector_stability("#main-content");
        let data_attr_score = selector_stability("[data-product-id]");
        let class_score = selector_stability(".product-price");
        let element_score = selector_stability("div");

        assert!(id_score > 0.8);
        assert!(data_attr_score > 0.7);
        assert!(class_score > 0.6);
        assert!(element_score < 0.6);
    }

    /// All checks passing scores high; all checks failing scores low.
    #[test]
    fn test_quality_score() {
        let good_quality = DataQuality {
            not_empty: true,
            has_expected_type: true,
            not_template: true,
            selector_found: true,
            stability_hint: 0.9,
            notes: Vec::new(),
        };
        assert!(good_quality.score() > 0.8);

        // Every boolean check defaults to `false`.
        let poor_quality = DataQuality {
            stability_hint: 0.3,
            ..DataQuality::default()
        };
        assert!(poor_quality.score() < 0.2);
    }
}