use anyhow::Result;
use colored::Colorize;
use std::collections::HashMap;
#[derive(Debug, Clone)]
pub struct OptimizationSuggestion {
pub severity: SuggestionSeverity,
pub title: String,
pub description: String,
pub recommendation: String,
pub confidence: f64, pub estimated_speedup: Option<f64>, }
#[derive(Debug, Clone, PartialEq)]
pub enum SuggestionSeverity {
Critical, Warning, Info, }
#[derive(Debug, Clone)]
pub struct QueryComplexityMetrics {
pub triple_patterns: usize,
pub optional_count: usize,
pub union_count: usize,
pub filter_count: usize,
pub subquery_count: usize,
pub aggregation_count: usize,
pub property_path_count: usize,
pub estimated_cardinality: f64,
pub complexity_score: f64,
}
impl QueryComplexityMetrics {
pub fn calculate_score(&self) -> f64 {
let base_score = (self.triple_patterns as f64 * 1.0)
+ (self.optional_count as f64 * 5.0)
+ (self.union_count as f64 * 4.0)
+ (self.filter_count as f64 * 2.0)
+ (self.subquery_count as f64 * 8.0)
+ (self.aggregation_count as f64 * 3.0)
+ (self.property_path_count as f64 * 10.0);
(base_score.ln() * 15.0).clamp(0.0, 100.0)
}
}
#[derive(Debug)]
pub struct QueryPatternAnalyzer {
#[allow(dead_code)]
performance_history: HashMap<String, Vec<f64>>,
}
impl QueryPatternAnalyzer {
pub fn new() -> Self {
Self {
performance_history: HashMap::new(),
}
}
pub fn analyze_complexity(&self, query: &str) -> QueryComplexityMetrics {
let query_upper = query.to_uppercase();
let triple_patterns = count_triple_patterns(query);
let optional_count = query_upper.matches("OPTIONAL").count();
let union_count = query_upper.matches("UNION").count();
let filter_count = query_upper.matches("FILTER").count();
let subquery_count = query_upper.matches("SELECT").count().saturating_sub(1);
let aggregation_count = query_upper.matches("COUNT").count()
+ query_upper.matches("SUM").count()
+ query_upper.matches("AVG").count()
+ query_upper.matches("MAX").count()
+ query_upper.matches("MIN").count()
+ query_upper.matches("GROUP_CONCAT").count()
+ query_upper.matches("SAMPLE").count();
let property_path_count = count_property_paths(query);
let estimated_cardinality =
estimate_result_cardinality(triple_patterns, optional_count, union_count, filter_count);
let mut metrics = QueryComplexityMetrics {
triple_patterns,
optional_count,
union_count,
filter_count,
subquery_count,
aggregation_count,
property_path_count,
estimated_cardinality,
complexity_score: 0.0,
};
metrics.complexity_score = metrics.calculate_score();
metrics
}
pub fn predict_execution_time(&self, query: &str) -> Option<f64> {
let metrics = self.analyze_complexity(query);
let predicted_time = 10.0 + (metrics.triple_patterns as f64 * 5.0)
+ (metrics.optional_count as f64 * 50.0)
+ (metrics.union_count as f64 * 30.0)
+ (metrics.filter_count as f64 * 15.0)
+ (metrics.subquery_count as f64 * 100.0)
+ (metrics.aggregation_count as f64 * 25.0)
+ (metrics.property_path_count as f64 * 200.0);
Some(predicted_time)
}
}
impl Default for QueryPatternAnalyzer {
fn default() -> Self {
Self::new()
}
}
fn count_triple_patterns(query: &str) -> usize {
let var_pattern = query.matches('?').count();
(var_pattern / 3).max(1)
}
fn count_property_paths(query: &str) -> usize {
let mut count = 0;
let chars: Vec<char> = query.chars().collect();
for i in 0..chars.len() {
if (chars[i] == '*' || chars[i] == '+') && i > 0 {
let prev_char = chars[i - 1];
if prev_char == ':'
|| prev_char == '>'
|| prev_char.is_alphanumeric()
|| prev_char == '_'
{
if i >= 6 {
let before = String::from_iter(&chars[i.saturating_sub(7)..i]);
if !before.to_uppercase().contains("SELECT") {
count += 1;
}
} else {
count += 1;
}
}
}
}
for (i, _) in query.match_indices('{') {
if i > 0 && i + 1 < query.len() {
let before = &query[..i];
let is_sparql_keyword = before.ends_with("WHERE ")
|| before.ends_with("FILTER")
|| before.ends_with("SELECT")
|| before.ends_with("GROUP")
|| before.ends_with("OPTIONAL")
|| before.ends_with("UNION")
|| before.ends_with("GRAPH")
|| before.ends_with("SERVICE")
|| before.ends_with("BIND")
|| before.ends_with("VALUES")
|| before.ends_with("MINUS");
if !is_sparql_keyword {
let after = &query[i + 1..];
if after.chars().next().is_some_and(|c| c.is_ascii_digit()) {
count += 1;
}
}
}
}
count
}
fn estimate_result_cardinality(
triple_patterns: usize,
optional_count: usize,
union_count: usize,
filter_count: usize,
) -> f64 {
let base_cardinality = 1000.0; let selectivity = 0.1_f64.powi(filter_count as i32);
let optional_multiplier = 1.0 + (optional_count as f64 * 0.5); let union_multiplier = 1.0 + (union_count as f64);
(base_cardinality
* triple_patterns as f64
* selectivity
* optional_multiplier
* union_multiplier)
.log10()
}
pub fn analyze_query_for_optimization(query: &str) -> Vec<OptimizationSuggestion> {
let mut suggestions = Vec::new();
let query_upper = query.to_uppercase();
let analyzer = QueryPatternAnalyzer::new();
let metrics = analyzer.analyze_complexity(query);
if let Some(predicted_time) = analyzer.predict_execution_time(query) {
if predicted_time > 1000.0 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Critical,
title: format!("High complexity query (predicted: {:.0}ms)", predicted_time),
description: format!(
"Query complexity score: {:.1}/100. This may execute slowly.",
metrics.complexity_score
),
recommendation: "Consider breaking into smaller queries or adding constraints"
.to_string(),
confidence: 0.75,
estimated_speedup: Some(5.0),
});
}
}
if !query_upper.contains("LIMIT") && query_upper.contains("SELECT") {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Warning,
title: "Missing LIMIT clause".to_string(),
description: "Query may return unbounded results, risking memory exhaustion"
.to_string(),
recommendation: "Add LIMIT clause: SELECT ... LIMIT 1000".to_string(),
confidence: 0.95,
estimated_speedup: Some(10.0),
});
}
if query_upper.contains("SELECT *") || query_upper.contains("SELECT\n*") {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Info,
title: "SELECT * usage".to_string(),
description: "Selecting all variables may retrieve unnecessary data".to_string(),
recommendation: "Specify only required variables: SELECT ?s ?p instead of SELECT *"
.to_string(),
confidence: 0.70,
estimated_speedup: Some(1.5),
});
}
if metrics.optional_count > 3 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Warning,
title: format!("Multiple OPTIONAL clauses ({})", metrics.optional_count),
description:
"Excessive OPTIONAL patterns cause combinatorial explosion in join evaluation"
.to_string(),
recommendation: "Consider restructuring with UNION or using BOUND() filters"
.to_string(),
confidence: 0.85,
estimated_speedup: Some(metrics.optional_count as f64),
});
}
if metrics.filter_count > 5 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Warning,
title: format!("Many FILTER clauses ({})", metrics.filter_count),
description: "Excessive filtering indicates inefficient query structure".to_string(),
recommendation: "Move FILTER conditions into triple patterns for index usage"
.to_string(),
confidence: 0.80,
estimated_speedup: Some(2.0),
});
}
if query_upper.contains("REGEX")
&& !query_upper.contains("\"^")
&& !query_upper.contains("STRSTARTS")
{
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Critical,
title: "Unanchored REGEX pattern".to_string(),
description: "REGEX without anchor (^) requires full string scan on every value"
.to_string(),
recommendation: "Use STRSTARTS() for prefix matching: STRSTARTS(?var, \"pattern\")"
.to_string(),
confidence: 0.98,
estimated_speedup: Some(100.0), });
}
if query_upper.contains("SELECT DISTINCT") || query_upper.contains("SELECT\nDISTINCT") {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Info,
title: "DISTINCT usage".to_string(),
description: "DISTINCT requires hash-based deduplication of all results".to_string(),
recommendation:
"Verify DISTINCT is necessary - well-structured queries avoid duplicates"
.to_string(),
confidence: 0.65,
estimated_speedup: Some(1.3),
});
}
if metrics.union_count > 3 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Warning,
title: format!("Multiple UNION clauses ({})", metrics.union_count),
description: "Many UNIONs require executing multiple sub-queries and merging results"
.to_string(),
recommendation: "Consider property paths or alternative patterns to simplify"
.to_string(),
confidence: 0.75,
estimated_speedup: Some(2.0),
});
}
if query_upper.contains("ORDER BY") && !query_upper.contains("LIMIT") {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Warning,
title: "ORDER BY without LIMIT".to_string(),
description: "Sorting unbounded results requires full materialization and sorting"
.to_string(),
recommendation: "Add LIMIT to enable top-k optimization: ORDER BY ... LIMIT 100"
.to_string(),
confidence: 0.92,
estimated_speedup: Some(20.0),
});
}
if metrics.aggregation_count > 5 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Info,
title: format!(
"Complex aggregation ({} functions)",
metrics.aggregation_count
),
description: "Query uses many aggregation functions which may be expensive".to_string(),
recommendation: "Consider if some aggregations can be pre-computed or materialized"
.to_string(),
confidence: 0.70,
estimated_speedup: Some(1.5),
});
}
if metrics.subquery_count > 0 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Info,
title: format!("Subquery usage detected ({})", metrics.subquery_count),
description: "Subqueries prevent some optimizations and may execute multiple times"
.to_string(),
recommendation: "Consider flattening subqueries using JOIN patterns if possible"
.to_string(),
confidence: 0.60,
estimated_speedup: Some(1.8),
});
}
if metrics.property_path_count > 0 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Critical,
title: format!(
"Unbounded property paths detected ({})",
metrics.property_path_count
),
description: "Property paths (*, +) may traverse millions of triples in large graphs"
.to_string(),
recommendation:
"Add length constraints or use specific predicates: ?s predicate{1,5} ?o"
.to_string(),
confidence: 0.95,
estimated_speedup: Some(1000.0), });
}
if metrics.triple_patterns > 5 && metrics.filter_count == 0 {
suggestions.push(OptimizationSuggestion {
severity: SuggestionSeverity::Warning,
title: "Potential cartesian product".to_string(),
description: "Many triple patterns without filters may cause joins to explode"
.to_string(),
recommendation: "Add constraints or ensure patterns share variables".to_string(),
confidence: 0.68,
estimated_speedup: Some(10.0),
});
}
suggestions
}
pub fn display_suggestions(suggestions: &[OptimizationSuggestion]) {
if suggestions.is_empty() {
println!(
"{}",
"✅ No optimization issues detected! Query looks efficient."
.green()
.bold()
);
return;
}
println!(
"\n{}",
"🔍 Advanced Query Optimization Analysis".bold().cyan()
);
println!("{}", "━".repeat(70));
let mut critical = 0;
let mut warnings = 0;
let mut info = 0;
let mut total_speedup = 1.0_f64;
for suggestion in suggestions {
match suggestion.severity {
SuggestionSeverity::Critical => critical += 1,
SuggestionSeverity::Warning => warnings += 1,
SuggestionSeverity::Info => info += 1,
}
if let Some(speedup) = suggestion.estimated_speedup {
total_speedup *= speedup.min(10.0); }
let (icon, color_fn): (&str, fn(&str) -> colored::ColoredString) = match suggestion.severity
{
SuggestionSeverity::Critical => ("🔴", |s: &str| s.red()),
SuggestionSeverity::Warning => ("🟡", |s: &str| s.yellow()),
SuggestionSeverity::Info => ("ℹ️ ", |s: &str| s.blue()),
};
println!("\n{} {}", icon, color_fn(&suggestion.title).bold());
println!(" {}", suggestion.description);
println!(" 💡 {}", suggestion.recommendation.italic());
println!(
" 📊 Confidence: {:.0}%{}",
suggestion.confidence * 100.0,
if let Some(speedup) = suggestion.estimated_speedup {
format!(" | Est. speedup: {:.1}x", speedup)
} else {
String::new()
}
);
}
println!("\n{}", "━".repeat(70));
println!(
"{}",
format!(
"Summary: {} critical, {} warnings, {} info",
critical, warnings, info
)
.bold()
);
if total_speedup > 1.5 {
println!(
"{}",
format!(
"🚀 Potential performance improvement: {:.1}x faster with all optimizations",
total_speedup.clamp(1.0, 1000.0)
)
.green()
.bold()
);
}
println!();
}
pub async fn optimize_command(query: String, file: bool) -> Result<()> {
let sparql_query = if file {
std::fs::read_to_string(&query)?
} else {
query
};
println!(
"\n{}",
"🔧 Advanced SPARQL Query Optimizer (SciRS2-Powered)"
.bold()
.cyan()
);
println!("{}", "━".repeat(70));
println!("\nAnalyzing query for optimization opportunities...\n");
let preview = if sparql_query.len() > 200 {
format!("{}...", &sparql_query[..197])
} else {
sparql_query.clone()
};
println!("Query:\n{}\n", preview.bright_black());
let analyzer = QueryPatternAnalyzer::new();
let metrics = analyzer.analyze_complexity(&sparql_query);
println!("{}", "📈 Query Complexity Metrics:".bold());
println!(" Triple patterns: {}", metrics.triple_patterns);
println!(" OPTIONAL clauses: {}", metrics.optional_count);
println!(" UNION operations: {}", metrics.union_count);
println!(" FILTER clauses: {}", metrics.filter_count);
println!(" Subqueries: {}", metrics.subquery_count);
println!(" Aggregations: {}", metrics.aggregation_count);
println!(" Property paths: {}", metrics.property_path_count);
println!(
" Est. cardinality: 10^{:.1} results",
metrics.estimated_cardinality
);
println!(" Complexity score: {:.1}/100", metrics.complexity_score);
if let Some(predicted_time) = analyzer.predict_execution_time(&sparql_query) {
let time_str = if predicted_time < 1000.0 {
format!("{:.0}ms", predicted_time)
} else {
format!("{:.1}s", predicted_time / 1000.0)
};
println!(" Predicted time: {}", time_str);
}
println!();
let suggestions = analyze_query_for_optimization(&sparql_query);
display_suggestions(&suggestions);
println!("{}", "📚 SPARQL Optimization Best Practices:".bold());
println!(" • Use LIMIT to bound result sets (enables streaming)");
println!(" • Specify exact variables instead of SELECT * (reduces data transfer)");
println!(" • Place most selective patterns first (reduces intermediate results)");
println!(" • Use FILTER on indexed properties (enables index lookups)");
println!(" • Avoid unanchored REGEX - use STRSTARTS/CONTAINS (100x faster)");
println!(" • Add constraints to property paths (prevents graph explosion)");
println!(" • Test with EXPLAIN for actual execution plans");
println!(" • Monitor with performance profiler for bottlenecks");
println!();
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_analyze_missing_limit() {
let query = "SELECT ?s ?p ?o WHERE { ?s ?p ?o }";
let suggestions = analyze_query_for_optimization(query);
assert!(!suggestions.is_empty());
assert!(suggestions
.iter()
.any(|s| s.title.contains("LIMIT") && s.severity == SuggestionSeverity::Warning));
}
#[test]
fn test_analyze_select_star() {
let query = "SELECT * WHERE { ?s ?p ?o } LIMIT 10";
let suggestions = analyze_query_for_optimization(query);
assert!(suggestions.iter().any(|s| s.title.contains("SELECT *")));
}
#[test]
fn test_analyze_unanchored_regex() {
let query =
r#"SELECT ?s WHERE { ?s rdfs:label ?label . FILTER(REGEX(?label, "test")) } LIMIT 10"#;
let suggestions = analyze_query_for_optimization(query);
assert!(suggestions
.iter()
.any(|s| s.title.contains("REGEX") && s.severity == SuggestionSeverity::Critical));
let regex_suggestion = suggestions
.iter()
.find(|s| s.title.contains("REGEX"))
.unwrap();
assert!(regex_suggestion.confidence > 0.9);
}
#[test]
fn test_analyze_optimal_query() {
let query = r#"SELECT ?person ?name WHERE {
?person rdf:type foaf:Person .
?person foaf:name ?name .
FILTER(STRSTARTS(?name, "John"))
} LIMIT 100"#;
let suggestions = analyze_query_for_optimization(query);
for s in &suggestions {
eprintln!(
"Suggestion [{:?}]: {} (confidence: {:.0}%)",
s.severity,
s.title,
s.confidence * 100.0
);
}
let critical_count = suggestions
.iter()
.filter(|s| s.severity == SuggestionSeverity::Critical)
.count();
assert_eq!(critical_count, 0);
}
#[test]
fn test_analyze_multiple_optionals() {
let mut query = String::from("SELECT ?s WHERE { ?s ?p ?o .");
for _ in 0..5 {
query.push_str(" OPTIONAL { ?s ?x ?y } .");
}
query.push_str(" } LIMIT 10");
let suggestions = analyze_query_for_optimization(&query);
assert!(suggestions.iter().any(|s| s.title.contains("OPTIONAL")));
}
#[test]
fn test_complexity_metrics() {
let analyzer = QueryPatternAnalyzer::new();
let query = r#"SELECT ?s ?p ?o WHERE {
?s ?p ?o .
OPTIONAL { ?s rdfs:label ?label }
FILTER(?o > 100)
} LIMIT 10"#;
let metrics = analyzer.analyze_complexity(query);
assert!(metrics.triple_patterns > 0);
assert_eq!(metrics.optional_count, 1);
assert_eq!(metrics.filter_count, 1);
assert!(metrics.complexity_score > 0.0);
assert!(metrics.complexity_score <= 100.0);
}
#[test]
fn test_prediction() {
let analyzer = QueryPatternAnalyzer::new();
let simple_query = "SELECT ?s WHERE { ?s ?p ?o } LIMIT 10";
let complex_query = r#"SELECT ?s WHERE {
?s ?p ?o .
OPTIONAL { ?s ?x ?y }
OPTIONAL { ?s ?a ?b }
OPTIONAL { ?s ?c ?d }
}"#;
let simple_time = analyzer.predict_execution_time(simple_query).unwrap();
let complex_time = analyzer.predict_execution_time(complex_query).unwrap();
assert!(complex_time > simple_time);
}
#[test]
fn test_property_path_detection() {
let query = "SELECT ?s ?o WHERE { ?s foaf:knows* ?o } LIMIT 10";
let suggestions = analyze_query_for_optimization(query);
assert!(suggestions.iter().any(
|s| s.title.contains("property path") && s.severity == SuggestionSeverity::Critical
));
}
#[test]
fn test_confidence_scores() {
let query = "SELECT ?s ?p ?o WHERE { ?s ?p ?o }"; let suggestions = analyze_query_for_optimization(query);
for suggestion in &suggestions {
assert!(suggestion.confidence > 0.0);
assert!(suggestion.confidence <= 1.0);
}
}
#[test]
fn test_estimated_speedup() {
let query = r#"SELECT ?s WHERE { ?s rdfs:label ?label . FILTER(REGEX(?label, "test")) }"#;
let suggestions = analyze_query_for_optimization(query);
let regex_suggestion = suggestions
.iter()
.find(|s| s.title.contains("REGEX"))
.unwrap();
assert!(regex_suggestion.estimated_speedup.unwrap() > 50.0);
}
}