Skip to main content

tensorlogic_adapters/
schema_analysis.rs

1//! Schema analysis and statistics.
2//!
3//! This module provides tools for analyzing symbol tables and generating
4//! insights about schema structure, complexity, and usage patterns.
5
6use std::collections::{HashMap, HashSet};
7
8use crate::SymbolTable;
9
10/// Comprehensive statistics about a schema.
11#[derive(Clone, Debug)]
12pub struct SchemaStatistics {
13    /// Number of domains defined.
14    pub domain_count: usize,
15    /// Number of predicates defined.
16    pub predicate_count: usize,
17    /// Number of variables bound.
18    pub variable_count: usize,
19    /// Total cardinality across all domains.
20    pub total_cardinality: usize,
21    /// Average domain cardinality.
22    pub avg_cardinality: f64,
23    /// Maximum domain cardinality.
24    pub max_cardinality: usize,
25    /// Minimum domain cardinality.
26    pub min_cardinality: usize,
27    /// Distribution of predicate arities.
28    pub arity_distribution: HashMap<usize, usize>,
29    /// Most common domain types in predicates.
30    pub domain_usage_frequency: HashMap<String, usize>,
31    /// Domains that are never used in predicates.
32    pub unused_domains: Vec<String>,
33    /// Predicates grouped by arity.
34    pub predicates_by_arity: HashMap<usize, Vec<String>>,
35}
36
37impl SchemaStatistics {
38    /// Compute statistics for a symbol table.
39    ///
40    /// # Example
41    ///
42    /// ```rust
43    /// use tensorlogic_adapters::{SymbolTable, DomainInfo, PredicateInfo, SchemaStatistics};
44    ///
45    /// let mut table = SymbolTable::new();
46    /// table.add_domain(DomainInfo::new("Person", 100)).unwrap();
47    /// table.add_predicate(PredicateInfo::new(
48    ///     "knows",
49    ///     vec!["Person".to_string(), "Person".to_string()]
50    /// )).unwrap();
51    ///
52    /// let stats = SchemaStatistics::compute(&table);
53    /// assert_eq!(stats.domain_count, 1);
54    /// assert_eq!(stats.predicate_count, 1);
55    /// ```
56    pub fn compute(table: &SymbolTable) -> Self {
57        let domain_count = table.domains.len();
58        let predicate_count = table.predicates.len();
59        let variable_count = table.variables.len();
60
61        // Compute cardinality statistics
62        let cardinalities: Vec<usize> = table.domains.values().map(|d| d.cardinality).collect();
63        let total_cardinality: usize = cardinalities.iter().sum();
64        let avg_cardinality = if domain_count > 0 {
65            total_cardinality as f64 / domain_count as f64
66        } else {
67            0.0
68        };
69        let max_cardinality = cardinalities.iter().copied().max().unwrap_or(0);
70        let min_cardinality = cardinalities.iter().copied().min().unwrap_or(0);
71
72        // Compute arity distribution
73        let mut arity_distribution = HashMap::new();
74        let mut predicates_by_arity: HashMap<usize, Vec<String>> = HashMap::new();
75        for (name, pred) in &table.predicates {
76            let arity = pred.arg_domains.len();
77            *arity_distribution.entry(arity).or_insert(0) += 1;
78            predicates_by_arity
79                .entry(arity)
80                .or_default()
81                .push(name.clone());
82        }
83
84        // Compute domain usage frequency
85        let mut domain_usage_frequency = HashMap::new();
86        for pred in table.predicates.values() {
87            for domain in &pred.arg_domains {
88                *domain_usage_frequency.entry(domain.clone()).or_insert(0) += 1;
89            }
90        }
91
92        // Find unused domains
93        let used_domains: HashSet<_> = domain_usage_frequency.keys().cloned().collect();
94        let unused_domains: Vec<String> = table
95            .domains
96            .keys()
97            .filter(|d| !used_domains.contains(*d))
98            .cloned()
99            .collect();
100
101        Self {
102            domain_count,
103            predicate_count,
104            variable_count,
105            total_cardinality,
106            avg_cardinality,
107            max_cardinality,
108            min_cardinality,
109            arity_distribution,
110            domain_usage_frequency,
111            unused_domains,
112            predicates_by_arity,
113        }
114    }
115
116    /// Get the most frequently used domains.
117    pub fn most_used_domains(&self, n: usize) -> Vec<(String, usize)> {
118        let mut usage: Vec<_> = self.domain_usage_frequency.iter().collect();
119        usage.sort_by(|a, b| b.1.cmp(a.1));
120        usage
121            .into_iter()
122            .take(n)
123            .map(|(d, &count)| (d.clone(), count))
124            .collect()
125    }
126
127    /// Get the least frequently used domains (excluding unused).
128    pub fn least_used_domains(&self, n: usize) -> Vec<(String, usize)> {
129        let mut usage: Vec<_> = self.domain_usage_frequency.iter().collect();
130        usage.sort_by(|a, b| a.1.cmp(b.1));
131        usage
132            .into_iter()
133            .take(n)
134            .map(|(d, &count)| (d.clone(), count))
135            .collect()
136    }
137
138    /// Calculate schema complexity score.
139    ///
140    /// This is a heuristic metric based on:
141    /// - Number of domains
142    /// - Number of predicates
143    /// - Arity distribution
144    /// - Domain usage patterns
145    pub fn complexity_score(&self) -> f64 {
146        let domain_factor = self.domain_count as f64;
147        let predicate_factor = self.predicate_count as f64;
148        let arity_diversity = self.arity_distribution.len() as f64;
149        let usage_variance = self.compute_usage_variance();
150
151        // Weighted combination
152        domain_factor * 0.2 + predicate_factor * 0.3 + arity_diversity * 0.2 + usage_variance * 0.3
153    }
154
155    fn compute_usage_variance(&self) -> f64 {
156        if self.domain_usage_frequency.is_empty() {
157            return 0.0;
158        }
159
160        let counts: Vec<f64> = self
161            .domain_usage_frequency
162            .values()
163            .map(|&c| c as f64)
164            .collect();
165        let mean = counts.iter().sum::<f64>() / counts.len() as f64;
166        let variance = counts.iter().map(|c| (c - mean).powi(2)).sum::<f64>() / counts.len() as f64;
167        variance.sqrt()
168    }
169}
170
171/// Schema recommendations based on analysis.
172#[derive(Clone, Debug)]
173pub struct SchemaRecommendations {
174    /// Issues found in the schema.
175    pub issues: Vec<SchemaIssue>,
176    /// Suggestions for improvement.
177    pub suggestions: Vec<String>,
178}
179
/// Types of schema issues that can be detected.
///
/// The `Display` output is the same text as [`SchemaIssue::description`], so
/// an issue can be used directly in `format!`/`println!`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum SchemaIssue {
    /// Domain is never used in any predicate.
    UnusedDomain(String),
    /// Domain has zero cardinality.
    ZeroCardinalityDomain(String),
    /// Very high cardinality that might cause performance issues.
    /// Carries the domain name and its cardinality.
    HighCardinalityDomain(String, usize),
    /// Predicate with unusually high arity.
    /// Carries the predicate name and its arity.
    HighArityPredicate(String, usize),
    /// No predicates defined.
    NoPredicates,
    /// No domains defined.
    NoDomains,
}

impl SchemaIssue {
    /// Get a human-readable description of the issue.
    ///
    /// Equivalent to `self.to_string()`.
    pub fn description(&self) -> String {
        self.to_string()
    }

    /// Get the severity level (1=info, 2=warning, 3=error).
    pub fn severity(&self) -> u8 {
        match self {
            Self::UnusedDomain(_) => 1,
            Self::ZeroCardinalityDomain(_) => 2,
            Self::HighCardinalityDomain(_, _) => 1,
            Self::HighArityPredicate(_, _) => 1,
            Self::NoPredicates => 2,
            Self::NoDomains => 3,
        }
    }
}

impl std::fmt::Display for SchemaIssue {
    /// Writes the human-readable description of the issue.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnusedDomain(name) => {
                write!(f, "Domain '{}' is defined but never used", name)
            }
            Self::ZeroCardinalityDomain(name) => {
                write!(f, "Domain '{}' has zero cardinality", name)
            }
            Self::HighCardinalityDomain(name, card) => write!(
                f,
                "Domain '{}' has very high cardinality ({}), which may impact performance",
                name, card
            ),
            Self::HighArityPredicate(name, arity) => write!(
                f,
                "Predicate '{}' has high arity ({}), consider decomposition",
                name, arity
            ),
            Self::NoPredicates => f.write_str("Schema has no predicates defined"),
            Self::NoDomains => f.write_str("Schema has no domains defined"),
        }
    }
}
234
235/// Analyzer for generating schema recommendations.
236pub struct SchemaAnalyzer;
237
238impl SchemaAnalyzer {
239    /// Analyze a symbol table and generate recommendations.
240    ///
241    /// # Example
242    ///
243    /// ```rust
244    /// use tensorlogic_adapters::{SymbolTable, DomainInfo, SchemaAnalyzer};
245    ///
246    /// let mut table = SymbolTable::new();
247    /// table.add_domain(DomainInfo::new("Person", 0)).unwrap();
248    ///
249    /// let recommendations = SchemaAnalyzer::analyze(&table);
250    /// assert!(!recommendations.issues.is_empty());
251    /// ```
252    pub fn analyze(table: &SymbolTable) -> SchemaRecommendations {
253        let mut issues = Vec::new();
254        let mut suggestions = Vec::new();
255
256        // Check for no domains
257        if table.domains.is_empty() {
258            issues.push(SchemaIssue::NoDomains);
259            suggestions.push("Define at least one domain for your schema".to_string());
260            return SchemaRecommendations {
261                issues,
262                suggestions,
263            };
264        }
265
266        // Check for no predicates
267        if table.predicates.is_empty() {
268            issues.push(SchemaIssue::NoPredicates);
269            suggestions.push("Define predicates to enable reasoning over your domains".to_string());
270        }
271
272        // Analyze domains
273        let stats = SchemaStatistics::compute(table);
274        const HIGH_CARDINALITY_THRESHOLD: usize = 100_000;
275
276        for (name, domain) in &table.domains {
277            // Check for zero cardinality
278            if domain.cardinality == 0 {
279                issues.push(SchemaIssue::ZeroCardinalityDomain(name.clone()));
280            }
281
282            // Check for high cardinality
283            if domain.cardinality > HIGH_CARDINALITY_THRESHOLD {
284                issues.push(SchemaIssue::HighCardinalityDomain(
285                    name.clone(),
286                    domain.cardinality,
287                ));
288            }
289
290            // Check for unused domains
291            if stats.unused_domains.contains(name) {
292                issues.push(SchemaIssue::UnusedDomain(name.clone()));
293                suggestions.push(format!(
294                    "Consider removing unused domain '{}' or defining predicates that use it",
295                    name
296                ));
297            }
298        }
299
300        // Analyze predicates
301        const HIGH_ARITY_THRESHOLD: usize = 5;
302        for (name, pred) in &table.predicates {
303            if pred.arg_domains.len() > HIGH_ARITY_THRESHOLD {
304                issues.push(SchemaIssue::HighArityPredicate(
305                    name.clone(),
306                    pred.arg_domains.len(),
307                ));
308                suggestions.push(format!(
309                    "Consider decomposing high-arity predicate '{}' into smaller predicates",
310                    name
311                ));
312            }
313        }
314
315        // General suggestions
316        if stats.domain_count > 0 && stats.predicate_count == 0 {
317            suggestions
318                .push("Add predicates to establish relationships between your domains".to_string());
319        }
320
321        if stats.variable_count == 0 && stats.predicate_count > 0 {
322            suggestions
323                .push("Consider binding variables to enable quantification in rules".to_string());
324        }
325
326        SchemaRecommendations {
327            issues,
328            suggestions,
329        }
330    }
331}
332
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{DomainInfo, PredicateInfo};

    /// Registers a domain with the given cardinality, panicking on rejection.
    fn declare_domain(table: &mut SymbolTable, name: &str, cardinality: usize) {
        table
            .add_domain(DomainInfo::new(name, cardinality))
            .unwrap();
    }

    /// Registers a predicate over the named argument domains, panicking on rejection.
    fn declare_predicate(table: &mut SymbolTable, name: &str, arg_domains: &[&str]) {
        let args: Vec<String> = arg_domains.iter().map(|d| d.to_string()).collect();
        table.add_predicate(PredicateInfo::new(name, args)).unwrap();
    }

    #[test]
    fn test_statistics_empty_table() {
        let stats = SchemaStatistics::compute(&SymbolTable::new());

        assert_eq!(stats.domain_count, 0);
        assert_eq!(stats.predicate_count, 0);
        assert_eq!(stats.variable_count, 0);
    }

    #[test]
    fn test_statistics_with_data() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "Person", 100);
        declare_domain(&mut table, "Location", 50);
        declare_predicate(&mut table, "knows", &["Person", "Person"]);
        declare_predicate(&mut table, "at", &["Person", "Location"]);

        let stats = SchemaStatistics::compute(&table);

        // Counts and cardinality aggregates.
        assert_eq!(stats.domain_count, 2);
        assert_eq!(stats.predicate_count, 2);
        assert_eq!(stats.total_cardinality, 150);
        assert_eq!(stats.avg_cardinality, 75.0);
        assert_eq!(stats.max_cardinality, 100);
        assert_eq!(stats.min_cardinality, 50);

        // Domain usage: "Person" appears three times across both predicates.
        assert_eq!(stats.domain_usage_frequency.get("Person"), Some(&3));
        assert_eq!(stats.domain_usage_frequency.get("Location"), Some(&1));
        assert!(stats.unused_domains.is_empty());
    }

    #[test]
    fn test_unused_domains() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "Person", 100);
        declare_domain(&mut table, "Unused", 50);
        declare_predicate(&mut table, "age", &["Person"]);

        let stats = SchemaStatistics::compute(&table);
        assert_eq!(stats.unused_domains, vec!["Unused"]);
    }

    #[test]
    fn test_arity_distribution() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "D", 10);
        declare_predicate(&mut table, "p1", &["D"]);
        declare_predicate(&mut table, "p2", &["D", "D"]);
        declare_predicate(&mut table, "p3", &["D", "D"]);

        let stats = SchemaStatistics::compute(&table);
        assert_eq!(stats.arity_distribution.get(&1), Some(&1));
        assert_eq!(stats.arity_distribution.get(&2), Some(&2));
    }

    #[test]
    fn test_analyzer_no_domains() {
        let recs = SchemaAnalyzer::analyze(&SymbolTable::new());

        assert!(!recs.issues.is_empty());
        assert!(recs.issues.contains(&SchemaIssue::NoDomains));
    }

    #[test]
    fn test_analyzer_zero_cardinality() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "Person", 0);

        let recs = SchemaAnalyzer::analyze(&table);
        let expected = SchemaIssue::ZeroCardinalityDomain("Person".to_string());
        assert!(recs.issues.contains(&expected));
    }

    #[test]
    fn test_analyzer_unused_domain() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "Used", 10);
        declare_domain(&mut table, "Unused", 10);
        declare_predicate(&mut table, "p", &["Used"]);

        let recs = SchemaAnalyzer::analyze(&table);
        let expected = SchemaIssue::UnusedDomain("Unused".to_string());
        assert!(recs.issues.contains(&expected));
    }

    #[test]
    fn test_analyzer_high_arity() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "D", 10);
        // 10-arity predicate
        declare_predicate(&mut table, "complex", &["D"; 10]);

        let recs = SchemaAnalyzer::analyze(&table);
        let flagged = recs
            .issues
            .iter()
            .any(|issue| matches!(issue, SchemaIssue::HighArityPredicate(_, _)));
        assert!(flagged);
    }

    #[test]
    fn test_complexity_score() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "Person", 100);
        declare_predicate(&mut table, "p", &["Person"]);

        let score = SchemaStatistics::compute(&table).complexity_score();
        assert!(score > 0.0);
    }

    #[test]
    fn test_most_used_domains() {
        let mut table = SymbolTable::new();
        declare_domain(&mut table, "A", 10);
        declare_domain(&mut table, "B", 10);
        declare_predicate(&mut table, "p1", &["A", "A"]);
        declare_predicate(&mut table, "p2", &["B"]);

        let most_used = SchemaStatistics::compute(&table).most_used_domains(1);
        assert_eq!(most_used[0].0, "A");
        assert_eq!(most_used[0].1, 2);
    }
}