graphlite 0.0.1

GraphLite - A lightweight ISO GQL Graph Database
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
// Copyright (c) 2024-2025 DeepGraph Inc.
// SPDX-License-Identifier: Apache-2.0
//
//! Cost estimation for pattern optimization strategies
//!
//! This module provides cost estimation functionality for different pattern execution strategies.
//! It helps the query planner choose between path traversal, joins, and Cartesian products.

use crate::ast::ast::PathPattern;
use crate::plan::pattern_optimization::pattern_analysis::{
    JoinStep, LinearPath, PatternPlanStrategy,
};
use std::collections::HashMap;

/// Statistics about graph data used for cost estimation
///
/// **Planned Feature** - Graph statistics collection for cost estimation
/// See ROADMAP.md: "Pattern Optimization System"
/// Target: v0.3.0
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct GraphStatistics {
    /// Number of nodes per label
    pub node_counts: HashMap<String, u64>,
    /// Number of relationships per type
    pub relationship_counts: HashMap<String, u64>,
    /// Average relationships per node by type
    pub avg_relationships_per_node: HashMap<String, f64>,
    /// Selectivity estimates for common patterns
    pub pattern_selectivity: HashMap<String, f64>,
}

impl Default for GraphStatistics {
    fn default() -> Self {
        Self {
            node_counts: HashMap::new(),
            relationship_counts: HashMap::new(),
            avg_relationships_per_node: HashMap::new(),
            pattern_selectivity: HashMap::new(),
        }
    }
}

/// Cost estimates for different execution strategies
///
/// **Planned Feature** - Execution cost estimation for strategy selection
/// See ROADMAP.md: "Pattern Optimization System"
/// Target: v0.3.0
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub struct ExecutionCost {
    /// Estimated number of operations
    pub operations: f64,
    /// Estimated memory usage in bytes
    pub memory_bytes: u64,
    /// Estimated intermediate result size
    pub intermediate_results: u64,
    /// Confidence level (0.0 to 1.0) of the estimate
    pub confidence: f64,
}

impl ExecutionCost {
    pub fn new(
        operations: f64,
        memory_bytes: u64,
        intermediate_results: u64,
        confidence: f64,
    ) -> Self {
        Self {
            operations,
            memory_bytes,
            intermediate_results,
            confidence,
        }
    }

    /// Calculate total cost score for comparison (lower is better)
    pub fn total_cost(&self) -> f64 {
        // Weight operations heavily, include memory and intermediate results
        self.operations
            + (self.memory_bytes as f64 * 0.001)
            + (self.intermediate_results as f64 * 0.01)
    }

    /// Compare costs considering confidence levels
    #[allow(dead_code)] // ROADMAP v0.4.0 - Cost comparison for selecting optimal execution strategy
    pub fn is_better_than(&self, other: &ExecutionCost) -> bool {
        let self_adjusted = self.total_cost() / self.confidence.max(0.1);
        let other_adjusted = other.total_cost() / other.confidence.max(0.1);
        self_adjusted < other_adjusted
    }
}

/// Cost estimator for pattern execution strategies
///
/// **Planned Feature** - Cost-based optimizer for pattern execution
/// See ROADMAP.md: "Pattern Optimization System"
/// Target: v0.3.0
#[allow(dead_code)]
pub struct PlanCostEstimator {
    statistics: GraphStatistics,
    /// Cache of previously calculated costs
    cost_cache: HashMap<String, ExecutionCost>,
}

impl PlanCostEstimator {
    pub fn new(statistics: GraphStatistics) -> Self {
        Self {
            statistics,
            cost_cache: HashMap::new(),
        }
    }

    /// Estimate cost for a given pattern execution strategy
    pub fn estimate_cost(&mut self, strategy: &PatternPlanStrategy) -> ExecutionCost {
        match strategy {
            PatternPlanStrategy::PathTraversal(linear_path) => {
                self.estimate_path_traversal_cost(linear_path)
            }
            PatternPlanStrategy::HashJoin {
                patterns,
                join_order,
                ..
            } => self.estimate_hash_join_cost(patterns, join_order),
            PatternPlanStrategy::NestedLoopJoin { patterns, .. } => {
                self.estimate_nested_loop_join_cost(patterns)
            }
            PatternPlanStrategy::CartesianProduct { patterns, .. } => {
                self.estimate_cartesian_product_cost(patterns)
            }
        }
    }

    /// Estimate cost for path traversal strategy
    fn estimate_path_traversal_cost(&self, linear_path: &LinearPath) -> ExecutionCost {
        let mut total_operations = 0.0;
        let mut total_memory = 0u64;
        let mut current_cardinality = 1.0;

        for step in &linear_path.steps {
            // Estimate starting nodes for this step
            let start_nodes = if total_operations == 0.0 {
                // First step - use node count statistics
                // For now, use a default estimate since we don't have direct access to node labels
                1000.0
            } else {
                current_cardinality
            };

            // Estimate relationship traversal cost
            // Get the first label as the relationship type, or use default
            let default_type = "DEFAULT".to_string();
            let relationship_type = step.relationship.labels.first().unwrap_or(&default_type);
            let avg_relationships = self.get_avg_relationships(relationship_type).unwrap_or(2.0);
            let relationship_cost = start_nodes * avg_relationships;

            total_operations += relationship_cost;
            current_cardinality = relationship_cost;

            // Memory for intermediate results (assume 1KB per result)
            total_memory += (current_cardinality as u64) * 1024;
        }

        ExecutionCost::new(
            total_operations,
            total_memory,
            current_cardinality as u64,
            0.8, // High confidence for path traversal
        )
    }

    /// Estimate cost for hash join strategy
    fn estimate_hash_join_cost(
        &self,
        patterns: &[PathPattern],
        join_order: &[JoinStep],
    ) -> ExecutionCost {
        let mut total_operations = 0.0;
        let mut total_memory = 0u64;
        let mut intermediate_size = 0u64;

        // Estimate cost for each pattern individually
        for pattern in patterns {
            let pattern_cost = self.estimate_single_pattern_cost(pattern);
            total_operations += pattern_cost.operations;
            total_memory += pattern_cost.memory_bytes;
            intermediate_size += pattern_cost.intermediate_results;
        }

        // Add join costs
        for _join_step in join_order {
            // Hash join cost is approximately O(n + m) where n, m are input sizes
            let join_cost = intermediate_size as f64 * 1.5;
            total_operations += join_cost;

            // Hash table memory overhead
            total_memory += intermediate_size * 512; // 512 bytes per hash entry
        }

        ExecutionCost::new(
            total_operations,
            total_memory,
            intermediate_size,
            0.7, // Good confidence for hash joins
        )
    }

    /// Estimate cost for nested loop join strategy
    fn estimate_nested_loop_join_cost(&self, patterns: &[PathPattern]) -> ExecutionCost {
        if patterns.len() < 2 {
            return self.estimate_single_pattern_cost(&patterns[0]);
        }

        // Nested loop join is O(n * m) - very expensive
        let mut total_operations = 1.0;
        let mut total_memory = 0u64;

        for pattern in patterns {
            let pattern_cost = self.estimate_single_pattern_cost(pattern);
            total_operations *= pattern_cost.intermediate_results as f64;
            total_memory += pattern_cost.memory_bytes;
        }

        ExecutionCost::new(
            total_operations,
            total_memory,
            total_operations as u64,
            0.9, // High confidence but terrible performance
        )
    }

    /// Estimate cost for Cartesian product strategy
    fn estimate_cartesian_product_cost(&self, patterns: &[PathPattern]) -> ExecutionCost {
        let mut total_operations = 1.0;
        let mut total_memory = 0u64;
        let mut result_cardinality = 1u64;

        for pattern in patterns {
            let pattern_cost = self.estimate_single_pattern_cost(pattern);
            total_operations *= pattern_cost.intermediate_results as f64;
            total_memory += pattern_cost.memory_bytes;
            result_cardinality *= pattern_cost.intermediate_results;
        }

        ExecutionCost::new(
            total_operations,
            total_memory,
            result_cardinality,
            0.95, // Very high confidence - we know it's expensive
        )
    }

    /// Estimate cost for a single pattern
    fn estimate_single_pattern_cost(&self, pattern: &PathPattern) -> ExecutionCost {
        // Simple estimation based on pattern complexity
        let base_operations = match pattern.elements.len() {
            0 | 1 => 100.0,
            2 => 1000.0,
            3 => 10000.0,
            _ => 100000.0,
        };

        // Estimate selectivity based on filters
        let selectivity = self.estimate_pattern_selectivity(pattern);
        let operations = base_operations * selectivity;
        let results = (operations * 0.1) as u64;

        ExecutionCost::new(
            operations,
            results * 1024, // 1KB per result
            results,
            0.6, // Medium confidence for single patterns
        )
    }

    /// Estimate selectivity of a pattern (0.0 to 1.0)
    fn estimate_pattern_selectivity(&self, _pattern: &PathPattern) -> f64 {
        // For now, use simple heuristics
        let base_selectivity = 0.1; // Assume 10% selectivity

        // Reduce selectivity based on number of constraints
        // For now, assume no additional constraints since we don't have direct access to filters
        let constraint_factor = 1.0;

        base_selectivity * constraint_factor
    }

    /// Get average relationships per node for a relationship type
    fn get_avg_relationships(&self, rel_type: &str) -> Option<f64> {
        self.statistics
            .avg_relationships_per_node
            .get(rel_type)
            .copied()
    }

    /// Update statistics with new data
    #[allow(dead_code)] // ROADMAP v0.4.0 - Statistics refresh for adaptive cost estimation
    pub fn update_statistics(&mut self, new_stats: GraphStatistics) {
        self.statistics = new_stats;
        self.cost_cache.clear(); // Clear cache when stats change
    }

    /// Clear the cost cache
    #[allow(dead_code)] // ROADMAP v0.4.0 - Cache invalidation for statistics updates
    pub fn clear_cache(&mut self) {
        self.cost_cache.clear();
    }
}

/// Statistics manager for collecting graph statistics
///
/// **Planned Feature** - Statistics collection and management for query optimization
/// See ROADMAP.md: "Pattern Optimization System"
/// Target: v0.3.0
#[allow(dead_code)]
#[derive(Debug)]
pub struct StatisticsManager {
    /// Current statistics
    statistics: GraphStatistics,
    /// Statistics update frequency
    update_threshold: u64,
    /// Counter for operations since last update
    operations_since_update: u64,
}

impl StatisticsManager {
    pub fn new() -> Self {
        Self {
            statistics: GraphStatistics::default(),
            update_threshold: 1000,
            operations_since_update: 0,
        }
    }

    /// Get current statistics
    #[allow(dead_code)] // ROADMAP v0.4.0 - Statistics access for cost-based optimization
    pub fn get_statistics(&self) -> &GraphStatistics {
        &self.statistics
    }

    /// Record a query execution for statistics
    #[allow(dead_code)] // ROADMAP v0.4.0 - Query profiling for statistics collection
    pub fn record_query_execution(&mut self, _query: &str, _result_count: u64) {
        self.operations_since_update += 1;

        if self.operations_since_update >= self.update_threshold {
            // In a real implementation, this would update statistics from the storage layer
            self.refresh_statistics();
        }
    }

    /// Refresh statistics from storage (placeholder)
    #[allow(dead_code)] // ROADMAP v0.4.0 - Automatic statistics refresh from storage layer
    fn refresh_statistics(&mut self) {
        // TODO: Implement actual statistics collection from storage
        // For now, use default/estimated values

        // Reset counter
        self.operations_since_update = 0;

        // In Phase 3, this will integrate with the storage layer
        // to get actual node counts, relationship counts, etc.
    }

    /// Manually set statistics (for testing)
    #[allow(dead_code)] // ROADMAP v0.4.0 - Test utilities for cost estimation
    pub fn set_statistics(&mut self, stats: GraphStatistics) {
        self.statistics = stats;
    }

    /// Get a cost estimator with current statistics
    pub fn create_cost_estimator(&self) -> PlanCostEstimator {
        PlanCostEstimator::new(self.statistics.clone())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_execution_cost_comparison() {
        let cost1 = ExecutionCost::new(1000.0, 1024, 100, 0.8);
        let cost2 = ExecutionCost::new(2000.0, 2048, 200, 0.9);

        assert!(cost1.is_better_than(&cost2));
        assert!(!cost2.is_better_than(&cost1));
    }

    #[test]
    fn test_cost_estimator_basic() {
        let mut stats = GraphStatistics::default();
        stats.node_counts.insert("Person".to_string(), 1000);
        stats.relationship_counts.insert("KNOWS".to_string(), 500);
        stats
            .avg_relationships_per_node
            .insert("KNOWS".to_string(), 2.0);

        let mut estimator = PlanCostEstimator::new(stats);

        // Test with a Cartesian product strategy (simpler to create)
        use crate::ast::ast::{Location, PathPattern};

        let pattern = PathPattern {
            assignment: None,
            path_type: None,
            elements: vec![],
            location: Location::default(),
        };

        let strategy = PatternPlanStrategy::CartesianProduct {
            patterns: vec![pattern],
            estimated_cost: 0.0, // Use default cost value
        };
        let cost = estimator.estimate_cost(&strategy);

        assert!(cost.operations > 0.0);
        assert!(cost.confidence > 0.0);
    }

    #[test]
    fn test_statistics_manager() {
        let mut manager = StatisticsManager::new();
        let stats = manager.get_statistics();

        // Should start with empty statistics
        assert!(stats.node_counts.is_empty());
        assert!(stats.relationship_counts.is_empty());

        // Record some operations
        manager.record_query_execution("MATCH (n) RETURN n", 100);

        // Should work without errors
        let estimator = manager.create_cost_estimator();
        assert_eq!(estimator.statistics.node_counts.len(), 0);
    }
}