Skip to main content

lance_graph/datafusion_planner/
analysis.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Query Analysis Phase
5//!
6//! Assigns unique IDs to relationship instances and collects variable-to-label mappings
7
8use crate::ast::RelationshipDirection;
9use crate::error::Result;
10use crate::logical_plan::*;
11use std::collections::{HashMap, HashSet};
12
13/// Analysis result containing all metadata needed for planning
14#[derive(Debug, Clone, Default)]
15pub struct QueryAnalysis {
16    /// Variable → Label mappings (e.g., "n" → "Person")
17    pub var_to_label: HashMap<String, String>,
18
19    /// Relationship instances with unique IDs to avoid column conflicts
20    pub relationship_instances: Vec<RelationshipInstance>,
21
22    /// All datasets required for this query
23    pub required_datasets: HashSet<String>,
24}
25
26/// Represents a single relationship expansion with a unique instance ID
27#[derive(Debug, Clone)]
28pub struct RelationshipInstance {
29    pub id: usize, // Unique instance number
30    pub rel_type: String,
31    pub source_var: String,
32    pub target_var: String,
33    pub direction: RelationshipDirection,
34    pub alias: String, // e.g., "friend_of_1", "friend_of_2"
35}
36
37/// Planning context that tracks state during plan building
38pub struct PlanningContext<'a> {
39    pub analysis: &'a QueryAnalysis,
40    pub(crate) relationship_instance_idx: HashMap<String, usize>,
41}
42
43impl<'a> PlanningContext<'a> {
44    pub fn new(analysis: &'a QueryAnalysis) -> Self {
45        Self {
46            analysis,
47            relationship_instance_idx: HashMap::new(),
48        }
49    }
50
51    /// Get the next relationship instance for a given type (returns a clone)
52    pub fn next_relationship_instance(&mut self, rel_type: &str) -> Result<RelationshipInstance> {
53        let idx = self
54            .relationship_instance_idx
55            .entry(rel_type.to_string())
56            .and_modify(|i| *i += 1)
57            .or_insert(0);
58
59        self.analysis
60            .relationship_instances
61            .iter()
62            .filter(|r| r.rel_type == rel_type)
63            .nth(*idx)
64            .cloned()
65            .ok_or_else(|| crate::error::GraphError::PlanError {
66                message: format!("No relationship instance found for: {}", rel_type),
67                location: snafu::Location::new(file!(), line!(), column!()),
68            })
69    }
70}
71
72/// Analyze the logical plan to extract metadata
73pub fn analyze(logical_plan: &LogicalOperator) -> Result<QueryAnalysis> {
74    let mut analysis = QueryAnalysis::default();
75    let mut rel_counter: HashMap<String, usize> = HashMap::new();
76
77    analyze_operator(logical_plan, &mut analysis, &mut rel_counter)?;
78    Ok(analysis)
79}
80
81/// Recursively analyze operators to build QueryAnalysis
82fn analyze_operator(
83    op: &LogicalOperator,
84    analysis: &mut QueryAnalysis,
85    rel_counter: &mut HashMap<String, usize>,
86) -> Result<()> {
87    match op {
88        LogicalOperator::ScanByLabel {
89            variable, label, ..
90        } => {
91            analysis
92                .var_to_label
93                .insert(variable.clone(), label.clone());
94            analysis.required_datasets.insert(label.clone());
95        }
96        LogicalOperator::Expand {
97            input,
98            source_variable,
99            target_variable,
100            target_label,
101            relationship_types,
102            direction,
103            relationship_variable,
104            ..
105        } => {
106            // Recursively analyze input first
107            analyze_operator(input, analysis, rel_counter)?;
108
109            // Register the target variable with its label from the logical plan
110            analysis
111                .var_to_label
112                .insert(target_variable.clone(), target_label.clone());
113
114            // Assign unique instance ID for this relationship
115            if let Some(rel_type) = relationship_types.first() {
116                let instance_id = rel_counter
117                    .entry(rel_type.clone())
118                    .and_modify(|c| *c += 1)
119                    .or_insert(1);
120
121                // Use relationship variable if provided, otherwise use type_instanceId
122                let alias = if let Some(rel_var) = relationship_variable {
123                    rel_var.clone()
124                } else {
125                    format!("{}_{}", rel_type.to_lowercase(), instance_id)
126                };
127
128                analysis.relationship_instances.push(RelationshipInstance {
129                    id: *instance_id,
130                    rel_type: rel_type.clone(),
131                    source_var: source_variable.clone(),
132                    target_var: target_variable.clone(),
133                    direction: direction.clone(),
134                    alias,
135                });
136
137                analysis.required_datasets.insert(rel_type.clone());
138            }
139        }
140        LogicalOperator::VariableLengthExpand {
141            input,
142            source_variable,
143            target_variable,
144            relationship_types,
145            direction,
146            relationship_variable,
147            min_length,
148            max_length,
149            ..
150        } => {
151            // Recursively analyze input first
152            analyze_operator(input, analysis, rel_counter)?;
153
154            // Infer target variable's label from source variable
155            // For (a:Person)-[:KNOWS]->(b), b also gets label Person
156            if let Some(source_label) = analysis.var_to_label.get(source_variable).cloned() {
157                analysis
158                    .var_to_label
159                    .insert(target_variable.clone(), source_label);
160            }
161
162            // For variable-length paths, register multiple instances (one per hop)
163            // We need to register instances for all possible hop counts
164            if let Some(rel_type) = relationship_types.first() {
165                let max_hops = max_length.unwrap_or(crate::MAX_VARIABLE_LENGTH_HOPS);
166                let min_hops = min_length.unwrap_or(1).max(1);
167
168                // Register instances for each hop count we'll generate
169                for hop_count in min_hops..=max_hops {
170                    for _ in 0..hop_count {
171                        let instance_id = rel_counter
172                            .entry(rel_type.clone())
173                            .and_modify(|c| *c += 1)
174                            .or_insert(1);
175
176                        // Use relationship variable if provided, otherwise use type_instanceId
177                        let alias = if let Some(rel_var) = relationship_variable {
178                            format!("{}_{}", rel_var, instance_id)
179                        } else {
180                            format!("{}_{}", rel_type.to_lowercase(), instance_id)
181                        };
182
183                        analysis.relationship_instances.push(RelationshipInstance {
184                            id: *instance_id,
185                            rel_type: rel_type.clone(),
186                            source_var: source_variable.clone(),
187                            target_var: target_variable.clone(),
188                            direction: direction.clone(),
189                            alias,
190                        });
191                    }
192                }
193
194                analysis.required_datasets.insert(rel_type.clone());
195            }
196        }
197        LogicalOperator::Filter { input, .. }
198        | LogicalOperator::Project { input, .. }
199        | LogicalOperator::Sort { input, .. }
200        | LogicalOperator::Limit { input, .. }
201        | LogicalOperator::Offset { input, .. }
202        | LogicalOperator::Distinct { input } => {
203            analyze_operator(input, analysis, rel_counter)?;
204        }
205        LogicalOperator::Join { left, right, .. } => {
206            analyze_operator(left, analysis, rel_counter)?;
207            analyze_operator(right, analysis, rel_counter)?;
208        }
209        LogicalOperator::Unwind { input, .. } => {
210            if let Some(op) = input {
211                analyze_operator(op, analysis, rel_counter)?;
212            }
213        }
214    }
215    Ok(())
216}
217
218#[cfg(test)]
219mod tests {
220    use super::*;
221    use crate::ast::RelationshipDirection;
222    use crate::logical_plan::LogicalOperator;
223    use std::collections::HashMap;
224
225    #[test]
226    fn test_query_analysis_single_hop() {
227        // Test that analysis correctly identifies relationship instances
228        let scan_a = LogicalOperator::ScanByLabel {
229            variable: "a".to_string(),
230            label: "Person".to_string(),
231            properties: Default::default(),
232        };
233        let expand = LogicalOperator::Expand {
234            input: Box::new(scan_a),
235            source_variable: "a".to_string(),
236            target_variable: "b".to_string(),
237            target_label: "Person".to_string(),
238            relationship_types: vec!["KNOWS".to_string()],
239            direction: RelationshipDirection::Outgoing,
240            relationship_variable: None,
241            properties: Default::default(),
242            target_properties: Default::default(),
243        };
244
245        let cfg = crate::config::GraphConfig::builder()
246            .with_node_label("Person", "id")
247            .with_relationship("KNOWS", "src_id", "dst_id")
248            .build()
249            .unwrap();
250        let _planner = crate::datafusion_planner::DataFusionPlanner::new(cfg);
251        let analysis = analyze(&expand).unwrap();
252
253        // Should have two variable mappings: a and b both map to Person
254        assert_eq!(analysis.var_to_label.len(), 2);
255        assert_eq!(analysis.var_to_label.get("a"), Some(&"Person".to_string()));
256        assert_eq!(analysis.var_to_label.get("b"), Some(&"Person".to_string()));
257
258        // Should have one relationship instance
259        assert_eq!(analysis.relationship_instances.len(), 1);
260        assert_eq!(analysis.relationship_instances[0].rel_type, "KNOWS");
261        assert_eq!(analysis.relationship_instances[0].alias, "knows_1");
262        assert_eq!(analysis.relationship_instances[0].id, 1);
263    }
264
265    #[test]
266    fn test_query_analysis_two_hop() {
267        // Test that two-hop queries get unique relationship instances
268        let scan_a = LogicalOperator::ScanByLabel {
269            variable: "a".to_string(),
270            label: "Person".to_string(),
271            properties: Default::default(),
272        };
273        let expand1 = LogicalOperator::Expand {
274            input: Box::new(scan_a),
275            source_variable: "a".to_string(),
276            target_variable: "b".to_string(),
277            target_label: "Person".to_string(),
278            relationship_types: vec!["KNOWS".to_string()],
279            direction: RelationshipDirection::Outgoing,
280            relationship_variable: None,
281            properties: Default::default(),
282            target_properties: Default::default(),
283        };
284        let expand2 = LogicalOperator::Expand {
285            input: Box::new(expand1),
286            source_variable: "b".to_string(),
287            target_variable: "c".to_string(),
288            target_label: "Person".to_string(),
289            relationship_types: vec!["KNOWS".to_string()],
290            direction: RelationshipDirection::Outgoing,
291            relationship_variable: None,
292            properties: Default::default(),
293            target_properties: Default::default(),
294        };
295
296        let cfg = crate::config::GraphConfig::builder()
297            .with_node_label("Person", "id")
298            .with_relationship("KNOWS", "src_id", "dst_id")
299            .build()
300            .unwrap();
301        let _planner = crate::datafusion_planner::DataFusionPlanner::new(cfg);
302        let analysis = analyze(&expand2).unwrap();
303
304        // Should have two relationship instances with UNIQUE aliases
305        assert_eq!(analysis.relationship_instances.len(), 2);
306        assert_eq!(analysis.relationship_instances[0].alias, "knows_1");
307        assert_eq!(analysis.relationship_instances[1].alias, "knows_2");
308
309        // Both should be KNOWS but with different IDs
310        assert_eq!(analysis.relationship_instances[0].rel_type, "KNOWS");
311        assert_eq!(analysis.relationship_instances[1].rel_type, "KNOWS");
312        assert_eq!(analysis.relationship_instances[0].id, 1);
313        assert_eq!(analysis.relationship_instances[1].id, 2);
314    }
315
316    #[test]
317    fn test_varlength_expand_analysis_registers_instances() {
318        // Test that analysis phase correctly registers multiple relationship instances
319        let scan_a = LogicalOperator::ScanByLabel {
320            variable: "a".to_string(),
321            label: "Person".to_string(),
322            properties: Default::default(),
323        };
324        let vlexpand = LogicalOperator::VariableLengthExpand {
325            input: Box::new(scan_a),
326            source_variable: "a".to_string(),
327            target_variable: "b".to_string(),
328            relationship_types: vec!["KNOWS".to_string()],
329            direction: RelationshipDirection::Outgoing,
330            relationship_variable: None,
331            min_length: Some(1),
332            max_length: Some(2),
333            target_properties: HashMap::new(),
334        };
335
336        let cfg = crate::config::GraphConfig::builder()
337            .with_node_label("Person", "id")
338            .with_relationship("KNOWS", "src_person_id", "dst_person_id")
339            .build()
340            .unwrap();
341        let _planner = crate::datafusion_planner::DataFusionPlanner::new(cfg);
342        let analysis = analyze(&vlexpand).unwrap();
343
344        // For *1..2, should register 1 + 2 = 3 instances
345        let knows_instances: Vec<_> = analysis
346            .relationship_instances
347            .iter()
348            .filter(|r| r.rel_type == "KNOWS")
349            .collect();
350
351        assert_eq!(
352            knows_instances.len(),
353            3,
354            "Expected 3 KNOWS instances (1 for 1-hop + 2 for 2-hop)"
355        );
356    }
357
358    #[test]
359    fn test_planning_context_tracks_instances() {
360        // Test that PlanningContext correctly iterates through instances
361        let instances = vec![
362            RelationshipInstance {
363                id: 1,
364                rel_type: "KNOWS".to_string(),
365                source_var: "a".to_string(),
366                target_var: "b".to_string(),
367                direction: RelationshipDirection::Outgoing,
368                alias: "knows_1".to_string(),
369            },
370            RelationshipInstance {
371                id: 2,
372                rel_type: "KNOWS".to_string(),
373                source_var: "b".to_string(),
374                target_var: "c".to_string(),
375                direction: RelationshipDirection::Outgoing,
376                alias: "knows_2".to_string(),
377            },
378        ];
379
380        let analysis = QueryAnalysis {
381            var_to_label: HashMap::new(),
382            relationship_instances: instances,
383            required_datasets: HashSet::new(),
384        };
385
386        let mut ctx = PlanningContext::new(&analysis);
387
388        // First call should return first instance
389        let inst1 = ctx.next_relationship_instance("KNOWS").unwrap();
390        assert_eq!(inst1.alias, "knows_1");
391
392        // Second call should return second instance
393        let inst2 = ctx.next_relationship_instance("KNOWS").unwrap();
394        assert_eq!(inst2.alias, "knows_2");
395
396        // Third call should error (no more instances)
397        assert!(ctx.next_relationship_instance("KNOWS").is_err());
398    }
399}