Skip to main content

debtmap/data_flow/
mod.rs

1//! Data flow analysis infrastructure for tracking variable mutations and state transitions.
2//!
3//! This module provides the data structures and utilities for analyzing how data flows
4//! through functions, including variable bindings, mutations, and cross-function data
5//! dependencies. The analysis is used to detect impure functions and state mutation patterns.
6//!
7//! # Key Components
8//!
9//! - **DataFlowGraph**: Tracks variable definitions, uses, and mutations
10//! - **Population utilities**: Functions for building graphs from call graph analysis
11//! - **Serialization**: Custom serialization for FunctionId-keyed maps
12
13use crate::analysis::data_flow::DataFlowAnalysis;
14use crate::priority::{call_graph::CallGraph, call_graph::FunctionId};
15use serde::{Deserialize, Serialize};
16use std::collections::{HashMap, HashSet};
17
18pub mod population;
19
20mod function_id_serde {
21    use super::*;
22    use serde::{Deserialize, Deserializer, Serialize, Serializer};
23    use std::collections::HashMap as StdHashMap;
24
25    pub fn serialize<S, V>(map: &HashMap<FunctionId, V>, serializer: S) -> Result<S::Ok, S::Error>
26    where
27        S: Serializer,
28        V: Serialize,
29    {
30        let string_map: StdHashMap<String, &V> = map
31            .iter()
32            .map(|(k, v)| (format!("{}:{}:{}", k.file.display(), k.name, k.line), v))
33            .collect();
34        string_map.serialize(serializer)
35    }
36
37    pub fn deserialize<'de, D, V>(deserializer: D) -> Result<HashMap<FunctionId, V>, D::Error>
38    where
39        D: Deserializer<'de>,
40        V: Deserialize<'de>,
41    {
42        let string_map: StdHashMap<String, V> = StdHashMap::deserialize(deserializer)?;
43        let mut result = HashMap::new();
44        for (key, value) in string_map {
45            let parts: Vec<&str> = key.rsplitn(3, ':').collect();
46            if parts.len() == 3 {
47                let func_id = FunctionId::new(
48                    parts[2].into(),
49                    parts[1].to_string(),
50                    parts[0].parse().unwrap_or(0),
51                );
52                result.insert(func_id, value);
53            }
54        }
55        Ok(result)
56    }
57}
58
59mod function_id_tuple_serde {
60    use super::*;
61    use serde::{Deserialize, Deserializer, Serialize, Serializer};
62    use std::collections::HashMap as StdHashMap;
63
64    pub fn serialize<S, V>(
65        map: &HashMap<(FunctionId, FunctionId), V>,
66        serializer: S,
67    ) -> Result<S::Ok, S::Error>
68    where
69        S: Serializer,
70        V: Serialize,
71    {
72        let string_map: StdHashMap<String, &V> = map
73            .iter()
74            .map(|((k1, k2), v)| {
75                let key = format!(
76                    "{}:{}:{}|{}:{}:{}",
77                    k1.file.display(),
78                    k1.name,
79                    k1.line,
80                    k2.file.display(),
81                    k2.name,
82                    k2.line
83                );
84                (key, v)
85            })
86            .collect();
87        string_map.serialize(serializer)
88    }
89
90    pub fn deserialize<'de, D, V>(
91        deserializer: D,
92    ) -> Result<HashMap<(FunctionId, FunctionId), V>, D::Error>
93    where
94        D: Deserializer<'de>,
95        V: Deserialize<'de>,
96    {
97        let string_map: StdHashMap<String, V> = StdHashMap::deserialize(deserializer)?;
98        let mut result = HashMap::new();
99        for (key, value) in string_map {
100            let parts: Vec<&str> = key.split('|').collect();
101            if parts.len() == 2 {
102                let parts1: Vec<&str> = parts[0].rsplitn(3, ':').collect();
103                let parts2: Vec<&str> = parts[1].rsplitn(3, ':').collect();
104                if parts1.len() == 3 && parts2.len() == 3 {
105                    let func_id1 = FunctionId::new(
106                        parts1[2].into(),
107                        parts1[1].to_string(),
108                        parts1[0].parse().unwrap_or(0),
109                    );
110                    let func_id2 = FunctionId::new(
111                        parts2[2].into(),
112                        parts2[1].to_string(),
113                        parts2[0].parse().unwrap_or(0),
114                    );
115                    result.insert((func_id1, func_id2), value);
116                }
117            }
118        }
119        Ok(result)
120    }
121}
122
123/// DataFlowGraph provides data flow analysis capabilities built on top of the CallGraph.
124/// It tracks variable dependencies, data transformations, and information flow between functions.
125#[derive(Debug, Clone, Serialize, Deserialize)]
126pub struct DataFlowGraph {
127    /// The underlying call graph that tracks function relationships
128    call_graph: CallGraph,
129    /// Variable dependencies within each function (function_id -> set of variables)
130    #[serde(with = "function_id_serde")]
131    variable_deps: HashMap<FunctionId, HashSet<String>>,
132    /// Data transformations tracked between functions
133    #[serde(with = "function_id_tuple_serde")]
134    data_transformations: HashMap<(FunctionId, FunctionId), DataTransformation>,
135    /// I/O operations detected in functions
136    #[serde(with = "function_id_serde")]
137    io_operations: HashMap<FunctionId, Vec<IoOperation>>,
138    /// Pure function analysis results
139    #[serde(with = "function_id_serde")]
140    purity_analysis: HashMap<FunctionId, PurityInfo>,
141    /// Full CFG-based data flow analysis from purity detector
142    /// Note: Skipped during serialization due to complexity
143    #[serde(skip)]
144    cfg_analysis: HashMap<FunctionId, DataFlowAnalysis>,
145    /// CFG-based data flow analysis with variable name context
146    /// Note: Skipped during serialization due to complexity
147    #[serde(skip)]
148    cfg_analysis_with_context: HashMap<FunctionId, CfgAnalysisWithContext>,
149    /// Mutation analysis (live vs dead mutations)
150    #[serde(with = "function_id_serde")]
151    mutation_analysis: HashMap<FunctionId, MutationInfo>,
152}
153
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct DataTransformation {
156    /// Variables passed from caller to callee
157    pub input_vars: Vec<String>,
158    /// Variables returned or modified
159    pub output_vars: Vec<String>,
160    /// Type of transformation (e.g., "map", "filter", "reduce")
161    pub transformation_type: String,
162}
163
164#[derive(Debug, Clone, Serialize, Deserialize)]
165pub struct IoOperation {
166    /// Type of I/O operation (file, network, console, etc.)
167    pub operation_type: String,
168    /// Variables involved in the I/O operation
169    pub variables: Vec<String>,
170    /// Line number where the operation occurs
171    pub line: usize,
172}
173
174#[derive(Debug, Clone, Serialize, Deserialize)]
175pub struct PurityInfo {
176    /// Whether the function is pure (no side effects)
177    pub is_pure: bool,
178    /// Confidence level in the purity analysis (0.0 to 1.0)
179    pub confidence: f32,
180    /// Reasons why the function may not be pure
181    pub impurity_reasons: Vec<String>,
182}
183
184/// Mutation analysis information for a function.
185/// Uses binary signals for reliability - precise counts are not guaranteed.
186#[derive(Debug, Clone, Serialize, Deserialize)]
187pub struct MutationInfo {
188    /// Whether any mutations were detected in the function
189    pub has_mutations: bool,
190    /// Best-effort list of detected mutations (may be incomplete)
191    pub detected_mutations: Vec<String>,
192}
193
194impl MutationInfo {
195    /// Create a MutationInfo indicating no mutations
196    pub fn none() -> Self {
197        Self {
198            has_mutations: false,
199            detected_mutations: Vec::new(),
200        }
201    }
202
203    /// Check if the function is pure (no mutations)
204    pub fn is_pure(&self) -> bool {
205        !self.has_mutations
206    }
207}
208
209/// CFG-based data flow analysis with variable name context for translation.
210///
211/// This wrapper combines `DataFlowAnalysis` (which uses numeric `VarId`s)
212/// with the variable name mapping from the CFG, enabling translation of
213/// VarIds back to human-readable variable names.
214///
215/// # Why This Exists
216///
217/// The CFG uses `VarId { name_id: u32, version: u32 }` for efficiency during
218/// analysis. To display results to users, we need to translate these IDs back
219/// to variable names like "buffer", "x", "result", etc.
220///
221/// # Example
222///
223/// ```ignore
224/// let cfg = ControlFlowGraph::from_block(&block);
225/// let analysis = DataFlowAnalysis::analyze(&cfg);
226/// let ctx = CfgAnalysisWithContext::from_cfg(&cfg, analysis);
227///
228/// // Translate a VarId to a name
229/// let var_id = VarId { name_id: 0, version: 0 };
230/// let name = ctx.var_name(var_id); // "x", "buffer", etc.
231/// ```
232#[derive(Debug, Clone)]
233pub struct CfgAnalysisWithContext {
234    /// The full data flow analysis results
235    pub analysis: DataFlowAnalysis,
236    /// Variable name mapping from CFG (VarId.name_id -> variable name)
237    pub var_names: Vec<String>,
238}
239
240impl CfgAnalysisWithContext {
241    /// Create from a ControlFlowGraph and its analysis
242    pub fn new(var_names: Vec<String>, analysis: DataFlowAnalysis) -> Self {
243        Self {
244            analysis,
245            var_names,
246        }
247    }
248
249    /// Translate a VarId to a variable name
250    pub fn var_name(&self, var_id: crate::analysis::data_flow::VarId) -> String {
251        self.var_names
252            .get(var_id.name_id as usize)
253            .cloned()
254            .unwrap_or_else(|| format!("unknown_{}", var_id.name_id))
255    }
256
257    /// Translate multiple VarIds to names
258    pub fn var_names_for(
259        &self,
260        var_ids: impl Iterator<Item = crate::analysis::data_flow::VarId>,
261    ) -> Vec<String> {
262        var_ids.map(|id| self.var_name(id)).collect()
263    }
264}
265
266impl DataFlowGraph {
267    pub fn new() -> Self {
268        Self {
269            call_graph: CallGraph::new(),
270            variable_deps: HashMap::new(),
271            data_transformations: HashMap::new(),
272            io_operations: HashMap::new(),
273            purity_analysis: HashMap::new(),
274            cfg_analysis: HashMap::new(),
275            cfg_analysis_with_context: HashMap::new(),
276            mutation_analysis: HashMap::new(),
277        }
278    }
279
280    /// Create a DataFlowGraph from an existing CallGraph
281    pub fn from_call_graph(call_graph: CallGraph) -> Self {
282        Self {
283            call_graph,
284            variable_deps: HashMap::new(),
285            data_transformations: HashMap::new(),
286            io_operations: HashMap::new(),
287            purity_analysis: HashMap::new(),
288            cfg_analysis: HashMap::new(),
289            cfg_analysis_with_context: HashMap::new(),
290            mutation_analysis: HashMap::new(),
291        }
292    }
293
294    /// Get the underlying call graph
295    pub fn call_graph(&self) -> &CallGraph {
296        &self.call_graph
297    }
298
299    /// Get variable dependencies for a function
300    pub fn get_variable_dependencies(&self, func_id: &FunctionId) -> Option<&HashSet<String>> {
301        self.variable_deps.get(func_id)
302    }
303
304    /// Add variable dependencies for a function
305    pub fn add_variable_dependencies(&mut self, func_id: FunctionId, variables: HashSet<String>) {
306        self.variable_deps.insert(func_id, variables);
307    }
308
309    /// Get data transformation between two functions
310    pub fn get_data_transformation(
311        &self,
312        from: &FunctionId,
313        to: &FunctionId,
314    ) -> Option<&DataTransformation> {
315        self.data_transformations.get(&(from.clone(), to.clone()))
316    }
317
318    /// Add data transformation between two functions
319    pub fn add_data_transformation(
320        &mut self,
321        from: FunctionId,
322        to: FunctionId,
323        transformation: DataTransformation,
324    ) {
325        self.data_transformations.insert((from, to), transformation);
326    }
327
328    /// Get I/O operations for a function
329    pub fn get_io_operations(&self, func_id: &FunctionId) -> Option<&Vec<IoOperation>> {
330        self.io_operations.get(func_id)
331    }
332
333    /// Add I/O operation for a function
334    pub fn add_io_operation(&mut self, func_id: FunctionId, operation: IoOperation) {
335        self.io_operations
336            .entry(func_id)
337            .or_default()
338            .push(operation);
339    }
340
341    /// Get purity information for a function
342    pub fn get_purity_info(&self, func_id: &FunctionId) -> Option<&PurityInfo> {
343        self.purity_analysis.get(func_id)
344    }
345
346    /// Set purity information for a function
347    pub fn set_purity_info(&mut self, func_id: FunctionId, purity: PurityInfo) {
348        self.purity_analysis.insert(func_id, purity);
349    }
350
351    /// Get CFG-based data flow analysis for a function
352    pub fn get_cfg_analysis(&self, func_id: &FunctionId) -> Option<&DataFlowAnalysis> {
353        self.cfg_analysis.get(func_id)
354    }
355
356    /// Set CFG-based data flow analysis for a function
357    pub fn set_cfg_analysis(&mut self, func_id: FunctionId, analysis: DataFlowAnalysis) {
358        self.cfg_analysis.insert(func_id, analysis);
359    }
360
361    /// Get mutation analysis for a function
362    pub fn get_mutation_info(&self, func_id: &FunctionId) -> Option<&MutationInfo> {
363        self.mutation_analysis.get(func_id)
364    }
365
366    /// Set mutation analysis for a function
367    pub fn set_mutation_info(&mut self, func_id: FunctionId, info: MutationInfo) {
368        self.mutation_analysis.insert(func_id, info);
369    }
370
371    /// Get CFG analysis with translation context
372    pub fn get_cfg_analysis_with_context(
373        &self,
374        func_id: &FunctionId,
375    ) -> Option<&CfgAnalysisWithContext> {
376        self.cfg_analysis_with_context.get(func_id)
377    }
378
379    /// Set CFG analysis with context
380    pub fn set_cfg_analysis_with_context(
381        &mut self,
382        func_id: FunctionId,
383        context: CfgAnalysisWithContext,
384    ) {
385        self.cfg_analysis_with_context.insert(func_id, context);
386    }
387
388    // Escape/taint analysis methods removed - not providing actionable debt signals
389
390    /// Check if a function has side effects based on data flow analysis
391    pub fn has_side_effects(&self, func_id: &FunctionId) -> bool {
392        // Check purity analysis first
393        if let Some(purity) = self.get_purity_info(func_id) {
394            return !purity.is_pure;
395        }
396
397        // Check for I/O operations
398        if let Some(io_ops) = self.get_io_operations(func_id) {
399            return !io_ops.is_empty();
400        }
401
402        // Conservative estimate: assume side effects if we don't have analysis data
403        true
404    }
405
406    /// Get all functions that may be affected by changes to the given function
407    pub fn get_downstream_dependencies(&self, func_id: &FunctionId) -> Vec<FunctionId> {
408        // Use the call graph to find functions that call this one
409        self.call_graph.get_callers(func_id)
410    }
411
412    /// Get all functions that the given function depends on
413    pub fn get_upstream_dependencies(&self, _func_id: &FunctionId) -> Vec<FunctionId> {
414        // This would need to be implemented based on the call graph structure
415        // For now, return an empty vector as a placeholder
416        Vec::new()
417    }
418
419    /// Analyze the data flow impact of modifying a function
420    pub fn analyze_modification_impact(&self, func_id: &FunctionId) -> ModificationImpact {
421        let downstream = self.get_downstream_dependencies(func_id);
422        let upstream = self.get_upstream_dependencies(func_id);
423        let has_io = self
424            .get_io_operations(func_id)
425            .is_some_and(|ops| !ops.is_empty());
426        let is_pure = self.get_purity_info(func_id).is_some_and(|p| p.is_pure);
427
428        ModificationImpact {
429            affected_functions: downstream.len(),
430            dependency_count: upstream.len(),
431            has_side_effects: has_io || !is_pure,
432            risk_level: self.calculate_risk_level(&downstream, has_io, is_pure),
433        }
434    }
435
436    fn calculate_risk_level(
437        &self,
438        downstream: &[FunctionId],
439        has_io: bool,
440        is_pure: bool,
441    ) -> RiskLevel {
442        match (downstream.len(), has_io, is_pure) {
443            (0, false, true) => RiskLevel::Low,
444            (1..=5, false, true) => RiskLevel::Medium,
445            (1..=5, true, _) => RiskLevel::High,
446            (6.., _, _) => RiskLevel::Critical,
447            _ => RiskLevel::Medium,
448        }
449    }
450}
451
452#[derive(Debug, Clone, Serialize, Deserialize)]
453pub struct ModificationImpact {
454    /// Number of functions that may be affected by changes
455    pub affected_functions: usize,
456    /// Number of functions this function depends on
457    pub dependency_count: usize,
458    /// Whether the function has side effects
459    pub has_side_effects: bool,
460    /// Overall risk level of modifying this function
461    pub risk_level: RiskLevel,
462}
463
464#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
465pub enum RiskLevel {
466    Low,
467    Medium,
468    High,
469    Critical,
470}
471
472impl Default for DataFlowGraph {
473    fn default() -> Self {
474        Self::new()
475    }
476}
477
478#[cfg(test)]
479mod tests {
480    use super::*;
481    use std::path::PathBuf;
482
483    fn create_test_function_id(name: &str) -> FunctionId {
484        FunctionId::new(PathBuf::from("test.rs"), name.to_string(), 1)
485    }
486
487    #[test]
488    fn test_data_flow_graph_creation() {
489        let graph = DataFlowGraph::new();
490        assert_eq!(graph.call_graph().node_count(), 0);
491        assert!(graph.variable_deps.is_empty());
492        assert!(graph.data_transformations.is_empty());
493    }
494
495    #[test]
496    fn test_variable_dependencies() {
497        let mut graph = DataFlowGraph::new();
498        let func_id = create_test_function_id("test_func");
499
500        let mut variables = HashSet::new();
501        variables.insert("x".to_string());
502        variables.insert("y".to_string());
503
504        graph.add_variable_dependencies(func_id.clone(), variables);
505
506        let deps = graph.get_variable_dependencies(&func_id).unwrap();
507        assert_eq!(deps.len(), 2);
508        assert!(deps.contains("x"));
509        assert!(deps.contains("y"));
510    }
511
512    #[test]
513    fn test_io_operations() {
514        let mut graph = DataFlowGraph::new();
515        let func_id = create_test_function_id("io_func");
516
517        let io_op = IoOperation {
518            operation_type: "file_read".to_string(),
519            variables: vec!["filename".to_string()],
520            line: 42,
521        };
522
523        graph.add_io_operation(func_id.clone(), io_op);
524
525        let ops = graph.get_io_operations(&func_id).unwrap();
526        assert_eq!(ops.len(), 1);
527        assert_eq!(ops[0].operation_type, "file_read");
528        assert_eq!(ops[0].line, 42);
529    }
530
531    #[test]
532    fn test_purity_analysis() {
533        let mut graph = DataFlowGraph::new();
534        let func_id = create_test_function_id("pure_func");
535
536        let purity = PurityInfo {
537            is_pure: true,
538            confidence: 0.95,
539            impurity_reasons: vec![],
540        };
541
542        graph.set_purity_info(func_id.clone(), purity);
543
544        let purity_info = graph.get_purity_info(&func_id).unwrap();
545        assert!(purity_info.is_pure);
546        assert_eq!(purity_info.confidence, 0.95);
547        assert!(purity_info.impurity_reasons.is_empty());
548
549        assert!(!graph.has_side_effects(&func_id));
550    }
551
552    #[test]
553    fn test_side_effects_detection() {
554        let mut graph = DataFlowGraph::new();
555        let func_id = create_test_function_id("impure_func");
556
557        // Function with I/O operations should have side effects
558        let io_op = IoOperation {
559            operation_type: "console_log".to_string(),
560            variables: vec!["message".to_string()],
561            line: 10,
562        };
563        graph.add_io_operation(func_id.clone(), io_op);
564
565        assert!(graph.has_side_effects(&func_id));
566    }
567
568    #[test]
569    fn test_data_transformation() {
570        let mut graph = DataFlowGraph::new();
571        let from_func = create_test_function_id("caller");
572        let to_func = create_test_function_id("callee");
573
574        let transformation = DataTransformation {
575            input_vars: vec!["input".to_string()],
576            output_vars: vec!["result".to_string()],
577            transformation_type: "map".to_string(),
578        };
579
580        graph.add_data_transformation(from_func.clone(), to_func.clone(), transformation);
581
582        let trans = graph.get_data_transformation(&from_func, &to_func).unwrap();
583        assert_eq!(trans.transformation_type, "map");
584        assert_eq!(trans.input_vars, vec!["input"]);
585        assert_eq!(trans.output_vars, vec!["result"]);
586    }
587
588    #[test]
589    fn test_modification_impact_analysis() {
590        let graph = DataFlowGraph::new();
591        let func_id = create_test_function_id("test_func");
592
593        let impact = graph.analyze_modification_impact(&func_id);
594
595        // Since we have no call graph data, downstream should be empty
596        assert_eq!(impact.affected_functions, 0);
597        assert_eq!(impact.dependency_count, 0);
598        // Should assume side effects when no data is available
599        assert!(impact.has_side_effects);
600    }
601
602    #[test]
603    fn test_risk_level_calculation() {
604        let graph = DataFlowGraph::new();
605
606        // Test low risk: no downstream, no I/O, pure function
607        assert_eq!(graph.calculate_risk_level(&[], false, true), RiskLevel::Low);
608
609        // Test high risk: some downstream with I/O
610        let downstream = vec![create_test_function_id("caller1")];
611        assert_eq!(
612            graph.calculate_risk_level(&downstream, true, false),
613            RiskLevel::High
614        );
615
616        // Test critical risk: many downstream dependencies
617        let many_downstream: Vec<FunctionId> = (0..10)
618            .map(|i| create_test_function_id(&format!("caller_{}", i)))
619            .collect();
620        assert_eq!(
621            graph.calculate_risk_level(&many_downstream, false, true),
622            RiskLevel::Critical
623        );
624    }
625
626    #[test]
627    fn test_from_call_graph() {
628        let call_graph = CallGraph::new();
629        let graph = DataFlowGraph::from_call_graph(call_graph);
630
631        assert_eq!(graph.call_graph().node_count(), 0);
632        assert!(graph.variable_deps.is_empty());
633    }
634
635    #[test]
636    fn test_varid_translation() {
637        use crate::analysis::data_flow::{DataFlowAnalysis, ReachingDefinitions, VarId};
638
639        let var_names = vec!["x".to_string(), "y".to_string(), "buffer".to_string()];
640
641        // Create minimal analysis (escape/taint analysis removed)
642        let analysis = DataFlowAnalysis {
643            reaching_defs: ReachingDefinitions::default(),
644        };
645
646        let ctx = CfgAnalysisWithContext::new(var_names, analysis);
647
648        let var_id = VarId {
649            name_id: 0,
650            version: 0,
651        };
652        assert_eq!(ctx.var_name(var_id), "x");
653
654        let var_id = VarId {
655            name_id: 2,
656            version: 1,
657        };
658        assert_eq!(ctx.var_name(var_id), "buffer");
659    }
660
661    #[test]
662    fn test_translation_with_missing_id() {
663        use crate::analysis::data_flow::{DataFlowAnalysis, ReachingDefinitions, VarId};
664
665        let var_names = vec!["x".to_string()];
666
667        let analysis = DataFlowAnalysis {
668            reaching_defs: ReachingDefinitions::default(),
669        };
670
671        let ctx = CfgAnalysisWithContext::new(var_names, analysis);
672
673        let invalid_id = VarId {
674            name_id: 999,
675            version: 0,
676        };
677        assert_eq!(ctx.var_name(invalid_id), "unknown_999");
678    }
679
680    // Escape/taint translation tests removed - analysis no longer provides actionable signals
681}