scribe_selection/
covering_set.rs

1//! Covering set computation for targeted code selection.
2//!
3//! This module provides functionality to compute minimal covering sets of files
4//! needed to understand a specific code entity (function, class, module, etc.).
5//! It integrates AST parsing, dependency graph traversal, and intelligent selection.
6
7use crate::ast_parser::{AstParser, EntityLocation, EntityQuery};
8use scribe_core::{Result, ScribeError};
9use scribe_graph::{DependencyGraph, TraversalDirection};
10use serde::{Deserialize, Serialize};
11use std::collections::{HashMap, HashSet};
12use std::path::Path;
13
14/// Options for covering set computation
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct CoveringSetOptions {
17    /// Include files that the target depends on
18    pub include_dependencies: bool,
19    /// Include files that depend on the target
20    pub include_dependents: bool,
21    /// Maximum depth for dependency traversal (None = unlimited)
22    pub max_depth: Option<usize>,
23    /// Maximum number of files to include (None = unlimited)
24    pub max_files: Option<usize>,
25    /// Minimum importance score to include a file (0.0-1.0)
26    pub min_importance: Option<f64>,
27}
28
29impl Default for CoveringSetOptions {
30    fn default() -> Self {
31        Self {
32            include_dependencies: true,
33            include_dependents: false,
34            max_depth: None,
35            max_files: None,
36            min_importance: None,
37        }
38    }
39}
40
41impl CoveringSetOptions {
42    /// Create default options optimized for understanding a target
43    pub fn for_understanding() -> Self {
44        Self {
45            include_dependencies: true,
46            include_dependents: false,
47            max_depth: None, // Get all dependencies
48            max_files: Some(100), // Reasonable limit
49            min_importance: Some(0.3), // Exclude very low importance files
50        }
51    }
52
53    /// Create options optimized for impact analysis
54    pub fn for_impact_analysis() -> Self {
55        Self {
56            include_dependencies: true,
57            include_dependents: true, // Include what depends on this
58            max_depth: Some(2), // Don't go too deep
59            max_files: Some(50),
60            min_importance: Some(0.4),
61        }
62    }
63
64    /// Create options for minimal covering set
65    pub fn minimal() -> Self {
66        Self {
67            include_dependencies: true,
68            include_dependents: false,
69            max_depth: Some(1), // Only direct dependencies
70            max_files: Some(20),
71            min_importance: Some(0.5),
72        }
73    }
74}
75
76/// Result of covering set computation
77#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct CoveringSetResult {
79    /// The target entity that was located
80    pub target_entity: Option<EntityLocation>,
81    /// Files included in the covering set
82    pub files: Vec<CoveringSetFile>,
83    /// Statistics about the computation
84    pub statistics: CoveringSetStatistics,
85    /// Explanation of why files were included
86    pub inclusion_reasons: HashMap<String, String>,
87}
88
89/// Information about a file in the covering set
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct CoveringSetFile {
92    /// File path (relative or absolute)
93    pub path: String,
94    /// Why this file was included
95    pub reason: InclusionReason,
96    /// Distance from target (0 = target file, 1 = direct dependency, etc.)
97    pub distance: usize,
98    /// Importance score if available
99    pub importance: Option<f64>,
100    /// Relevant line ranges (inclusive, 1-indexed)
101    pub line_ranges: Vec<LineRange>,
102}
103
104/// Line range information
105#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
106pub struct LineRange {
107    pub start_line: usize,
108    pub end_line: usize,
109}
110/// Reason a file was included in the covering set
111#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
112pub enum InclusionReason {
113    /// File contains the target entity
114    TargetFile,
115    /// File was directly changed in a diff
116    ChangedFile,
117    /// File is a direct dependency of the target
118    DirectDependency,
119    /// File is a transitive dependency
120    TransitiveDependency,
121    /// File directly depends on the target
122    DirectDependent,
123    /// File transitively depends on the target
124    TransitiveDependent,
125}
126
127/// Statistics about the covering set computation
128#[derive(Debug, Clone, Serialize, Deserialize)]
129pub struct CoveringSetStatistics {
130    /// Total files examined
131    pub files_examined: usize,
132    /// Files in the final covering set
133    pub files_selected: usize,
134    /// Files excluded due to limits
135    pub files_excluded: usize,
136    /// Maximum depth reached
137    pub max_depth_reached: usize,
138    /// Whether any limits were hit
139    pub limits_reached: bool,
140}
141
142/// Computes covering sets for targeted code selection
143pub struct CoveringSetComputer {
144    ast_parser: AstParser,
145}
146
147impl CoveringSetComputer {
148    /// Create a new covering set computer
149    pub fn new() -> Result<Self> {
150        Ok(Self {
151            ast_parser: AstParser::new()?,
152        })
153    }
154
155    /// Compute a covering set for a target entity
156    ///
157    /// # Arguments
158    /// * `query` - Query to locate the target entity
159    /// * `file_contents` - Map of file paths to their contents
160    /// * `graph` - Dependency graph for the project
161    /// * `options` - Options controlling the computation
162    ///
163    /// # Returns
164    /// A covering set result containing the target and all related files
165    pub fn compute_covering_set(
166        &mut self,
167        query: &EntityQuery,
168        file_contents: &HashMap<String, String>,
169        graph: &DependencyGraph,
170        options: &CoveringSetOptions,
171    ) -> Result<CoveringSetResult> {
172        let mut statistics = CoveringSetStatistics {
173            files_examined: file_contents.len(),
174            files_selected: 0,
175            files_excluded: 0,
176            max_depth_reached: 0,
177            limits_reached: false,
178        };
179
180        // Step 1: Find the target entity across all files
181        let target_entity = self.find_target_entity(query, file_contents)?;
182
183        if target_entity.is_none() {
184            return Ok(CoveringSetResult {
185                target_entity: None,
186                files: Vec::new(),
187                statistics,
188                inclusion_reasons: HashMap::new(),
189            });
190        }
191
192        let target = target_entity.clone().unwrap();
193        let target_file = &target.file_path;
194
195        // Step 2: Compute file closure using dependency graph
196        let mut seed_files = vec![target_file.clone()];
197        let direction = self.get_traversal_direction(options);
198        let closure_files = graph.compute_closure(&seed_files, direction, options.max_depth);
199
200        // Step 3: Build covering set with metadata
201        let mut covering_set = Vec::new();
202        let mut inclusion_reasons = HashMap::new();
203
204        // Add target file first
205        covering_set.push(CoveringSetFile {
206            path: target_file.clone(),
207            reason: InclusionReason::TargetFile,
208            distance: 0,
209            importance: None,
210            line_ranges: vec![LineRange {
211                start_line: target.start_line,
212                end_line: target.end_line,
213            }],
214        });
215        inclusion_reasons.insert(
216            target_file.clone(),
217            "Contains the target entity".to_string(),
218        );
219
220        // Add dependency/dependent files
221        for file in closure_files {
222            if file == *target_file {
223                continue; // Already added
224            }
225
226            let (reason, distance) = self.compute_inclusion_info(
227                &file,
228                target_file,
229                graph,
230                options,
231            );
232
233            covering_set.push(CoveringSetFile {
234                path: file.clone(),
235                reason: reason.clone(),
236                distance,
237                importance: None,
238                line_ranges: Vec::new(),
239            });
240
241            let reason_text = self.format_inclusion_reason(&reason, distance);
242            inclusion_reasons.insert(file, reason_text);
243        }
244
245        // Step 4: Apply limits and filters
246        let original_count = covering_set.len();
247        self.apply_limits(&mut covering_set, options, &mut statistics);
248
249        if covering_set.len() < original_count {
250            statistics.limits_reached = true;
251            statistics.files_excluded = original_count - covering_set.len();
252        }
253
254        statistics.files_selected = covering_set.len();
255        statistics.max_depth_reached = covering_set
256            .iter()
257            .map(|f| f.distance)
258            .max()
259            .unwrap_or(0);
260
261        Ok(CoveringSetResult {
262            target_entity,
263            files: covering_set,
264            statistics,
265            inclusion_reasons,
266        })
267    }
268
269    /// Compute a covering set starting from a set of changed files (git diff)
270    ///
271    /// This variant is useful for code review or impact analysis scenarios where
272    /// you want the minimal set of files that explain or are affected by a diff
273    /// without needing to name a specific entity.
274    pub fn compute_covering_set_for_files(
275        &self,
276        changed_files: &[String],
277        graph: &DependencyGraph,
278        line_map: Option<&HashMap<String, Vec<LineRange>>>,
279        options: &CoveringSetOptions,
280    ) -> Result<CoveringSetResult> {
281        let mut statistics = CoveringSetStatistics {
282            files_examined: changed_files.len(),
283            files_selected: 0,
284            files_excluded: 0,
285            max_depth_reached: 0,
286            limits_reached: false,
287        };
288
289        if changed_files.is_empty() {
290            return Ok(CoveringSetResult {
291                target_entity: None,
292                files: Vec::new(),
293                statistics,
294                inclusion_reasons: HashMap::new(),
295            });
296        }
297
298        let direction = self.get_traversal_direction(options);
299        let closure_files = graph.compute_closure(changed_files, direction, options.max_depth);
300
301        let mut covering_set = Vec::new();
302        let mut inclusion_reasons = HashMap::new();
303
304        // Add the changed files as seeds
305        for changed in changed_files {
306            covering_set.push(CoveringSetFile {
307                path: changed.clone(),
308                reason: InclusionReason::ChangedFile,
309                distance: 0,
310                importance: None,
311                line_ranges: line_map
312                    .and_then(|m| m.get(changed))
313                    .cloned()
314                    .unwrap_or_default(),
315            });
316            inclusion_reasons.insert(changed.clone(), "Changed in diff".to_string());
317        }
318
319        // Add dependency/dependent files reachable from any changed file
320        for file in closure_files {
321            if changed_files.contains(&file) {
322                continue;
323            }
324
325            // Pick a representative changed file to compute distance/reason.
326            // We use the first changed file that connects; fallback to the first seed.
327            let reference = changed_files
328                .iter()
329                .find(|target| {
330                    graph.contains_edge(target, &file) || graph.contains_edge(&file, target)
331                })
332                .unwrap_or(&changed_files[0]);
333
334            let (reason, distance) = self.compute_inclusion_info(&file, reference, graph, options);
335
336            covering_set.push(CoveringSetFile {
337                path: file.clone(),
338                reason: reason.clone(),
339                distance,
340                importance: None,
341                line_ranges: Vec::new(),
342            });
343
344            let reason_text = self.format_inclusion_reason(&reason, distance);
345            inclusion_reasons.insert(file, reason_text);
346        }
347
348        let original_count = covering_set.len();
349        self.apply_limits(&mut covering_set, options, &mut statistics);
350
351        if covering_set.len() < original_count {
352            statistics.limits_reached = true;
353            statistics.files_excluded = original_count - covering_set.len();
354        }
355
356        statistics.files_selected = covering_set.len();
357        statistics.max_depth_reached = covering_set
358            .iter()
359            .map(|f| f.distance)
360            .max()
361            .unwrap_or(0);
362
363        Ok(CoveringSetResult {
364            target_entity: None,
365            files: covering_set,
366            statistics,
367            inclusion_reasons,
368        })
369    }
370
371    /// Find the target entity across all files
372    fn find_target_entity(
373        &mut self,
374        query: &EntityQuery,
375        file_contents: &HashMap<String, String>,
376    ) -> Result<Option<EntityLocation>> {
377        for (file_path, content) in file_contents {
378            let entities = self.ast_parser.find_entities(content, file_path, query)?;
379            if let Some(entity) = entities.first() {
380                return Ok(Some(entity.clone()));
381            }
382        }
383        Ok(None)
384    }
385
386    /// Determine traversal direction based on options
387    fn get_traversal_direction(&self, options: &CoveringSetOptions) -> TraversalDirection {
388        match (options.include_dependencies, options.include_dependents) {
389            (true, true) => TraversalDirection::Both,
390            (true, false) => TraversalDirection::Dependencies,
391            (false, true) => TraversalDirection::Dependents,
392            (false, false) => TraversalDirection::Dependencies, // Default to dependencies
393        }
394    }
395
396    /// Compute inclusion reason and distance for a file
397    fn compute_inclusion_info(
398        &self,
399        file: &str,
400        target_file: &str,
401        graph: &DependencyGraph,
402        options: &CoveringSetOptions,
403    ) -> (InclusionReason, usize) {
404        let file_string = file.to_string();
405        let target_string = target_file.to_string();
406
407        // Check if it's a direct dependency
408        if graph.contains_edge(&target_string, &file_string) {
409            return (InclusionReason::DirectDependency, 1);
410        }
411
412        // Check if it's a direct dependent
413        if graph.contains_edge(&file_string, &target_string) {
414            return (InclusionReason::DirectDependent, 1);
415        }
416
417        // Otherwise it's transitive - compute distance
418        let deps = graph.transitive_dependencies(&target_string, options.max_depth);
419        if deps.contains(&file_string) {
420            let distance = self.compute_distance(target_file, file, graph);
421            return (InclusionReason::TransitiveDependency, distance);
422        }
423
424        let dependents = graph.transitive_dependents(&target_string, options.max_depth);
425        if dependents.contains(&file_string) {
426            let distance = self.compute_distance(file, target_file, graph);
427            return (InclusionReason::TransitiveDependent, distance);
428        }
429
430        // Fallback
431        (InclusionReason::TransitiveDependency, 2)
432    }
433
434    /// Compute distance between two files in the graph (simple BFS)
435    fn compute_distance(&self, from: &str, to: &str, graph: &DependencyGraph) -> usize {
436        use std::collections::{HashSet, VecDeque};
437
438        let mut queue = VecDeque::new();
439        let mut visited = HashSet::new();
440
441        queue.push_back((from.to_string(), 0));
442        visited.insert(from.to_string());
443
444        while let Some((current, dist)) = queue.pop_front() {
445            if current == to {
446                return dist;
447            }
448
449            if let Some(neighbors) = graph.outgoing_neighbors(&current) {
450                for neighbor in neighbors {
451                    if !visited.contains(neighbor) {
452                        visited.insert(neighbor.to_string());
453                        queue.push_back((neighbor.to_string(), dist + 1));
454                    }
455                }
456            }
457        }
458
459        // Not reachable
460        999
461    }
462
463    /// Format inclusion reason as human-readable text
464    fn format_inclusion_reason(&self, reason: &InclusionReason, distance: usize) -> String {
465        match reason {
466            InclusionReason::TargetFile => "Contains the target entity".to_string(),
467            InclusionReason::ChangedFile => "Changed in diff".to_string(),
468            InclusionReason::DirectDependency => "Direct dependency of target".to_string(),
469            InclusionReason::TransitiveDependency => {
470                format!("Transitive dependency (distance: {})", distance)
471            }
472            InclusionReason::DirectDependent => "Directly depends on target".to_string(),
473            InclusionReason::TransitiveDependent => {
474                format!("Transitively depends on target (distance: {})", distance)
475            }
476        }
477    }
478
479    /// Apply limits and filters to the covering set
480    fn apply_limits(
481        &self,
482        covering_set: &mut Vec<CoveringSetFile>,
483        options: &CoveringSetOptions,
484        statistics: &mut CoveringSetStatistics,
485    ) {
486        // Sort by distance (closer files first) and importance
487        covering_set.sort_by_key(|f| (f.distance, f.path.clone()));
488
489        // Apply max_files limit
490        if let Some(max_files) = options.max_files {
491            if covering_set.len() > max_files {
492                covering_set.truncate(max_files);
493                statistics.limits_reached = true;
494            }
495        }
496
497        // Filter by importance if specified
498        if let Some(min_importance) = options.min_importance {
499            covering_set.retain(|f| {
500                // Always keep target file
501                if matches!(f.reason, InclusionReason::TargetFile) {
502                    return true;
503                }
504                // Keep if importance is above threshold (or unknown)
505                f.importance.map_or(true, |imp| imp >= min_importance)
506            });
507        }
508    }
509}
510
511impl Default for CoveringSetComputer {
512    fn default() -> Self {
513        Self::new().expect("Failed to create CoveringSetComputer")
514    }
515}
516
517#[cfg(test)]
518mod tests {
519    use super::*;
520    use crate::ast_parser::EntityQuery;
521
522    #[test]
523    fn test_covering_set_options() {
524        let opts = CoveringSetOptions::default();
525        assert!(opts.include_dependencies);
526        assert!(!opts.include_dependents);
527
528        let minimal = CoveringSetOptions::minimal();
529        assert_eq!(minimal.max_depth, Some(1));
530        assert_eq!(minimal.max_files, Some(20));
531    }
532
533    #[test]
534    fn test_covering_set_computer_creation() {
535        let computer = CoveringSetComputer::new();
536        assert!(computer.is_ok());
537    }
538
539    #[test]
540    fn test_inclusion_reason_formatting() {
541        let computer = CoveringSetComputer::new().unwrap();
542
543        let reason = computer.format_inclusion_reason(&InclusionReason::TargetFile, 0);
544        assert_eq!(reason, "Contains the target entity");
545
546        let reason = computer.format_inclusion_reason(&InclusionReason::ChangedFile, 0);
547        assert_eq!(reason, "Changed in diff");
548
549        let reason = computer.format_inclusion_reason(&InclusionReason::DirectDependency, 1);
550        assert_eq!(reason, "Direct dependency of target");
551
552        let reason = computer.format_inclusion_reason(&InclusionReason::TransitiveDependency, 3);
553        assert!(reason.contains("distance: 3"));
554    }
555
556    #[test]
557    fn test_covering_set_for_changed_files() {
558        let computer = CoveringSetComputer::new().unwrap();
559        let graph = DependencyGraph::new();
560
561        let changed = vec!["src/lib.rs".to_string(), "src/main.rs".to_string()];
562        let result = computer
563            .compute_covering_set_for_files(
564                &changed,
565                &graph,
566                None,
567                &CoveringSetOptions::default(),
568            )
569            .unwrap();
570
571        assert!(result.target_entity.is_none());
572        assert_eq!(result.files.len(), 2);
573        assert!(result
574            .files
575            .iter()
576            .all(|f| f.reason == InclusionReason::ChangedFile));
577    }
578}