// scribe_scaling/positioning.rs

1//! Context Positioning Optimization
2//!
3//! Strategic file positioning based on transformer model attention patterns.
4//! Models have better reasoning at the head and tail of context, so we position:
5//! - HEAD (20%): Query-specific high centrality files  
6//! - MIDDLE (60%): Low centrality supporting files
7//! - TAIL (20%): Core functionality, high centrality files
8
9use petgraph::algo::kosaraju_scc;
10use petgraph::visit::EdgeRef;
11use petgraph::{graph::NodeIndex, Directed, Graph};
12use rayon::prelude::*;
13use serde::{Deserialize, Serialize};
14use std::collections::{HashMap, HashSet};
15use std::path::Path;
16use tracing::{debug, info, warn};
17
18use crate::error::{ScalingError, ScalingResult};
19use crate::streaming::FileMetadata;
20use scribe_core::file;
21
/// Configuration for context positioning optimization
///
/// Percentages are fractions in [0, 1]; the HEAD and TAIL shares should sum
/// to less than 1.0 so files remain for the MIDDLE tier.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContextPositioningConfig {
    /// Enable context positioning optimization
    pub enable_positioning: bool,

    /// Percentage of context for HEAD positioning (query-relevant, high centrality)
    pub head_percentage: f64,

    /// Percentage of context for TAIL positioning (core functionality)
    pub tail_percentage: f64,

    /// Weight for centrality in positioning decisions
    pub centrality_weight: f64,

    /// Weight for file relatedness in grouping decisions
    // NOTE(review): not referenced by any positioning logic in this file —
    // confirm whether it is consumed elsewhere or is dead configuration.
    pub relatedness_weight: f64,

    /// Weight for query relevance in HEAD positioning
    pub query_relevance_weight: f64,

    /// Auto-exclude test files from selection (focuses on code and docs only)
    pub auto_exclude_tests: bool,
}
46
impl Default for ContextPositioningConfig {
    fn default() -> Self {
        Self {
            enable_positioning: true,
            // 20% HEAD + 20% TAIL leaves roughly 60% of files in MIDDLE.
            head_percentage: 0.20,
            tail_percentage: 0.20,
            // The three scoring weights sum to 1.0.
            centrality_weight: 0.4,
            relatedness_weight: 0.3,
            query_relevance_weight: 0.3,
            // Test files are kept by default; callers opt in to exclusion.
            auto_exclude_tests: false,
        }
    }
}
60
/// Centrality scores for files in the codebase
///
/// All values are non-negative. `combined` is the fixed weighted blend
/// `0.3 * degree + 0.5 * pagerank + 0.2 * betweenness` computed in
/// `calculate_all_centralities`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CentralityScores {
    /// Betweenness centrality: files connecting different parts
    pub betweenness: f64,

    /// PageRank centrality: heavily referenced files
    pub pagerank: f64,

    /// Degree centrality: files with many connections
    pub degree: f64,

    /// Combined centrality score
    pub combined: f64,
}
76
/// File with centrality and positioning metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileWithCentrality {
    // Underlying file metadata (path, size, modified time, language, type).
    pub metadata: FileMetadata,
    // Graph centrality scores computed for this file.
    pub centrality: CentralityScores,
    // Relevance to the query hint; stays 0.0 when no hint was supplied.
    pub query_relevance: f64,
    // Label used to keep related files adjacent within a tier.
    pub relatedness_group: String,
}
85
/// Three-tier context positioning structure
///
/// Mirrors transformer attention: models attend best to the head and tail
/// of the context, so the most important files are placed in those tiers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContextPositioning {
    /// HEAD: Query-specific high centrality files (first ~20%)
    pub head_files: Vec<FileWithCentrality>,

    /// MIDDLE: Low centrality supporting files (~60%)
    pub middle_files: Vec<FileWithCentrality>,

    /// TAIL: Core functionality, high centrality (~20%)
    pub tail_files: Vec<FileWithCentrality>,
}
98
/// Result of context positioning with reasoning
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PositionedSelection {
    // The three-tier file layout.
    pub positioning: ContextPositioning,
    // Estimated token total across all tiers (see `estimate_tokens`).
    pub total_tokens: usize,
    // Human-readable explanation of how the tiers were chosen.
    pub positioning_reasoning: String,
}
106
/// Context positioning optimizer
///
/// Holds only immutable configuration; all methods take `&self`, so a
/// single instance can be reused across calls.
pub struct ContextPositioner {
    // Tuning knobs for tier sizes, score weights, and test-file exclusion.
    config: ContextPositioningConfig,
}
111
112impl ContextPositioner {
    /// Create a new context positioner with the given configuration.
    pub fn new(config: ContextPositioningConfig) -> Self {
        Self { config }
    }
117
    /// Create a positioner with `ContextPositioningConfig::default()`
    /// (positioning enabled, 20% HEAD / 20% TAIL split).
    pub fn with_defaults() -> Self {
        Self::new(ContextPositioningConfig::default())
    }
122
    /// Apply context positioning to selected files.
    ///
    /// Pipeline: optional test-file exclusion, centrality scoring,
    /// query-relevance scoring, relatedness grouping, three-tier placement,
    /// then token totals and a human-readable reasoning string.
    ///
    /// When positioning is disabled or `files` is empty, every file is
    /// returned in the MIDDLE tier in its original order
    /// (see `create_simple_positioning`).
    ///
    /// # Errors
    /// Propagates any `ScalingError` from the scoring/positioning phases.
    pub async fn position_files(
        &self,
        files: Vec<FileMetadata>,
        query_hint: Option<&str>,
    ) -> ScalingResult<PositionedSelection> {
        if !self.config.enable_positioning || files.is_empty() {
            return Ok(self.create_simple_positioning(files));
        }

        // Filter out test files if auto-exclude is enabled
        let filtered_files = if self.config.auto_exclude_tests {
            let original_count = files.len();
            let non_test_files: Vec<FileMetadata> = files
                .into_iter()
                .filter(|file| !self.is_test_file(&file.path))
                .collect();
            let filtered_count = non_test_files.len();

            // Only log when the filter actually removed something.
            if original_count != filtered_count {
                info!(
                    "Auto-excluded {} test files, {} files remaining",
                    original_count - filtered_count,
                    filtered_count
                );
            }

            non_test_files
        } else {
            files
        };

        info!(
            "Starting context positioning for {} files",
            filtered_files.len()
        );

        // Phase 1: Calculate centrality scores for all files
        let files_with_centrality = self.calculate_centrality_scores(filtered_files).await?;

        // Phase 2: Calculate query relevance if hint provided
        let files_with_relevance = self
            .calculate_query_relevance(files_with_centrality, query_hint)
            .await?;

        // Phase 3: Group by relatedness
        let files_with_groups = self.group_by_relatedness(files_with_relevance).await?;

        // Phase 4: Apply three-tier positioning strategy
        let positioning = self.apply_positioning_strategy(files_with_groups).await?;

        // Phase 5: Calculate total tokens and generate reasoning
        let total_tokens = self.calculate_total_tokens(&positioning);
        let reasoning = self.generate_positioning_reasoning(&positioning, query_hint);

        info!(
            "Context positioning complete: HEAD={}, MIDDLE={}, TAIL={}",
            positioning.head_files.len(),
            positioning.middle_files.len(),
            positioning.tail_files.len()
        );

        Ok(PositionedSelection {
            positioning,
            total_tokens,
            positioning_reasoning: reasoning,
        })
    }
191
    /// Calculate centrality scores for all files using optimized algorithms.
    ///
    /// Builds a heuristic dependency graph over the files, computes
    /// PageRank / degree / betweenness per node, then maps the scores back
    /// onto the files in parallel. Files whose key has no entry in the score
    /// map fall back to all-zero `CentralityScores` via `unwrap_or_default`.
    ///
    /// `query_relevance` and `relatedness_group` are initialized to neutral
    /// values here and filled in by later pipeline phases.
    async fn calculate_centrality_scores(
        &self,
        files: Vec<FileMetadata>,
    ) -> ScalingResult<Vec<FileWithCentrality>> {
        debug!("Calculating centrality scores for {} files", files.len());

        if files.is_empty() {
            return Ok(Vec::new());
        }

        // Build optimized dependency graph
        let (graph, node_map) = self.build_dependency_graph(&files).await?;

        // Calculate all centrality measures efficiently using petgraph
        let centrality_scores = self.calculate_all_centralities(&graph, &node_map).await?;

        // Map centrality scores back to files in parallel
        let files_with_centrality: Vec<FileWithCentrality> = files
            .into_par_iter()
            .map(|file| {
                let file_key = self.file_to_key(&file.path);
                let centrality = centrality_scores
                    .get(&file_key)
                    .cloned()
                    .unwrap_or_default();

                FileWithCentrality {
                    metadata: file,
                    centrality,
                    query_relevance: 0.0,             // Will be set later
                    relatedness_group: String::new(), // Will be set later
                }
            })
            .collect();

        debug!(
            "Calculated centrality for {} files",
            files_with_centrality.len()
        );
        Ok(files_with_centrality)
    }
234
    /// Build dependency graph from file relationships using petgraph.
    ///
    /// Nodes are file keys (see `file_to_key`); edges run from a file to
    /// each of its heuristic dependencies, but only when the dependency key
    /// matches another selected file — unresolved keys are silently dropped.
    /// Returns the graph together with the key -> `NodeIndex` map.
    async fn build_dependency_graph(
        &self,
        files: &[FileMetadata],
    ) -> ScalingResult<(Graph<String, (), Directed>, HashMap<String, NodeIndex>)> {
        let mut graph = Graph::new();
        let mut node_map = HashMap::new();

        // First pass: create nodes for all files
        for file in files {
            let file_key = self.file_to_key(&file.path);
            let node_idx = graph.add_node(file_key.clone());
            node_map.insert(file_key, node_idx);
        }

        // Second pass: create edges based on dependencies
        for file in files {
            let file_key = self.file_to_key(&file.path);
            let dependencies = self.extract_dependencies(file).await?;

            if let Some(&from_idx) = node_map.get(&file_key) {
                for dep in dependencies {
                    // Only link dependencies that are themselves in the selection.
                    if let Some(&to_idx) = node_map.get(&dep) {
                        graph.add_edge(from_idx, to_idx, ());
                    }
                }
            }
        }

        debug!(
            "Built dependency graph: {} nodes, {} edges",
            graph.node_count(),
            graph.edge_count()
        );

        Ok((graph, node_map))
    }
272
273    /// Extract dependencies from a file (imports, includes, etc.)
274    async fn extract_dependencies(&self, file: &FileMetadata) -> ScalingResult<Vec<String>> {
275        // Simple dependency extraction based on file patterns and language
276        let mut dependencies = Vec::new();
277
278        let path_str = file.path.to_string_lossy();
279        let dir_path = file
280            .path
281            .parent()
282            .map(|p| p.to_string_lossy().to_string())
283            .unwrap_or_default();
284
285        // For Rust files, assume mod.rs and lib.rs are central
286        if file.language == "Rust" {
287            let filename = file.path.file_name().and_then(|n| n.to_str()).unwrap_or("");
288
289            if filename == "mod.rs" || filename == "lib.rs" {
290                // These are likely dependency targets
291            } else {
292                // Regular Rust files likely depend on lib.rs or mod.rs
293                dependencies.push(format!("{}/lib.rs", dir_path));
294                dependencies.push(format!("{}/mod.rs", dir_path));
295            }
296        }
297
298        // For Python files, __init__.py files are central
299        if file.language == "Python" {
300            let filename = file.path.file_name().and_then(|n| n.to_str()).unwrap_or("");
301
302            if filename != "__init__.py" {
303                dependencies.push(format!("{}/__init__.py", dir_path));
304            }
305        }
306
307        // For JavaScript/TypeScript, index files are central
308        if file.language == "JavaScript" || file.language == "TypeScript" {
309            dependencies.push(format!("{}/index.js", dir_path));
310            dependencies.push(format!("{}/index.ts", dir_path));
311        }
312
313        // Configuration files often depend on package manifests
314        if file.file_type == "Configuration" {
315            dependencies.push("package.json".to_string());
316            dependencies.push("Cargo.toml".to_string());
317            dependencies.push("pyproject.toml".to_string());
318        }
319
320        Ok(dependencies)
321    }
322
323    /// Calculate all centrality measures efficiently using petgraph algorithms
324    async fn calculate_all_centralities(
325        &self,
326        graph: &Graph<String, (), Directed>,
327        node_map: &HashMap<String, NodeIndex>,
328    ) -> ScalingResult<HashMap<String, CentralityScores>> {
329        let mut centrality_scores = HashMap::new();
330
331        if graph.node_count() == 0 {
332            return Ok(centrality_scores);
333        }
334
335        // Calculate PageRank using simplified approach (petgraph doesn't have built-in PageRank)
336        // We'll implement a basic version or use degree centrality as approximation
337        let pagerank_scores = self.calculate_simple_pagerank(graph, node_map)?;
338
339        // Calculate degree centrality in parallel
340        let degree_scores: Vec<(NodeIndex, f64)> = node_map
341            .par_iter()
342            .map(|(_, &node_idx)| {
343                let in_degree = graph.edges_directed(node_idx, petgraph::Incoming).count();
344                let out_degree = graph.edges_directed(node_idx, petgraph::Outgoing).count();
345                let total_degree = in_degree + out_degree;
346                let max_possible = graph.node_count().saturating_sub(1);
347
348                let normalized_degree = if max_possible == 0 {
349                    0.0
350                } else {
351                    total_degree as f64 / max_possible as f64
352                };
353
354                (node_idx, normalized_degree)
355            })
356            .collect();
357
358        // Calculate betweenness centrality using strongly connected components
359        let betweenness_scores = self.calculate_betweenness_from_scc(graph, node_map)?;
360
361        // Combine all scores
362        for (file_key, &node_idx) in node_map {
363            let pagerank = pagerank_scores
364                .get(node_idx.index())
365                .copied()
366                .unwrap_or(0.0);
367            let degree = degree_scores
368                .iter()
369                .find(|(idx, _)| *idx == node_idx)
370                .map(|(_, score)| *score)
371                .unwrap_or(0.0);
372            let betweenness = betweenness_scores.get(&node_idx).copied().unwrap_or(0.0);
373
374            // Combine centrality scores with weights
375            let combined = (degree * 0.3) + (pagerank * 0.5) + (betweenness * 0.2);
376
377            centrality_scores.insert(
378                file_key.clone(),
379                CentralityScores {
380                    degree,
381                    pagerank,
382                    betweenness,
383                    combined,
384                },
385            );
386        }
387
388        debug!(
389            "Calculated centrality scores for {} files",
390            centrality_scores.len()
391        );
392        Ok(centrality_scores)
393    }
394
395    /// Calculate betweenness centrality using strongly connected components
396    fn calculate_betweenness_from_scc(
397        &self,
398        graph: &Graph<String, (), Directed>,
399        node_map: &HashMap<String, NodeIndex>,
400    ) -> ScalingResult<HashMap<NodeIndex, f64>> {
401        let mut betweenness_scores = HashMap::new();
402
403        // Use Kosaraju's algorithm to find strongly connected components
404        let sccs = kosaraju_scc(graph);
405
406        // Calculate betweenness based on component connectivity
407        for &node_idx in node_map.values() {
408            let mut betweenness = 0.0;
409
410            // Find which SCC this node belongs to
411            let node_scc = sccs.iter().position(|scc| scc.contains(&node_idx));
412
413            if let Some(scc_idx) = node_scc {
414                // Count connections to other SCCs
415                let out_edges: HashSet<usize> = graph
416                    .edges_directed(node_idx, petgraph::Outgoing)
417                    .filter_map(|edge| {
418                        let target = edge.target();
419                        sccs.iter().position(|scc| scc.contains(&target))
420                    })
421                    .filter(|&target_scc| target_scc != scc_idx)
422                    .collect();
423
424                let in_edges: HashSet<usize> = graph
425                    .edges_directed(node_idx, petgraph::Incoming)
426                    .filter_map(|edge| {
427                        let source = edge.source();
428                        sccs.iter().position(|scc| scc.contains(&source))
429                    })
430                    .filter(|&source_scc| source_scc != scc_idx)
431                    .collect();
432
433                // Betweenness is based on how many different components this node connects
434                betweenness = (out_edges.len() + in_edges.len()) as f64;
435
436                // Normalize by maximum possible connections
437                let max_components = sccs.len().saturating_sub(1);
438                if max_components > 0 {
439                    betweenness /= max_components as f64;
440                }
441            }
442
443            betweenness_scores.insert(node_idx, betweenness);
444        }
445
446        Ok(betweenness_scores)
447    }
448
449    /// Calculate simplified PageRank scores
450    fn calculate_simple_pagerank(
451        &self,
452        graph: &Graph<String, (), Directed>,
453        node_map: &HashMap<String, NodeIndex>,
454    ) -> ScalingResult<Vec<f64>> {
455        let node_count = graph.node_count();
456        if node_count == 0 {
457            return Ok(Vec::new());
458        }
459
460        let mut scores = vec![1.0 / node_count as f64; node_count];
461        let damping = 0.85;
462        let iterations = 10; // Simple approximation
463
464        for _ in 0..iterations {
465            let mut new_scores = vec![(1.0 - damping) / node_count as f64; node_count];
466
467            for &node_idx in node_map.values() {
468                let out_degree = graph.edges_directed(node_idx, petgraph::Outgoing).count();
469                if out_degree > 0 {
470                    let contribution = scores[node_idx.index()] * damping / out_degree as f64;
471
472                    for edge in graph.edges_directed(node_idx, petgraph::Outgoing) {
473                        let target_idx = edge.target().index();
474                        new_scores[target_idx] += contribution;
475                    }
476                }
477            }
478
479            scores = new_scores;
480        }
481
482        Ok(scores)
483    }
484
485    /// Calculate query relevance scores if query hint provided
486    async fn calculate_query_relevance(
487        &self,
488        mut files: Vec<FileWithCentrality>,
489        query_hint: Option<&str>,
490    ) -> ScalingResult<Vec<FileWithCentrality>> {
491        if let Some(query) = query_hint {
492            debug!("Calculating query relevance for: {}", query);
493
494            let query_lower = query.to_lowercase();
495            let query_words: Vec<&str> = query_lower.split_whitespace().collect();
496
497            for file in &mut files {
498                file.query_relevance =
499                    self.calculate_file_query_relevance(&file.metadata, &query_words);
500            }
501        }
502
503        Ok(files)
504    }
505
506    /// Calculate query relevance for a single file
507    fn calculate_file_query_relevance(&self, file: &FileMetadata, query_words: &[&str]) -> f64 {
508        let path_str = file.path.to_string_lossy().to_lowercase();
509        let filename = file
510            .path
511            .file_name()
512            .and_then(|n| n.to_str())
513            .unwrap_or("")
514            .to_lowercase();
515
516        let mut relevance = 0.0;
517
518        for word in query_words {
519            // Exact matches in filename get highest score
520            if filename.contains(word) {
521                relevance += 1.0;
522            }
523            // Partial matches in path get medium score
524            else if path_str.contains(word) {
525                relevance += 0.5;
526            }
527            // Language matches get small boost
528            else if file.language.to_lowercase().contains(word) {
529                relevance += 0.2;
530            }
531        }
532
533        // Boost for entry points that might be relevant
534        if filename.contains("main")
535            || filename.contains("index")
536            || filename == "lib.rs"
537            || filename == "__init__.py"
538        {
539            relevance += 0.3;
540        }
541
542        relevance
543    }
544
545    /// Group files by relatedness
546    async fn group_by_relatedness(
547        &self,
548        mut files: Vec<FileWithCentrality>,
549    ) -> ScalingResult<Vec<FileWithCentrality>> {
550        debug!("Grouping {} files by relatedness", files.len());
551
552        for file in &mut files {
553            file.relatedness_group = self.determine_relatedness_group(&file.metadata);
554        }
555
556        Ok(files)
557    }
558
559    /// Determine relatedness group for a file
560    fn determine_relatedness_group(&self, file: &FileMetadata) -> String {
561        let path_str = file.path.to_string_lossy();
562
563        // Group by directory structure (first 2 levels)
564        let path_components: Vec<&str> = path_str.split('/').collect();
565        let group = if path_components.len() >= 2 {
566            format!("{}/{}", path_components[0], path_components[1])
567        } else if path_components.len() == 1 {
568            path_components[0].to_string()
569        } else {
570            "root".to_string()
571        };
572
573        // Add language suffix for better grouping
574        format!("{}::{}", group, file.language)
575    }
576
577    /// Apply three-tier positioning strategy
578    async fn apply_positioning_strategy(
579        &self,
580        files: Vec<FileWithCentrality>,
581    ) -> ScalingResult<ContextPositioning> {
582        if files.is_empty() {
583            return Ok(ContextPositioning {
584                head_files: Vec::new(),
585                middle_files: Vec::new(),
586                tail_files: Vec::new(),
587            });
588        }
589
590        let total_files = files.len();
591        let head_count = ((total_files as f64 * self.config.head_percentage) as usize).max(1);
592        let tail_count = ((total_files as f64 * self.config.tail_percentage) as usize).max(1);
593
594        debug!(
595            "Positioning strategy: HEAD={}, TAIL={}, MIDDLE={}",
596            head_count,
597            tail_count,
598            total_files - head_count - tail_count
599        );
600
601        // Sort files for HEAD positioning: query relevance + centrality
602        let mut head_candidates = files.clone();
603        head_candidates.sort_by(|a, b| {
604            let score_a = (a.query_relevance * self.config.query_relevance_weight)
605                + (a.centrality.combined * self.config.centrality_weight);
606            let score_b = (b.query_relevance * self.config.query_relevance_weight)
607                + (b.centrality.combined * self.config.centrality_weight);
608            score_b
609                .partial_cmp(&score_a)
610                .unwrap_or(std::cmp::Ordering::Equal)
611        });
612
613        // Sort files for TAIL positioning: pure centrality
614        let mut tail_candidates = files.clone();
615        tail_candidates.sort_by(|a, b| {
616            b.centrality
617                .combined
618                .partial_cmp(&a.centrality.combined)
619                .unwrap_or(std::cmp::Ordering::Equal)
620        });
621
622        // Select HEAD files (query-relevant + high centrality)
623        let mut selected_files = HashSet::new();
624        let mut head_files = Vec::new();
625
626        for file in head_candidates.into_iter().take(head_count) {
627            let file_key = self.file_to_key(&file.metadata.path);
628            selected_files.insert(file_key);
629            head_files.push(file);
630        }
631
632        // Select TAIL files (high centrality, not already in head)
633        let mut tail_files = Vec::new();
634        for file in tail_candidates {
635            if tail_files.len() >= tail_count {
636                break;
637            }
638            let file_key = self.file_to_key(&file.metadata.path);
639            if !selected_files.contains(&file_key) {
640                selected_files.insert(file_key);
641                tail_files.push(file);
642            }
643        }
644
645        // Remaining files go to MIDDLE
646        let mut middle_files = Vec::new();
647        for file in files {
648            let file_key = self.file_to_key(&file.metadata.path);
649            if !selected_files.contains(&file_key) {
650                middle_files.push(file);
651            }
652        }
653
654        // Group related files within each tier
655        self.group_within_tier(&mut head_files);
656        self.group_within_tier(&mut middle_files);
657        self.group_within_tier(&mut tail_files);
658
659        Ok(ContextPositioning {
660            head_files,
661            middle_files,
662            tail_files,
663        })
664    }
665
666    /// Group related files within a tier to improve locality
667    fn group_within_tier(&self, files: &mut Vec<FileWithCentrality>) {
668        files.sort_by(|a, b| {
669            // Primary sort: relatedness group
670            let group_cmp = a.relatedness_group.cmp(&b.relatedness_group);
671            if group_cmp != std::cmp::Ordering::Equal {
672                return group_cmp;
673            }
674
675            // Secondary sort: centrality within group
676            b.centrality
677                .combined
678                .partial_cmp(&a.centrality.combined)
679                .unwrap_or(std::cmp::Ordering::Equal)
680        });
681    }
682
683    /// Calculate total tokens for positioned files
684    fn calculate_total_tokens(&self, positioning: &ContextPositioning) -> usize {
685        let head_tokens = positioning
686            .head_files
687            .iter()
688            .map(|f| self.estimate_tokens(&f.metadata))
689            .sum::<usize>();
690
691        let middle_tokens = positioning
692            .middle_files
693            .iter()
694            .map(|f| self.estimate_tokens(&f.metadata))
695            .sum::<usize>();
696
697        let tail_tokens = positioning
698            .tail_files
699            .iter()
700            .map(|f| self.estimate_tokens(&f.metadata))
701            .sum::<usize>();
702
703        head_tokens + middle_tokens + tail_tokens
704    }
705
    /// Generate a multi-line, human-readable explanation of the positioning.
    ///
    /// Lists up to three example files for the HEAD and TAIL tiers with
    /// their scores, summarizes the MIDDLE tier by count, and echoes the
    /// query hint when one was supplied. Lines are joined with '\n'.
    fn generate_positioning_reasoning(
        &self,
        positioning: &ContextPositioning,
        query_hint: Option<&str>,
    ) -> String {
        let mut reasoning = Vec::new();

        reasoning.push("🎯 Context Positioning Strategy Applied".to_string());
        reasoning.push("".to_string());

        // HEAD section reasoning
        reasoning.push(format!(
            "📍 HEAD ({} files): Query-specific high centrality files",
            positioning.head_files.len()
        ));
        if let Some(query) = query_hint {
            reasoning.push(format!("   Query hint: '{}'", query));
        }
        // Show at most three example files with both scores.
        for (i, file) in positioning.head_files.iter().take(3).enumerate() {
            reasoning.push(format!(
                "   {}. {} (centrality: {:.3}, relevance: {:.3})",
                i + 1,
                file.metadata
                    .path
                    .file_name()
                    .and_then(|n| n.to_str())
                    .unwrap_or("?"),
                file.centrality.combined,
                file.query_relevance
            ));
        }
        if positioning.head_files.len() > 3 {
            reasoning.push(format!(
                "   ... and {} more files",
                positioning.head_files.len() - 3
            ));
        }
        reasoning.push("".to_string());

        // MIDDLE section reasoning (count only; these files are low priority)
        reasoning.push(format!(
            "🔄 MIDDLE ({} files): Supporting utilities and low-centrality files",
            positioning.middle_files.len()
        ));
        reasoning.push("".to_string());

        // TAIL section reasoning
        reasoning.push(format!(
            "🏛️ TAIL ({} files): Core functionality, high centrality",
            positioning.tail_files.len()
        ));
        for (i, file) in positioning.tail_files.iter().take(3).enumerate() {
            reasoning.push(format!(
                "   {}. {} (centrality: {:.3})",
                i + 1,
                file.metadata
                    .path
                    .file_name()
                    .and_then(|n| n.to_str())
                    .unwrap_or("?"),
                file.centrality.combined
            ));
        }
        if positioning.tail_files.len() > 3 {
            reasoning.push(format!(
                "   ... and {} more files",
                positioning.tail_files.len() - 3
            ));
        }

        reasoning.join("\n")
    }
779
780    /// Create simple positioning when optimization is disabled
781    fn create_simple_positioning(&self, files: Vec<FileMetadata>) -> PositionedSelection {
782        let files_with_centrality: Vec<FileWithCentrality> = files
783            .into_iter()
784            .map(|metadata| FileWithCentrality {
785                metadata,
786                centrality: CentralityScores::default(),
787                query_relevance: 0.0,
788                relatedness_group: "default".to_string(),
789            })
790            .collect();
791
792        let positioning = ContextPositioning {
793            head_files: Vec::new(),
794            middle_files: files_with_centrality,
795            tail_files: Vec::new(),
796        };
797
798        let total_tokens = self.calculate_total_tokens(&positioning);
799
800        PositionedSelection {
801            positioning,
802            total_tokens,
803            positioning_reasoning: "Context positioning disabled - using default order".to_string(),
804        }
805    }
806
    /// Convert file path to graph key.
    ///
    /// Uses the lossy string form of the full path, so keys are unique per
    /// path but carry the platform's native separators.
    fn file_to_key(&self, path: &Path) -> String {
        path.to_string_lossy().to_string()
    }
811
812    /// Estimate tokens for a file (simplified version)
813    fn estimate_tokens(&self, file: &FileMetadata) -> usize {
814        // Basic token estimation: ~3.5 chars per token
815        let base_tokens = ((file.size as f64) / 3.5) as usize;
816
817        // Language-specific adjustments
818        let multiplier = match file.language.as_str() {
819            "Rust" => 1.3,
820            "JavaScript" | "TypeScript" => 1.2,
821            "Python" => 1.1,
822            "C" | "Go" => 1.0,
823            "JSON" | "YAML" | "TOML" => 0.7,
824            _ => 1.0,
825        };
826
827        (base_tokens as f64 * multiplier) as usize
828    }
829
    /// Smart test file detection based on common patterns.
    ///
    /// Delegates to `scribe_core::file::is_test_path`; see that helper for
    /// the exact patterns recognized.
    fn is_test_file(&self, path: &Path) -> bool {
        file::is_test_path(path)
    }
834}
835
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::{Path, PathBuf};
    use std::time::SystemTime;

    /// Build a minimal `FileMetadata` fixture; only Rust files are tagged
    /// with the "Source" file type, everything else is "Other".
    fn create_test_file(path: &str, size: u64, language: &str) -> FileMetadata {
        let file_type = if language == "Rust" { "Source" } else { "Other" };
        FileMetadata {
            path: PathBuf::from(path),
            size,
            modified: SystemTime::now(),
            language: language.to_string(),
            file_type: file_type.to_string(),
        }
    }

    #[tokio::test]
    async fn test_context_positioner_creation() {
        let positioner = ContextPositioner::with_defaults();
        assert!(positioner.config.enable_positioning);
        assert_eq!(positioner.config.head_percentage, 0.20);
        assert_eq!(positioner.config.tail_percentage, 0.20);
    }

    #[tokio::test]
    async fn test_centrality_calculation() {
        let positioner = ContextPositioner::with_defaults();

        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/lib.rs", 2000, "Rust"),
            create_test_file("src/utils.rs", 500, "Rust"),
        ];

        let files_with_centrality = positioner.calculate_centrality_scores(files).await.unwrap();
        assert_eq!(files_with_centrality.len(), 3);

        // All files should have some centrality score
        for file in &files_with_centrality {
            assert!(file.centrality.combined >= 0.0);
            assert!(file.centrality.degree >= 0.0);
            assert!(file.centrality.pagerank >= 0.0);
            assert!(file.centrality.betweenness >= 0.0);
        }

        // At least one file should have higher centrality than another
        let max_centrality = files_with_centrality
            .iter()
            .map(|f| f.centrality.combined)
            .fold(0.0, f64::max);
        let min_centrality = files_with_centrality
            .iter()
            .map(|f| f.centrality.combined)
            .fold(1.0, f64::min);

        // Allow for equal centrality scores in simple cases
        assert!(max_centrality >= min_centrality);
    }

    #[tokio::test]
    async fn test_positioning_strategy() {
        let positioner = ContextPositioner::with_defaults();

        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/lib.rs", 2000, "Rust"),
            create_test_file("src/utils.rs", 500, "Rust"),
            create_test_file("tests/integration.rs", 800, "Rust"),
            create_test_file("README.md", 300, "Markdown"),
        ];

        let result = positioner
            .position_files(files, Some("main"))
            .await
            .unwrap();

        // Should have files in all three tiers
        assert!(!result.positioning.head_files.is_empty());
        assert!(!result.positioning.middle_files.is_empty());
        assert!(!result.positioning.tail_files.is_empty());

        // Total should equal original count
        let total = result.positioning.head_files.len()
            + result.positioning.middle_files.len()
            + result.positioning.tail_files.len();
        assert_eq!(total, 5);

        // Reasoning should be provided
        assert!(!result.positioning_reasoning.is_empty());
        assert!(result.positioning_reasoning.contains("HEAD"));
        assert!(result.positioning_reasoning.contains("TAIL"));
    }

    #[tokio::test]
    async fn test_query_relevance() {
        let positioner = ContextPositioner::with_defaults();

        let files = vec![
            FileWithCentrality {
                metadata: create_test_file("src/main.rs", 1000, "Rust"),
                centrality: CentralityScores::default(),
                query_relevance: 0.0,
                relatedness_group: String::new(),
            },
            FileWithCentrality {
                metadata: create_test_file("src/utils.rs", 500, "Rust"),
                centrality: CentralityScores::default(),
                query_relevance: 0.0,
                relatedness_group: String::new(),
            },
        ];

        let result = positioner
            .calculate_query_relevance(files, Some("main"))
            .await
            .unwrap();

        // main.rs should have higher query relevance for "main" query
        let main_relevance = result
            .iter()
            .find(|f| f.metadata.path.to_string_lossy().contains("main.rs"))
            .unwrap();
        let utils_relevance = result
            .iter()
            .find(|f| f.metadata.path.to_string_lossy().contains("utils.rs"))
            .unwrap();

        assert!(main_relevance.query_relevance > utils_relevance.query_relevance);
    }

    #[test]
    fn test_relatedness_grouping() {
        let positioner = ContextPositioner::with_defaults();

        let file = create_test_file("src/api/handlers.rs", 1000, "Rust");
        let group = positioner.determine_relatedness_group(&file);

        assert!(group.contains("src/api"));
        assert!(group.contains("Rust"));
    }

    #[test]
    fn test_token_estimation() {
        let positioner = ContextPositioner::with_defaults();

        let rust_file = create_test_file("src/main.rs", 1000, "Rust");
        let json_file = create_test_file("package.json", 1000, "JSON");

        let rust_tokens = positioner.estimate_tokens(&rust_file);
        let json_tokens = positioner.estimate_tokens(&json_file);

        // Rust should have more tokens than JSON for same file size
        assert!(rust_tokens > json_tokens);
    }

    #[test]
    fn test_is_test_file_detection() {
        let positioner = ContextPositioner::with_defaults();

        // Test directory patterns
        // (`Path::new` already returns `&Path`, so no extra borrow is needed)
        assert!(positioner.is_test_file(Path::new("src/test/utils.rs")));
        assert!(positioner.is_test_file(Path::new("src/tests/integration.py")));
        assert!(positioner.is_test_file(Path::new("__tests__/component.test.js")));

        // Test file name patterns
        assert!(positioner.is_test_file(Path::new("test_utils.py")));
        assert!(positioner.is_test_file(Path::new("utils_test.rs")));
        assert!(positioner.is_test_file(Path::new("component.test.tsx")));
        assert!(positioner.is_test_file(Path::new("service.spec.ts")));
        assert!(positioner.is_test_file(Path::new("model_test.go")));

        // Language-specific patterns
        assert!(positioner.is_test_file(Path::new("UserTest.java")));
        assert!(positioner.is_test_file(Path::new("user_spec.rb")));
        assert!(positioner.is_test_file(Path::new("UserTest.php")));

        // Non-test files should not be detected
        assert!(!positioner.is_test_file(Path::new("src/main.rs")));
        assert!(!positioner.is_test_file(Path::new("lib/utils.py")));
        assert!(!positioner.is_test_file(Path::new("components/Button.tsx")));
        assert!(!positioner.is_test_file(Path::new("README.md")));
        assert!(!positioner.is_test_file(Path::new("package.json")));
    }

    #[tokio::test]
    async fn test_auto_exclude_tests() {
        // Struct-update syntax avoids the default-then-reassign pattern
        // (clippy::field_reassign_with_default).
        let config = ContextPositioningConfig {
            auto_exclude_tests: true,
            ..Default::default()
        };
        let positioner = ContextPositioner::new(config);

        // Create mix of test and non-test files
        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/lib.rs", 800, "Rust"),
            create_test_file("src/tests/integration_test.rs", 1200, "Rust"),
            create_test_file("test/unit_test.py", 600, "Python"),
            create_test_file("components/Button.tsx", 900, "TypeScript"),
            create_test_file("__tests__/Button.test.tsx", 700, "TypeScript"),
        ];

        let result = positioner.position_files(files, None).await.unwrap();

        // Should have filtered out test files
        let all_files: Vec<&FileWithCentrality> = result
            .positioning
            .head_files
            .iter()
            .chain(result.positioning.middle_files.iter())
            .chain(result.positioning.tail_files.iter())
            .collect();

        // Should only have non-test files (3 out of 6)
        assert_eq!(all_files.len(), 3);

        // Verify no test files remain
        for file in &all_files {
            let path_str = file.metadata.path.to_string_lossy();
            assert!(!path_str.contains("test"));
            assert!(!path_str.contains("__tests__"));
        }

        // Verify we have the expected non-test files (reuse the collected
        // tier chain instead of rebuilding it).
        let file_names: Vec<String> = all_files
            .iter()
            .map(|f| {
                f.metadata
                    .path
                    .file_name()
                    .unwrap()
                    .to_string_lossy()
                    .to_string()
            })
            .collect();

        assert!(file_names.contains(&"main.rs".to_string()));
        assert!(file_names.contains(&"lib.rs".to_string()));
        assert!(file_names.contains(&"Button.tsx".to_string()));
    }

    #[tokio::test]
    async fn test_auto_exclude_disabled() {
        // Explicitly disabled via struct-update syntax.
        let config = ContextPositioningConfig {
            auto_exclude_tests: false,
            ..Default::default()
        };
        let positioner = ContextPositioner::new(config);

        // Create mix of test and non-test files
        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/tests/integration_test.rs", 1200, "Rust"),
            create_test_file("test_utils.py", 600, "Python"),
        ];

        let result = positioner.position_files(files, None).await.unwrap();

        // Should include all files when auto-exclude is disabled
        let all_files: Vec<&FileWithCentrality> = result
            .positioning
            .head_files
            .iter()
            .chain(result.positioning.middle_files.iter())
            .chain(result.positioning.tail_files.iter())
            .collect();

        // Should have all 3 files including test files
        assert_eq!(all_files.len(), 3);
    }
}