scribe_scaling/
positioning.rs

//! Context Positioning Optimization
//!
//! Strategic file positioning based on transformer model attention patterns.
//! Models have better reasoning at the head and tail of context, so we position:
//! - HEAD (20%): Query-specific high centrality files
//! - MIDDLE (60%): Low centrality supporting files
//! - TAIL (20%): Core functionality, high centrality files
9use petgraph::algo::kosaraju_scc;
10use petgraph::visit::EdgeRef;
11use petgraph::{graph::NodeIndex, Directed, Graph};
12use rayon::prelude::*;
13use serde::{Deserialize, Serialize};
14use std::collections::{HashMap, HashSet};
15use std::path::Path;
16use tracing::{debug, info, warn};
17
18use crate::error::{ScalingError, ScalingResult};
19use crate::streaming::FileMetadata;
20
/// Configuration for context positioning optimization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContextPositioningConfig {
    /// Enable context positioning optimization. When false, incoming file
    /// order is preserved and everything lands in the MIDDLE tier.
    pub enable_positioning: bool,

    /// Fraction (0.0..=1.0) of files placed in the HEAD tier
    /// (query-relevant, high-centrality files).
    pub head_percentage: f64,

    /// Fraction (0.0..=1.0) of files placed in the TAIL tier
    /// (core functionality, high-centrality files).
    pub tail_percentage: f64,

    /// Weight applied to combined centrality when scoring HEAD candidates.
    pub centrality_weight: f64,

    /// Weight for file relatedness in grouping decisions.
    // NOTE(review): not read anywhere in this module's visible code — confirm external use.
    pub relatedness_weight: f64,

    /// Weight applied to query relevance when scoring HEAD candidates.
    pub query_relevance_weight: f64,

    /// Auto-exclude test files from selection (focuses on code and docs only).
    pub auto_exclude_tests: bool,
}
45
impl Default for ContextPositioningConfig {
    // Defaults: 20% HEAD / 20% TAIL (remaining ~60% MIDDLE);
    // scoring weights (0.4 + 0.3 + 0.3) sum to 1.0; tests are kept.
    fn default() -> Self {
        Self {
            enable_positioning: true,
            head_percentage: 0.20,
            tail_percentage: 0.20,
            centrality_weight: 0.4,
            relatedness_weight: 0.3,
            query_relevance_weight: 0.3,
            auto_exclude_tests: false,
        }
    }
}
59
/// Centrality scores for files in the codebase.
///
/// Individual measures are normalized where a normalizer exists; `combined`
/// is the weighted blend produced by `calculate_all_centralities`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CentralityScores {
    /// Betweenness centrality: files connecting different parts of the graph.
    pub betweenness: f64,

    /// PageRank centrality: heavily referenced files.
    pub pagerank: f64,

    /// Degree centrality: files with many connections.
    pub degree: f64,

    /// Combined score: degree*0.3 + pagerank*0.5 + betweenness*0.2.
    pub combined: f64,
}
75
/// File with centrality and positioning metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileWithCentrality {
    /// Underlying file information (path, language, size, …).
    pub metadata: FileMetadata,
    /// Graph-centrality scores for this file.
    pub centrality: CentralityScores,
    /// Relevance to the query hint; 0.0 when no hint was provided.
    pub query_relevance: f64,
    /// Clustering key (directory prefix + language) used to keep related files adjacent.
    pub relatedness_group: String,
}
84
/// Three-tier context positioning structure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContextPositioning {
    /// HEAD: query-specific, high-centrality files (first ~20% of context).
    pub head_files: Vec<FileWithCentrality>,

    /// MIDDLE: low-centrality supporting files (~60% of context).
    pub middle_files: Vec<FileWithCentrality>,

    /// TAIL: core functionality, high-centrality files (last ~20% of context).
    pub tail_files: Vec<FileWithCentrality>,
}
97
/// Result of context positioning with reasoning.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PositionedSelection {
    /// Files arranged into HEAD / MIDDLE / TAIL tiers.
    pub positioning: ContextPositioning,
    /// Estimated token total across all three tiers.
    pub total_tokens: usize,
    /// Human-readable explanation of how the tiers were assigned.
    pub positioning_reasoning: String,
}
105
/// Context positioning optimizer.
///
/// Arranges selected files into HEAD / MIDDLE / TAIL tiers so the most
/// important content sits where model attention is strongest.
pub struct ContextPositioner {
    /// Tier sizes and scoring weights.
    config: ContextPositioningConfig,
}
110
111impl ContextPositioner {
112    /// Create new context positioner with configuration
113    pub fn new(config: ContextPositioningConfig) -> Self {
114        Self { config }
115    }
116
117    /// Create with default configuration
118    pub fn with_defaults() -> Self {
119        Self::new(ContextPositioningConfig::default())
120    }
121
122    /// Apply context positioning to selected files
123    pub async fn position_files(
124        &self,
125        files: Vec<FileMetadata>,
126        query_hint: Option<&str>,
127    ) -> ScalingResult<PositionedSelection> {
128        if !self.config.enable_positioning || files.is_empty() {
129            return Ok(self.create_simple_positioning(files));
130        }
131
132        // Filter out test files if auto-exclude is enabled
133        let filtered_files = if self.config.auto_exclude_tests {
134            let original_count = files.len();
135            let non_test_files: Vec<FileMetadata> = files
136                .into_iter()
137                .filter(|file| !self.is_test_file(&file.path))
138                .collect();
139            let filtered_count = non_test_files.len();
140
141            if original_count != filtered_count {
142                info!(
143                    "Auto-excluded {} test files, {} files remaining",
144                    original_count - filtered_count,
145                    filtered_count
146                );
147            }
148
149            non_test_files
150        } else {
151            files
152        };
153
154        info!(
155            "Starting context positioning for {} files",
156            filtered_files.len()
157        );
158
159        // Phase 1: Calculate centrality scores for all files
160        let files_with_centrality = self.calculate_centrality_scores(filtered_files).await?;
161
162        // Phase 2: Calculate query relevance if hint provided
163        let files_with_relevance = self
164            .calculate_query_relevance(files_with_centrality, query_hint)
165            .await?;
166
167        // Phase 3: Group by relatedness
168        let files_with_groups = self.group_by_relatedness(files_with_relevance).await?;
169
170        // Phase 4: Apply three-tier positioning strategy
171        let positioning = self.apply_positioning_strategy(files_with_groups).await?;
172
173        // Phase 5: Calculate total tokens and generate reasoning
174        let total_tokens = self.calculate_total_tokens(&positioning);
175        let reasoning = self.generate_positioning_reasoning(&positioning, query_hint);
176
177        info!(
178            "Context positioning complete: HEAD={}, MIDDLE={}, TAIL={}",
179            positioning.head_files.len(),
180            positioning.middle_files.len(),
181            positioning.tail_files.len()
182        );
183
184        Ok(PositionedSelection {
185            positioning,
186            total_tokens,
187            positioning_reasoning: reasoning,
188        })
189    }
190
191    /// Calculate centrality scores for all files using optimized algorithms
192    async fn calculate_centrality_scores(
193        &self,
194        files: Vec<FileMetadata>,
195    ) -> ScalingResult<Vec<FileWithCentrality>> {
196        debug!("Calculating centrality scores for {} files", files.len());
197
198        if files.is_empty() {
199            return Ok(Vec::new());
200        }
201
202        // Build optimized dependency graph
203        let (graph, node_map) = self.build_dependency_graph(&files).await?;
204
205        // Calculate all centrality measures efficiently using petgraph
206        let centrality_scores = self.calculate_all_centralities(&graph, &node_map).await?;
207
208        // Map centrality scores back to files in parallel
209        let files_with_centrality: Vec<FileWithCentrality> = files
210            .into_par_iter()
211            .map(|file| {
212                let file_key = self.file_to_key(&file.path);
213                let centrality = centrality_scores
214                    .get(&file_key)
215                    .cloned()
216                    .unwrap_or_default();
217
218                FileWithCentrality {
219                    metadata: file,
220                    centrality,
221                    query_relevance: 0.0,             // Will be set later
222                    relatedness_group: String::new(), // Will be set later
223                }
224            })
225            .collect();
226
227        debug!(
228            "Calculated centrality for {} files",
229            files_with_centrality.len()
230        );
231        Ok(files_with_centrality)
232    }
233
234    /// Build dependency graph from file relationships using petgraph
235    async fn build_dependency_graph(
236        &self,
237        files: &[FileMetadata],
238    ) -> ScalingResult<(Graph<String, (), Directed>, HashMap<String, NodeIndex>)> {
239        let mut graph = Graph::new();
240        let mut node_map = HashMap::new();
241
242        // First pass: create nodes for all files
243        for file in files {
244            let file_key = self.file_to_key(&file.path);
245            let node_idx = graph.add_node(file_key.clone());
246            node_map.insert(file_key, node_idx);
247        }
248
249        // Second pass: create edges based on dependencies
250        for file in files {
251            let file_key = self.file_to_key(&file.path);
252            let dependencies = self.extract_dependencies(file).await?;
253
254            if let Some(&from_idx) = node_map.get(&file_key) {
255                for dep in dependencies {
256                    if let Some(&to_idx) = node_map.get(&dep) {
257                        graph.add_edge(from_idx, to_idx, ());
258                    }
259                }
260            }
261        }
262
263        debug!(
264            "Built dependency graph: {} nodes, {} edges",
265            graph.node_count(),
266            graph.edge_count()
267        );
268
269        Ok((graph, node_map))
270    }
271
272    /// Extract dependencies from a file (imports, includes, etc.)
273    async fn extract_dependencies(&self, file: &FileMetadata) -> ScalingResult<Vec<String>> {
274        // Simple dependency extraction based on file patterns and language
275        let mut dependencies = Vec::new();
276
277        let path_str = file.path.to_string_lossy();
278        let dir_path = file
279            .path
280            .parent()
281            .map(|p| p.to_string_lossy().to_string())
282            .unwrap_or_default();
283
284        // For Rust files, assume mod.rs and lib.rs are central
285        if file.language == "Rust" {
286            let filename = file.path.file_name().and_then(|n| n.to_str()).unwrap_or("");
287
288            if filename == "mod.rs" || filename == "lib.rs" {
289                // These are likely dependency targets
290            } else {
291                // Regular Rust files likely depend on lib.rs or mod.rs
292                dependencies.push(format!("{}/lib.rs", dir_path));
293                dependencies.push(format!("{}/mod.rs", dir_path));
294            }
295        }
296
297        // For Python files, __init__.py files are central
298        if file.language == "Python" {
299            let filename = file.path.file_name().and_then(|n| n.to_str()).unwrap_or("");
300
301            if filename != "__init__.py" {
302                dependencies.push(format!("{}/__init__.py", dir_path));
303            }
304        }
305
306        // For JavaScript/TypeScript, index files are central
307        if file.language == "JavaScript" || file.language == "TypeScript" {
308            dependencies.push(format!("{}/index.js", dir_path));
309            dependencies.push(format!("{}/index.ts", dir_path));
310        }
311
312        // Configuration files often depend on package manifests
313        if file.file_type == "Configuration" {
314            dependencies.push("package.json".to_string());
315            dependencies.push("Cargo.toml".to_string());
316            dependencies.push("pyproject.toml".to_string());
317        }
318
319        Ok(dependencies)
320    }
321
322    /// Calculate all centrality measures efficiently using petgraph algorithms
323    async fn calculate_all_centralities(
324        &self,
325        graph: &Graph<String, (), Directed>,
326        node_map: &HashMap<String, NodeIndex>,
327    ) -> ScalingResult<HashMap<String, CentralityScores>> {
328        let mut centrality_scores = HashMap::new();
329
330        if graph.node_count() == 0 {
331            return Ok(centrality_scores);
332        }
333
334        // Calculate PageRank using simplified approach (petgraph doesn't have built-in PageRank)
335        // We'll implement a basic version or use degree centrality as approximation
336        let pagerank_scores = self.calculate_simple_pagerank(graph, node_map)?;
337
338        // Calculate degree centrality in parallel
339        let degree_scores: Vec<(NodeIndex, f64)> = node_map
340            .par_iter()
341            .map(|(_, &node_idx)| {
342                let in_degree = graph.edges_directed(node_idx, petgraph::Incoming).count();
343                let out_degree = graph.edges_directed(node_idx, petgraph::Outgoing).count();
344                let total_degree = in_degree + out_degree;
345                let max_possible = graph.node_count().saturating_sub(1);
346
347                let normalized_degree = if max_possible == 0 {
348                    0.0
349                } else {
350                    total_degree as f64 / max_possible as f64
351                };
352
353                (node_idx, normalized_degree)
354            })
355            .collect();
356
357        // Calculate betweenness centrality using strongly connected components
358        let betweenness_scores = self.calculate_betweenness_from_scc(graph, node_map)?;
359
360        // Combine all scores
361        for (file_key, &node_idx) in node_map {
362            let pagerank = pagerank_scores
363                .get(node_idx.index())
364                .copied()
365                .unwrap_or(0.0);
366            let degree = degree_scores
367                .iter()
368                .find(|(idx, _)| *idx == node_idx)
369                .map(|(_, score)| *score)
370                .unwrap_or(0.0);
371            let betweenness = betweenness_scores.get(&node_idx).copied().unwrap_or(0.0);
372
373            // Combine centrality scores with weights
374            let combined = (degree * 0.3) + (pagerank * 0.5) + (betweenness * 0.2);
375
376            centrality_scores.insert(
377                file_key.clone(),
378                CentralityScores {
379                    degree,
380                    pagerank,
381                    betweenness,
382                    combined,
383                },
384            );
385        }
386
387        debug!(
388            "Calculated centrality scores for {} files",
389            centrality_scores.len()
390        );
391        Ok(centrality_scores)
392    }
393
394    /// Calculate betweenness centrality using strongly connected components
395    fn calculate_betweenness_from_scc(
396        &self,
397        graph: &Graph<String, (), Directed>,
398        node_map: &HashMap<String, NodeIndex>,
399    ) -> ScalingResult<HashMap<NodeIndex, f64>> {
400        let mut betweenness_scores = HashMap::new();
401
402        // Use Kosaraju's algorithm to find strongly connected components
403        let sccs = kosaraju_scc(graph);
404
405        // Calculate betweenness based on component connectivity
406        for &node_idx in node_map.values() {
407            let mut betweenness = 0.0;
408
409            // Find which SCC this node belongs to
410            let node_scc = sccs.iter().position(|scc| scc.contains(&node_idx));
411
412            if let Some(scc_idx) = node_scc {
413                // Count connections to other SCCs
414                let out_edges: HashSet<usize> = graph
415                    .edges_directed(node_idx, petgraph::Outgoing)
416                    .filter_map(|edge| {
417                        let target = edge.target();
418                        sccs.iter().position(|scc| scc.contains(&target))
419                    })
420                    .filter(|&target_scc| target_scc != scc_idx)
421                    .collect();
422
423                let in_edges: HashSet<usize> = graph
424                    .edges_directed(node_idx, petgraph::Incoming)
425                    .filter_map(|edge| {
426                        let source = edge.source();
427                        sccs.iter().position(|scc| scc.contains(&source))
428                    })
429                    .filter(|&source_scc| source_scc != scc_idx)
430                    .collect();
431
432                // Betweenness is based on how many different components this node connects
433                betweenness = (out_edges.len() + in_edges.len()) as f64;
434
435                // Normalize by maximum possible connections
436                let max_components = sccs.len().saturating_sub(1);
437                if max_components > 0 {
438                    betweenness /= max_components as f64;
439                }
440            }
441
442            betweenness_scores.insert(node_idx, betweenness);
443        }
444
445        Ok(betweenness_scores)
446    }
447
448    /// Calculate simplified PageRank scores
449    fn calculate_simple_pagerank(
450        &self,
451        graph: &Graph<String, (), Directed>,
452        node_map: &HashMap<String, NodeIndex>,
453    ) -> ScalingResult<Vec<f64>> {
454        let node_count = graph.node_count();
455        if node_count == 0 {
456            return Ok(Vec::new());
457        }
458
459        let mut scores = vec![1.0 / node_count as f64; node_count];
460        let damping = 0.85;
461        let iterations = 10; // Simple approximation
462
463        for _ in 0..iterations {
464            let mut new_scores = vec![(1.0 - damping) / node_count as f64; node_count];
465
466            for &node_idx in node_map.values() {
467                let out_degree = graph.edges_directed(node_idx, petgraph::Outgoing).count();
468                if out_degree > 0 {
469                    let contribution = scores[node_idx.index()] * damping / out_degree as f64;
470
471                    for edge in graph.edges_directed(node_idx, petgraph::Outgoing) {
472                        let target_idx = edge.target().index();
473                        new_scores[target_idx] += contribution;
474                    }
475                }
476            }
477
478            scores = new_scores;
479        }
480
481        Ok(scores)
482    }
483
484    /// Calculate query relevance scores if query hint provided
485    async fn calculate_query_relevance(
486        &self,
487        mut files: Vec<FileWithCentrality>,
488        query_hint: Option<&str>,
489    ) -> ScalingResult<Vec<FileWithCentrality>> {
490        if let Some(query) = query_hint {
491            debug!("Calculating query relevance for: {}", query);
492
493            let query_lower = query.to_lowercase();
494            let query_words: Vec<&str> = query_lower.split_whitespace().collect();
495
496            for file in &mut files {
497                file.query_relevance =
498                    self.calculate_file_query_relevance(&file.metadata, &query_words);
499            }
500        }
501
502        Ok(files)
503    }
504
505    /// Calculate query relevance for a single file
506    fn calculate_file_query_relevance(&self, file: &FileMetadata, query_words: &[&str]) -> f64 {
507        let path_str = file.path.to_string_lossy().to_lowercase();
508        let filename = file
509            .path
510            .file_name()
511            .and_then(|n| n.to_str())
512            .unwrap_or("")
513            .to_lowercase();
514
515        let mut relevance = 0.0;
516
517        for word in query_words {
518            // Exact matches in filename get highest score
519            if filename.contains(word) {
520                relevance += 1.0;
521            }
522            // Partial matches in path get medium score
523            else if path_str.contains(word) {
524                relevance += 0.5;
525            }
526            // Language matches get small boost
527            else if file.language.to_lowercase().contains(word) {
528                relevance += 0.2;
529            }
530        }
531
532        // Boost for entry points that might be relevant
533        if filename.contains("main")
534            || filename.contains("index")
535            || filename == "lib.rs"
536            || filename == "__init__.py"
537        {
538            relevance += 0.3;
539        }
540
541        relevance
542    }
543
544    /// Group files by relatedness
545    async fn group_by_relatedness(
546        &self,
547        mut files: Vec<FileWithCentrality>,
548    ) -> ScalingResult<Vec<FileWithCentrality>> {
549        debug!("Grouping {} files by relatedness", files.len());
550
551        for file in &mut files {
552            file.relatedness_group = self.determine_relatedness_group(&file.metadata);
553        }
554
555        Ok(files)
556    }
557
558    /// Determine relatedness group for a file
559    fn determine_relatedness_group(&self, file: &FileMetadata) -> String {
560        let path_str = file.path.to_string_lossy();
561
562        // Group by directory structure (first 2 levels)
563        let path_components: Vec<&str> = path_str.split('/').collect();
564        let group = if path_components.len() >= 2 {
565            format!("{}/{}", path_components[0], path_components[1])
566        } else if path_components.len() == 1 {
567            path_components[0].to_string()
568        } else {
569            "root".to_string()
570        };
571
572        // Add language suffix for better grouping
573        format!("{}::{}", group, file.language)
574    }
575
576    /// Apply three-tier positioning strategy
577    async fn apply_positioning_strategy(
578        &self,
579        files: Vec<FileWithCentrality>,
580    ) -> ScalingResult<ContextPositioning> {
581        if files.is_empty() {
582            return Ok(ContextPositioning {
583                head_files: Vec::new(),
584                middle_files: Vec::new(),
585                tail_files: Vec::new(),
586            });
587        }
588
589        let total_files = files.len();
590        let head_count = ((total_files as f64 * self.config.head_percentage) as usize).max(1);
591        let tail_count = ((total_files as f64 * self.config.tail_percentage) as usize).max(1);
592
593        debug!(
594            "Positioning strategy: HEAD={}, TAIL={}, MIDDLE={}",
595            head_count,
596            tail_count,
597            total_files - head_count - tail_count
598        );
599
600        // Sort files for HEAD positioning: query relevance + centrality
601        let mut head_candidates = files.clone();
602        head_candidates.sort_by(|a, b| {
603            let score_a = (a.query_relevance * self.config.query_relevance_weight)
604                + (a.centrality.combined * self.config.centrality_weight);
605            let score_b = (b.query_relevance * self.config.query_relevance_weight)
606                + (b.centrality.combined * self.config.centrality_weight);
607            score_b
608                .partial_cmp(&score_a)
609                .unwrap_or(std::cmp::Ordering::Equal)
610        });
611
612        // Sort files for TAIL positioning: pure centrality
613        let mut tail_candidates = files.clone();
614        tail_candidates.sort_by(|a, b| {
615            b.centrality
616                .combined
617                .partial_cmp(&a.centrality.combined)
618                .unwrap_or(std::cmp::Ordering::Equal)
619        });
620
621        // Select HEAD files (query-relevant + high centrality)
622        let mut selected_files = HashSet::new();
623        let mut head_files = Vec::new();
624
625        for file in head_candidates.into_iter().take(head_count) {
626            let file_key = self.file_to_key(&file.metadata.path);
627            selected_files.insert(file_key);
628            head_files.push(file);
629        }
630
631        // Select TAIL files (high centrality, not already in head)
632        let mut tail_files = Vec::new();
633        for file in tail_candidates {
634            if tail_files.len() >= tail_count {
635                break;
636            }
637            let file_key = self.file_to_key(&file.metadata.path);
638            if !selected_files.contains(&file_key) {
639                selected_files.insert(file_key);
640                tail_files.push(file);
641            }
642        }
643
644        // Remaining files go to MIDDLE
645        let mut middle_files = Vec::new();
646        for file in files {
647            let file_key = self.file_to_key(&file.metadata.path);
648            if !selected_files.contains(&file_key) {
649                middle_files.push(file);
650            }
651        }
652
653        // Group related files within each tier
654        self.group_within_tier(&mut head_files);
655        self.group_within_tier(&mut middle_files);
656        self.group_within_tier(&mut tail_files);
657
658        Ok(ContextPositioning {
659            head_files,
660            middle_files,
661            tail_files,
662        })
663    }
664
665    /// Group related files within a tier to improve locality
666    fn group_within_tier(&self, files: &mut Vec<FileWithCentrality>) {
667        files.sort_by(|a, b| {
668            // Primary sort: relatedness group
669            let group_cmp = a.relatedness_group.cmp(&b.relatedness_group);
670            if group_cmp != std::cmp::Ordering::Equal {
671                return group_cmp;
672            }
673
674            // Secondary sort: centrality within group
675            b.centrality
676                .combined
677                .partial_cmp(&a.centrality.combined)
678                .unwrap_or(std::cmp::Ordering::Equal)
679        });
680    }
681
682    /// Calculate total tokens for positioned files
683    fn calculate_total_tokens(&self, positioning: &ContextPositioning) -> usize {
684        let head_tokens = positioning
685            .head_files
686            .iter()
687            .map(|f| self.estimate_tokens(&f.metadata))
688            .sum::<usize>();
689
690        let middle_tokens = positioning
691            .middle_files
692            .iter()
693            .map(|f| self.estimate_tokens(&f.metadata))
694            .sum::<usize>();
695
696        let tail_tokens = positioning
697            .tail_files
698            .iter()
699            .map(|f| self.estimate_tokens(&f.metadata))
700            .sum::<usize>();
701
702        head_tokens + middle_tokens + tail_tokens
703    }
704
705    /// Generate positioning reasoning explanation
706    fn generate_positioning_reasoning(
707        &self,
708        positioning: &ContextPositioning,
709        query_hint: Option<&str>,
710    ) -> String {
711        let mut reasoning = Vec::new();
712
713        reasoning.push("🎯 Context Positioning Strategy Applied".to_string());
714        reasoning.push("".to_string());
715
716        // HEAD section reasoning
717        reasoning.push(format!(
718            "📍 HEAD ({} files): Query-specific high centrality files",
719            positioning.head_files.len()
720        ));
721        if let Some(query) = query_hint {
722            reasoning.push(format!("   Query hint: '{}'", query));
723        }
724        for (i, file) in positioning.head_files.iter().take(3).enumerate() {
725            reasoning.push(format!(
726                "   {}. {} (centrality: {:.3}, relevance: {:.3})",
727                i + 1,
728                file.metadata
729                    .path
730                    .file_name()
731                    .and_then(|n| n.to_str())
732                    .unwrap_or("?"),
733                file.centrality.combined,
734                file.query_relevance
735            ));
736        }
737        if positioning.head_files.len() > 3 {
738            reasoning.push(format!(
739                "   ... and {} more files",
740                positioning.head_files.len() - 3
741            ));
742        }
743        reasoning.push("".to_string());
744
745        // MIDDLE section reasoning
746        reasoning.push(format!(
747            "🔄 MIDDLE ({} files): Supporting utilities and low-centrality files",
748            positioning.middle_files.len()
749        ));
750        reasoning.push("".to_string());
751
752        // TAIL section reasoning
753        reasoning.push(format!(
754            "🏛️ TAIL ({} files): Core functionality, high centrality",
755            positioning.tail_files.len()
756        ));
757        for (i, file) in positioning.tail_files.iter().take(3).enumerate() {
758            reasoning.push(format!(
759                "   {}. {} (centrality: {:.3})",
760                i + 1,
761                file.metadata
762                    .path
763                    .file_name()
764                    .and_then(|n| n.to_str())
765                    .unwrap_or("?"),
766                file.centrality.combined
767            ));
768        }
769        if positioning.tail_files.len() > 3 {
770            reasoning.push(format!(
771                "   ... and {} more files",
772                positioning.tail_files.len() - 3
773            ));
774        }
775
776        reasoning.join("\n")
777    }
778
779    /// Create simple positioning when optimization is disabled
780    fn create_simple_positioning(&self, files: Vec<FileMetadata>) -> PositionedSelection {
781        let files_with_centrality: Vec<FileWithCentrality> = files
782            .into_iter()
783            .map(|metadata| FileWithCentrality {
784                metadata,
785                centrality: CentralityScores::default(),
786                query_relevance: 0.0,
787                relatedness_group: "default".to_string(),
788            })
789            .collect();
790
791        let positioning = ContextPositioning {
792            head_files: Vec::new(),
793            middle_files: files_with_centrality,
794            tail_files: Vec::new(),
795        };
796
797        let total_tokens = self.calculate_total_tokens(&positioning);
798
799        PositionedSelection {
800            positioning,
801            total_tokens,
802            positioning_reasoning: "Context positioning disabled - using default order".to_string(),
803        }
804    }
805
806    /// Convert file path to graph key
807    fn file_to_key(&self, path: &Path) -> String {
808        path.to_string_lossy().to_string()
809    }
810
811    /// Estimate tokens for a file (simplified version)
812    fn estimate_tokens(&self, file: &FileMetadata) -> usize {
813        // Basic token estimation: ~3.5 chars per token
814        let base_tokens = ((file.size as f64) / 3.5) as usize;
815
816        // Language-specific adjustments
817        let multiplier = match file.language.as_str() {
818            "Rust" => 1.3,
819            "JavaScript" | "TypeScript" => 1.2,
820            "Python" => 1.1,
821            "C" | "Go" => 1.0,
822            "JSON" | "YAML" | "TOML" => 0.7,
823            _ => 1.0,
824        };
825
826        (base_tokens as f64 * multiplier) as usize
827    }
828
829    /// Smart test file detection based on common patterns
830    fn is_test_file(&self, path: &Path) -> bool {
831        let path_str = path.to_string_lossy().to_lowercase();
832        let file_name = path
833            .file_name()
834            .map(|s| s.to_string_lossy().to_lowercase())
835            .unwrap_or_default();
836
837        // Test directory patterns
838        if path_str.contains("/test/")
839            || path_str.contains("/tests/")
840            || path_str.contains("\\test\\")
841            || path_str.contains("\\tests\\")
842            || path_str.contains("/__tests__/")
843            || path_str.contains("\\__tests__\\")
844        {
845            return true;
846        }
847
848        // Test file name patterns
849        if file_name.starts_with("test_")
850            || file_name.ends_with("_test.rs")
851            || file_name.ends_with("_test.py")
852            || file_name.ends_with("_test.js")
853            || file_name.ends_with("_test.ts")
854            || file_name.ends_with(".test.js")
855            || file_name.ends_with(".test.ts")
856            || file_name.ends_with(".test.jsx")
857            || file_name.ends_with(".test.tsx")
858            || file_name.ends_with(".spec.js")
859            || file_name.ends_with(".spec.ts")
860            || file_name.ends_with(".spec.jsx")
861            || file_name.ends_with(".spec.tsx")
862            || file_name.ends_with("_spec.py")
863            || file_name.ends_with("_spec.rb")
864        {
865            return true;
866        }
867
868        // Language-specific test patterns
869        match path.extension().and_then(|s| s.to_str()) {
870            Some("rs") => {
871                // Rust: mod tests, #[cfg(test)]
872                file_name.contains("test")
873                    && (file_name.starts_with("test_")
874                        || file_name.ends_with("_test.rs")
875                        || path_str.contains("/tests/"))
876            }
877            Some("py") => {
878                // Python: test_*.py, *_test.py, pytest patterns
879                file_name.starts_with("test_")
880                    || file_name.ends_with("_test.py")
881                    || file_name.contains("test_")
882            }
883            Some("go") => {
884                // Go: *_test.go
885                file_name.ends_with("_test.go")
886            }
887            Some("java") | Some("kt") => {
888                // Java/Kotlin: *Test.java, *Tests.java
889                file_name.ends_with("test.java")
890                    || file_name.ends_with("tests.java")
891                    || file_name.ends_with("test.kt")
892                    || file_name.ends_with("tests.kt")
893                    || path_str.contains("/test/")
894                    || path_str.contains("/tests/")
895            }
896            Some("js") | Some("ts") | Some("jsx") | Some("tsx") => {
897                // JavaScript/TypeScript: comprehensive test patterns
898                file_name.contains(".test.")
899                    || file_name.contains(".spec.")
900                    || file_name.ends_with(".test.js")
901                    || file_name.ends_with(".test.ts")
902                    || file_name.ends_with(".spec.js")
903                    || file_name.ends_with(".spec.ts")
904                    || path_str.contains("/__tests__/")
905                    || path_str.contains("/test/")
906                    || path_str.contains("/tests/")
907            }
908            Some("rb") => {
909                // Ruby: *_test.rb, *_spec.rb, spec/ and test/ directories
910                file_name.ends_with("_test.rb")
911                    || file_name.ends_with("_spec.rb")
912                    || path_str.contains("/spec/")
913                    || path_str.contains("/test/")
914            }
915            Some("php") => {
916                // PHP: *Test.php, *_test.php
917                file_name.ends_with("test.php")
918                    || file_name.ends_with("_test.php")
919                    || file_name.contains("test") && path_str.contains("/test")
920            }
921            _ => false,
922        }
923    }
924}
925
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use std::time::SystemTime;

    // Fixture helper: builds a FileMetadata with the given path, byte size and
    // language. `file_type` is "Source" for Rust fixtures and "Other" otherwise.
    fn create_test_file(path: &str, size: u64, language: &str) -> FileMetadata {
        FileMetadata {
            path: PathBuf::from(path),
            size,
            modified: SystemTime::now(),
            language: language.to_string(),
            file_type: if language == "Rust" {
                "Source"
            } else {
                "Other"
            }
            .to_string(),
        }
    }

    // Default construction should enable positioning with the documented
    // 20% HEAD / 20% TAIL split.
    #[tokio::test]
    async fn test_context_positioner_creation() {
        let positioner = ContextPositioner::with_defaults();
        assert!(positioner.config.enable_positioning);
        assert_eq!(positioner.config.head_percentage, 0.20);
        assert_eq!(positioner.config.tail_percentage, 0.20);
    }

    // Every file should receive non-negative centrality component scores;
    // exact values are not pinned because the graph is tiny.
    #[tokio::test]
    async fn test_centrality_calculation() {
        let positioner = ContextPositioner::with_defaults();

        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/lib.rs", 2000, "Rust"),
            create_test_file("src/utils.rs", 500, "Rust"),
        ];

        let files_with_centrality = positioner.calculate_centrality_scores(files).await.unwrap();
        assert_eq!(files_with_centrality.len(), 3);

        // All files should have some centrality score
        for file in &files_with_centrality {
            assert!(file.centrality.combined >= 0.0);
            assert!(file.centrality.degree >= 0.0);
            assert!(file.centrality.pagerank >= 0.0);
            assert!(file.centrality.betweenness >= 0.0);
        }

        // At least one file should have higher centrality than another
        let max_centrality = files_with_centrality
            .iter()
            .map(|f| f.centrality.combined)
            .fold(0.0, f64::max);
        let min_centrality = files_with_centrality
            .iter()
            .map(|f| f.centrality.combined)
            .fold(1.0, f64::min);

        // Allow for equal centrality scores in simple cases
        assert!(max_centrality >= min_centrality);
    }

    // Positioning should distribute files across HEAD/MIDDLE/TAIL without
    // losing any, and produce a human-readable reasoning string.
    #[tokio::test]
    async fn test_positioning_strategy() {
        let positioner = ContextPositioner::with_defaults();

        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/lib.rs", 2000, "Rust"),
            create_test_file("src/utils.rs", 500, "Rust"),
            create_test_file("tests/integration.rs", 800, "Rust"),
            create_test_file("README.md", 300, "Markdown"),
        ];

        let result = positioner
            .position_files(files, Some("main"))
            .await
            .unwrap();

        // Should have files in all three tiers
        assert!(!result.positioning.head_files.is_empty());
        assert!(!result.positioning.middle_files.is_empty());
        assert!(!result.positioning.tail_files.is_empty());

        // Total should equal original count
        let total = result.positioning.head_files.len()
            + result.positioning.middle_files.len()
            + result.positioning.tail_files.len();
        assert_eq!(total, 5);

        // Reasoning should be provided
        assert!(!result.positioning_reasoning.is_empty());
        assert!(result.positioning_reasoning.contains("HEAD"));
        assert!(result.positioning_reasoning.contains("TAIL"));
    }

    // A file whose name matches the query term should score higher relevance
    // than one that does not.
    #[tokio::test]
    async fn test_query_relevance() {
        let positioner = ContextPositioner::with_defaults();

        let files = vec![
            FileWithCentrality {
                metadata: create_test_file("src/main.rs", 1000, "Rust"),
                centrality: CentralityScores::default(),
                query_relevance: 0.0,
                relatedness_group: String::new(),
            },
            FileWithCentrality {
                metadata: create_test_file("src/utils.rs", 500, "Rust"),
                centrality: CentralityScores::default(),
                query_relevance: 0.0,
                relatedness_group: String::new(),
            },
        ];

        let result = positioner
            .calculate_query_relevance(files, Some("main"))
            .await
            .unwrap();

        // main.rs should have higher query relevance for "main" query
        let main_relevance = result
            .iter()
            .find(|f| f.metadata.path.to_string_lossy().contains("main.rs"))
            .unwrap();
        let utils_relevance = result
            .iter()
            .find(|f| f.metadata.path.to_string_lossy().contains("utils.rs"))
            .unwrap();

        assert!(main_relevance.query_relevance > utils_relevance.query_relevance);
    }

    // Relatedness groups appear to combine the parent directory and the
    // language (both substrings are asserted).
    #[test]
    fn test_relatedness_grouping() {
        let positioner = ContextPositioner::with_defaults();

        let file = create_test_file("src/api/handlers.rs", 1000, "Rust");
        let group = positioner.determine_relatedness_group(&file);

        assert!(group.contains("src/api"));
        assert!(group.contains("Rust"));
    }

    // Language multipliers: Rust (1.3) must yield more tokens than JSON (0.7)
    // for the same byte size.
    #[test]
    fn test_token_estimation() {
        let positioner = ContextPositioner::with_defaults();

        let rust_file = create_test_file("src/main.rs", 1000, "Rust");
        let json_file = create_test_file("package.json", 1000, "JSON");

        let rust_tokens = positioner.estimate_tokens(&rust_file);
        let json_tokens = positioner.estimate_tokens(&json_file);

        // Rust should have more tokens than JSON for same file size
        assert!(rust_tokens > json_tokens);
    }

    // Exercises directory-based, file-name-based, and language-specific test
    // detection, plus representative negatives.
    #[test]
    fn test_is_test_file_detection() {
        let positioner = ContextPositioner::with_defaults();

        // Test directory patterns
        assert!(positioner.is_test_file(&std::path::Path::new("src/test/utils.rs")));
        assert!(positioner.is_test_file(&std::path::Path::new("src/tests/integration.py")));
        assert!(positioner.is_test_file(&std::path::Path::new("__tests__/component.test.js")));

        // Test file name patterns
        assert!(positioner.is_test_file(&std::path::Path::new("test_utils.py")));
        assert!(positioner.is_test_file(&std::path::Path::new("utils_test.rs")));
        assert!(positioner.is_test_file(&std::path::Path::new("component.test.tsx")));
        assert!(positioner.is_test_file(&std::path::Path::new("service.spec.ts")));
        assert!(positioner.is_test_file(&std::path::Path::new("model_test.go")));

        // Language-specific patterns
        assert!(positioner.is_test_file(&std::path::Path::new("UserTest.java")));
        assert!(positioner.is_test_file(&std::path::Path::new("user_spec.rb")));
        assert!(positioner.is_test_file(&std::path::Path::new("UserTest.php")));

        // Non-test files should not be detected
        assert!(!positioner.is_test_file(&std::path::Path::new("src/main.rs")));
        assert!(!positioner.is_test_file(&std::path::Path::new("lib/utils.py")));
        assert!(!positioner.is_test_file(&std::path::Path::new("components/Button.tsx")));
        assert!(!positioner.is_test_file(&std::path::Path::new("README.md")));
        assert!(!positioner.is_test_file(&std::path::Path::new("package.json")));
    }

    // With auto_exclude_tests on, test files must be filtered out before
    // positioning; only the three non-test files should survive.
    #[tokio::test]
    async fn test_auto_exclude_tests() {
        let mut config = ContextPositioningConfig::default();
        config.auto_exclude_tests = true;
        let positioner = ContextPositioner::new(config);

        // Create mix of test and non-test files
        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/lib.rs", 800, "Rust"),
            create_test_file("src/tests/integration_test.rs", 1200, "Rust"),
            create_test_file("test/unit_test.py", 600, "Python"),
            create_test_file("components/Button.tsx", 900, "TypeScript"),
            create_test_file("__tests__/Button.test.tsx", 700, "TypeScript"),
        ];

        let result = positioner.position_files(files, None).await.unwrap();

        // Should have filtered out test files
        let all_files: Vec<&FileWithCentrality> = result
            .positioning
            .head_files
            .iter()
            .chain(result.positioning.middle_files.iter())
            .chain(result.positioning.tail_files.iter())
            .collect();

        // Should only have non-test files (3 out of 6)
        assert_eq!(all_files.len(), 3);

        // Verify no test files remain
        for file in all_files {
            let path_str = file.metadata.path.to_string_lossy();
            assert!(!path_str.contains("test"));
            assert!(!path_str.contains("__tests__"));
        }

        // Verify we have the expected non-test files
        let file_names: Vec<String> = result
            .positioning
            .head_files
            .iter()
            .chain(result.positioning.middle_files.iter())
            .chain(result.positioning.tail_files.iter())
            .map(|f| {
                f.metadata
                    .path
                    .file_name()
                    .unwrap()
                    .to_string_lossy()
                    .to_string()
            })
            .collect();

        assert!(file_names.contains(&"main.rs".to_string()));
        assert!(file_names.contains(&"lib.rs".to_string()));
        assert!(file_names.contains(&"Button.tsx".to_string()));
    }

    // With auto_exclude_tests off, test files must pass through untouched.
    #[tokio::test]
    async fn test_auto_exclude_disabled() {
        let mut config = ContextPositioningConfig::default();
        config.auto_exclude_tests = false; // Explicitly disabled
        let positioner = ContextPositioner::new(config);

        // Create mix of test and non-test files
        let files = vec![
            create_test_file("src/main.rs", 1000, "Rust"),
            create_test_file("src/tests/integration_test.rs", 1200, "Rust"),
            create_test_file("test_utils.py", 600, "Python"),
        ];

        let result = positioner.position_files(files, None).await.unwrap();

        // Should include all files when auto-exclude is disabled
        let all_files: Vec<&FileWithCentrality> = result
            .positioning
            .head_files
            .iter()
            .chain(result.positioning.middle_files.iter())
            .chain(result.positioning.tail_files.iter())
            .collect();

        // Should have all 3 files including test files
        assert_eq!(all_files.len(), 3);
    }
}