Skip to main content

weave_core/
region.rs

1use sem_core::model::entity::SemanticEntity;
2
/// A region of a file — either an entity or the interstitial content between entities.
///
/// `extract_regions` produces a sequence of these, pushed in ascending order of
/// the entities' `start_line`.
#[derive(Debug, Clone)]
pub enum FileRegion {
    /// Text attributed to one semantic entity (see [`EntityRegion`]).
    Entity(EntityRegion),
    /// Text that lies outside every entity: header, gaps and footer
    /// (see [`InterstitialRegion`]).
    Interstitial(InterstitialRegion),
}
9
10impl FileRegion {
11    pub fn content(&self) -> &str {
12        match self {
13            FileRegion::Entity(e) => &e.content,
14            FileRegion::Interstitial(i) => &i.content,
15        }
16    }
17
18    pub fn key(&self) -> &str {
19        match self {
20            FileRegion::Entity(e) => &e.entity_id,
21            FileRegion::Interstitial(i) => &i.position_key,
22        }
23    }
24
25    pub fn is_entity(&self) -> bool {
26        matches!(self, FileRegion::Entity(_))
27    }
28}
29
/// The slice of a file that belongs to one semantic entity.
///
/// Built by `extract_regions` from a `SemanticEntity`.
#[derive(Debug, Clone)]
pub struct EntityRegion {
    /// Copied from the source entity's `id`; also what `FileRegion::key` returns.
    pub entity_id: String,
    /// Copied from the source entity's `name`.
    pub entity_name: String,
    /// Copied from the source entity's `entity_type`.
    pub entity_type: String,
    /// Text of the region. When built by `extract_regions` this may start
    /// earlier than `start_line`, because leading doc comments are bundled in.
    pub content: String,
    /// Copied verbatim from the source entity (`extract_regions` treats it as
    /// 1-based). It is NOT adjusted for bundled leading comments.
    pub start_line: usize,
    /// Copied verbatim from the source entity (`extract_regions` treats it as
    /// 1-based and inclusive).
    pub end_line: usize,
}
39
/// Text that sits outside every entity: the file header, the gaps between
/// consecutive entities, or the file footer.
#[derive(Debug, Clone)]
pub struct InterstitialRegion {
    /// Positional key. `extract_regions` generates one of:
    /// - `"file_only"` — the file has no entities, this region is the whole file
    /// - `"file_header"` — content before the first entity
    /// - `"between:<prev_id>:<next_id>"` — content between two consecutive entities
    /// - `"file_footer"` — content after the last entity
    pub position_key: String,
    /// The raw text of the region.
    pub content: String,
}
46
47/// Extract ordered regions from file content using the given entities.
48///
49/// Entities must be from the same file. The function splits the file into
50/// alternating interstitial and entity regions based on line ranges.
51pub fn extract_regions(content: &str, entities: &[SemanticEntity]) -> Vec<FileRegion> {
52    if entities.is_empty() {
53        // Entire file is one interstitial region
54        return vec![FileRegion::Interstitial(InterstitialRegion {
55            position_key: "file_only".to_string(),
56            content: content.to_string(),
57        })];
58    }
59
60    let lines: Vec<&str> = content.lines().collect();
61    let total_lines = lines.len();
62
63    // Sort entities by start_line (they should already be sorted, but be safe)
64    let mut sorted_entities: Vec<&SemanticEntity> = entities.iter().collect();
65    sorted_entities.sort_by_key(|e| e.start_line);
66
67    let mut regions: Vec<FileRegion> = Vec::new();
68    let mut current_line: usize = 0; // 0-indexed into lines array
69
70    for (i, entity) in sorted_entities.iter().enumerate() {
71        // Entity start_line and end_line are 1-based from sem-core
72        let entity_start = entity.start_line.saturating_sub(1); // convert to 0-based
73        let entity_end = entity.end_line; // end_line is inclusive, so this is exclusive in 0-based
74
75        // Comment bundling: scan backwards from entity_start to find leading doc comments.
76        // These comments (JSDoc, Rust ///, Python docstrings, Java /** */) should be part
77        // of the entity region, not the interstitial gap.
78        let bundled_start = find_leading_comment_start(&lines, entity_start, current_line);
79
80        // Interstitial before this entity (excluding bundled comments)
81        if current_line < bundled_start {
82            let interstitial_content = join_lines(&lines[current_line..bundled_start]);
83            let position_key = if i == 0 {
84                "file_header".to_string()
85            } else {
86                format!("between:{}:{}", sorted_entities[i - 1].id, entity.id)
87            };
88            regions.push(FileRegion::Interstitial(InterstitialRegion {
89                position_key,
90                content: interstitial_content,
91            }));
92        }
93
94        // Entity region — includes bundled leading comments
95        let entity_end_clamped = entity_end.min(total_lines);
96        let entity_content = if bundled_start < entity_end_clamped {
97            join_lines(&lines[bundled_start..entity_end_clamped])
98        } else {
99            entity.content.clone()
100        };
101
102        regions.push(FileRegion::Entity(EntityRegion {
103            entity_id: entity.id.clone(),
104            entity_name: entity.name.clone(),
105            entity_type: entity.entity_type.clone(),
106            content: entity_content,
107            start_line: entity.start_line,
108            end_line: entity.end_line,
109        }));
110
111        current_line = entity_end_clamped;
112    }
113
114    // Interstitial after last entity (file footer)
115    if current_line < total_lines {
116        let footer_content = join_lines(&lines[current_line..total_lines]);
117        regions.push(FileRegion::Interstitial(InterstitialRegion {
118            position_key: "file_footer".to_string(),
119            content: footer_content,
120        }));
121    }
122
123    // Handle trailing newline — if original content ends with newline and our last region doesn't
124    if content.ends_with('\n') {
125        if let Some(last) = regions.last() {
126            if !last.content().ends_with('\n') {
127                match regions.last_mut() {
128                    Some(FileRegion::Entity(e)) => e.content.push('\n'),
129                    Some(FileRegion::Interstitial(i)) => i.content.push('\n'),
130                    None => {}
131                }
132            }
133        }
134    }
135
136    regions
137}
138
/// Find the start of leading doc comments before an entity.
///
/// Walks backwards from the line just above `entity_start` and returns the
/// index of the first line of the contiguous doc comment that precedes the
/// entity, or `entity_start` itself when there is none.
///
/// * `lines` — the file split into lines.
/// * `entity_start` — 0-based index of the entity's first line.
/// * `min_line` — 0-based lower bound (inclusive); lines before it are never
///   inspected and the result is never smaller than it.
///
/// Recognizes:
/// - `///` and `//!` (Rust doc comments)
/// - `/** ... */` on a single line
/// - multi-line block comments: a closing line ending in `*/`, then any lines
///   up to one that starts with `/*`
/// - `* ...` / `*` continuation lines
///
/// A single blank line directly above the entity is tolerated; any other
/// blank line ends the scan. A line that merely carries a trailing
/// `/* ... */` after code is not treated as a comment.
///
/// Returns `entity_start` unchanged when there is nothing to scan
/// (`entity_start == 0`, `entity_start <= min_line`, or `entity_start` is past
/// the end of `lines`).
fn find_leading_comment_start(lines: &[&str], entity_start: usize, min_line: usize) -> usize {
    if entity_start == 0 || entity_start <= min_line || entity_start > lines.len() {
        return entity_start;
    }

    let mut comment_start = entity_start;
    let mut in_block_comment = false;

    // Walk backwards over `min_line..entity_start`. Iterating a finite range
    // guarantees termination; a hand-rolled `saturating_sub` cursor cannot
    // move past index 0 and can spin forever on a blank first line.
    for (offset, line) in lines[min_line..entity_start].iter().enumerate().rev() {
        let line_idx = min_line + offset;
        let trimmed = line.trim();

        if trimmed.is_empty() {
            // Allow exactly one blank line between a comment and the entity:
            // skip it only if it sits directly above the entity and nothing
            // has been bundled yet.
            let directly_above_entity = line_idx + 1 == entity_start;
            if comment_start == entity_start && directly_above_entity {
                continue;
            }
            break;
        }

        // Scanning backwards, a line ending in `*/` is where a block comment
        // begins for us. Lines that also contain `/*` are either one-line
        // comments (handled below) or code with a trailing comment.
        if trimmed.ends_with("*/") && !trimmed.contains("/*") {
            in_block_comment = true;
            comment_start = line_idx;
            continue;
        }

        if in_block_comment {
            // Keep going until the opening `/*` (which also covers `/**`).
            if trimmed.starts_with("/*") {
                comment_start = line_idx;
                in_block_comment = false;
            }
            continue;
        }

        // Single-line doc comment patterns
        let is_doc_line = trimmed.starts_with("///") // Rust doc comment
            || trimmed.starts_with("//!") // Rust inner doc comment
            || trimmed.starts_with("/**") // JSDoc/JavaDoc one-liner
            || trimmed.starts_with("* ") // JSDoc/JavaDoc continuation
            || trimmed == "*"; // Empty JSDoc line

        if !is_doc_line {
            // Not a comment line — stop
            break;
        }
        comment_start = line_idx;
    }

    comment_start
}
227
/// Concatenates `lines`, terminating every line (including the last) with
/// `'\n'`. An empty slice yields an empty string.
fn join_lines(lines: &[&str]) -> String {
    // One byte per line for its newline, so the buffer never reallocates.
    let needed: usize = lines.iter().map(|line| line.len() + 1).sum();
    let mut joined = String::with_capacity(needed);
    for line in lines {
        joined.push_str(line);
        joined.push('\n');
    }
    joined
}
236
#[cfg(test)]
mod tests {
    use super::*;
    use sem_core::parser::plugins::create_default_registry;

    /// Finds the entity region whose `entity_name` equals `name`, if any.
    fn entity_region_named<'a>(regions: &'a [FileRegion], name: &str) -> Option<&'a FileRegion> {
        regions
            .iter()
            .find(|region| matches!(region, FileRegion::Entity(e) if e.entity_name == name))
    }

    /// Two plain TypeScript functions after an import: both must surface as
    /// entity regions.
    #[test]
    fn test_extract_regions_typescript() {
        let content = r#"import { foo } from 'bar';

export function hello() {
    return "hello";
}

export function world() {
    return "world";
}
"#;

        let plugins = create_default_registry();
        let ts_plugin = plugins.get_plugin("test.ts").unwrap();
        let entities = ts_plugin.extract_entities(content, "test.ts");

        assert!(!entities.is_empty(), "Should extract entities from TypeScript");

        let regions = extract_regions(content, &entities);

        // Should have interstitial + entity regions
        assert!(regions.len() >= 2, "Should have multiple regions, got {}", regions.len());

        // Verify entities are present
        let entity_names: Vec<&str> = regions
            .iter()
            .filter_map(|region| match region {
                FileRegion::Entity(e) => Some(e.entity_name.as_str()),
                FileRegion::Interstitial(_) => None,
            })
            .collect();

        assert!(entity_names.contains(&"hello"), "Should find hello function, got {:?}", entity_names);
        assert!(entity_names.contains(&"world"), "Should find world function, got {:?}", entity_names);
    }

    /// A JSDoc block must travel with the function it documents instead of
    /// staying in the interstitial gap.
    #[test]
    fn test_comment_bundling_jsdoc() {
        let content = r#"import { foo } from 'bar';

/**
 * Greets a person by name.
 * @param name - The person's name
 */
export function hello(name: string) {
    return `Hello, ${name}!`;
}

export function world() {
    return "world";
}
"#;

        let plugins = create_default_registry();
        let ts_plugin = plugins.get_plugin("test.ts").unwrap();
        let entities = ts_plugin.extract_entities(content, "test.ts");

        assert!(entities.iter().any(|e| e.name == "hello"), "Should find hello");
        let regions = extract_regions(content, &entities);

        // Find the hello entity region
        let hello_region = entity_region_named(&regions, "hello").expect("Should find hello region");
        let hello_text = hello_region.content();

        // The entity region should include the JSDoc comment
        assert!(
            hello_text.contains("/**"),
            "hello region should include JSDoc comment. Content: {:?}",
            hello_text,
        );
        assert!(
            hello_text.contains("@param name"),
            "hello region should include JSDoc @param. Content: {:?}",
            hello_text,
        );

        // The interstitial before hello should NOT contain the JSDoc
        for gap in regions.iter().filter(|region| !region.is_entity()) {
            let gap_text = gap.content();
            assert!(
                !(gap_text.contains("/**") && gap_text.contains("@param")),
                "Interstitial should not contain the bundled JSDoc. Key: {:?}, Content: {:?}",
                gap.key(), gap_text,
            );
        }
    }

    /// `///` doc lines must be bundled into the Rust function's region.
    #[test]
    fn test_comment_bundling_rust_doc() {
        let content = r#"use std::io;

/// Adds two numbers together.
///
/// # Examples
/// ```
/// assert_eq!(add(1, 2), 3);
/// ```
fn add(a: i32, b: i32) -> i32 {
    a + b
}

fn subtract(a: i32, b: i32) -> i32 {
    a - b
}
"#;

        let plugins = create_default_registry();
        let rust_plugin = plugins.get_plugin("test.rs").unwrap();
        let entities = rust_plugin.extract_entities(content, "test.rs");

        let regions = extract_regions(content, &entities);
        let add_region = entity_region_named(&regions, "add").expect("Should find add region");
        let add_text = add_region.content();

        assert!(
            add_text.contains("/// Adds two numbers"),
            "add region should include Rust doc comment. Content: {:?}",
            add_text,
        );
    }

    /// With no entities the whole input comes back as one interstitial region.
    #[test]
    fn test_extract_regions_no_entities() {
        let content = "just some text\nno code here\n";
        let regions = extract_regions(content, &[]);
        assert_eq!(regions.len(), 1);

        let only = &regions[0];
        assert!(!only.is_entity());
        assert_eq!(only.content(), content);
    }
}