Skip to main content

lumen_compiler/markdown/
extract.rs

1//! Markdown → code block extraction with source location tracking
2
3use crate::compiler::tokens::Span;
4
/// A code block extracted from a Markdown file, together with the location
/// data needed to map its contents back to the document it came from.
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// The raw source code inside the fenced block (fence lines excluded)
    pub code: String,
    /// Lowercased language tag of the block: `"lumen"` or its alias `"lm"`
    pub language: String,
    /// Span covering the entire fenced block, from the opening fence through
    /// the closing fence
    pub span: Span,
    /// Byte offset where the code content starts (after the opening fence line)
    pub code_offset: usize,
    /// Line number where the code content starts (1-based)
    pub code_start_line: usize,
}
19
/// A top-level directive line (starts with @)
#[derive(Debug, Clone)]
pub struct DirectiveLine {
    /// Directive name: the text after `@` up to the first whitespace
    pub name: String,
    /// Rest of the line after the name, trimmed and with surrounding double
    /// quotes removed; `None` when nothing follows the name
    pub value: Option<String>,
    /// Span covering the directive's line
    pub span: Span,
}
27
/// Result of extracting blocks from Markdown
#[derive(Debug, Clone)]
pub struct ExtractResult {
    /// Extracted code blocks, in the order they appear in the document
    pub code_blocks: Vec<CodeBlock>,
    /// Directive lines found outside of code blocks, in document order
    pub directives: Vec<DirectiveLine>,
    /// True when at least one fenced lumen/lm block was found.
    pub has_fenced_blocks: bool,
}
36
37/// Extract Lumen code blocks and directives from a Markdown file.
38///
39/// Code blocks are fenced with triple backticks and tagged `lumen`.
40/// Directives are lines starting with `@` outside of code blocks.
41pub fn extract_blocks(source: &str) -> ExtractResult {
42    let mut code_blocks = Vec::new();
43    let mut directives = Vec::new();
44    let mut has_fenced_blocks = false;
45
46    let mut in_fence = false;
47    let mut fence_lang = String::new();
48    let mut fence_code = String::new();
49    let mut fence_start_offset: usize = 0;
50    let mut fence_start_line: usize = 0;
51    let mut code_start_line: usize = 0;
52    let mut code_start_offset: usize = 0;
53    let mut fence_backtick_count: usize = 0;
54
55    let mut byte_offset: usize = 0;
56
57    // Normalize line endings (handle CRLF)
58    let normalized = source.replace("\r\n", "\n");
59    let lines: Vec<&str> = normalized.split('\n').collect();
60
61    for (line_idx, line) in lines.iter().enumerate() {
62        let line_num = line_idx + 1; // 1-based
63        let trimmed = line.trim();
64
65        if !in_fence {
66            // Check for opening fence: ```lumen (or ````lumen, etc.)
67            if let Some(backtick_count) = count_leading_backticks(trimmed) {
68                if backtick_count >= 3 {
69                    // Extract language tag after backticks, trimming whitespace
70                    let rest = &trimmed[backtick_count..];
71                    let lang = rest.trim().to_lowercase();
72                    // Accept "lumen", "lm", or empty (treated as lumen if it's the first block)
73                    if lang == "lumen" || lang == "lm" {
74                        in_fence = true;
75                        fence_lang = lang;
76                        fence_code.clear();
77                        fence_start_offset = byte_offset;
78                        fence_start_line = line_num;
79                        code_start_line = line_num + 1;
80                        code_start_offset = byte_offset + line.len() + 1; // +1 for newline
81                        fence_backtick_count = backtick_count;
82                    }
83                }
84            } else if let Some(stripped) = trimmed.strip_prefix('@') {
85                // Parse directive
86                let directive_text = stripped.trim();
87                let (name, value) =
88                    if let Some(space_idx) = directive_text.find(|c: char| c.is_whitespace()) {
89                        let n = directive_text[..space_idx].to_string();
90                        let v = directive_text[space_idx..]
91                            .trim()
92                            .trim_matches('"')
93                            .to_string();
94                        (n, Some(v))
95                    } else {
96                        (directive_text.to_string(), None)
97                    };
98                directives.push(DirectiveLine {
99                    name,
100                    value,
101                    span: Span::new(byte_offset, byte_offset + line.len(), line_num, 1),
102                });
103            }
104        } else {
105            // Check for closing fence (must match opening backtick count or more)
106            if let Some(backtick_count) = count_leading_backticks(trimmed) {
107                let rest = &trimmed[backtick_count..];
108                if backtick_count >= fence_backtick_count && rest.trim().is_empty() {
109                    // Closing fence found
110                    in_fence = false;
111                    code_blocks.push(CodeBlock {
112                        code: fence_code.clone(),
113                        language: fence_lang.clone(),
114                        span: Span::new(
115                            fence_start_offset,
116                            byte_offset + line.len(),
117                            fence_start_line,
118                            1,
119                        ),
120                        code_offset: code_start_offset,
121                        code_start_line,
122                    });
123                    has_fenced_blocks = true;
124                    fence_code.clear();
125                    continue;
126                }
127            }
128            // Not a closing fence, add line to code
129            if !fence_code.is_empty() {
130                fence_code.push('\n');
131            }
132            fence_code.push_str(line);
133        }
134
135        byte_offset += line.len() + 1; // +1 for newline
136    }
137
138    if code_blocks.is_empty() && looks_like_lumen_source(&normalized) {
139        // Treat unfenced code as markdown-native source while preserving directive handling.
140        let mut fallback_lines = Vec::new();
141        for line in normalized.split('\n') {
142            if line.trim().starts_with('@') {
143                fallback_lines.push(String::new());
144            } else {
145                fallback_lines.push(line.to_string());
146            }
147        }
148        code_blocks.push(CodeBlock {
149            code: fallback_lines.join("\n"),
150            language: "lumen".to_string(),
151            span: Span::new(0, normalized.len(), 1, 1),
152            code_offset: 0,
153            code_start_line: 1,
154        });
155    }
156
157    ExtractResult {
158        code_blocks,
159        directives,
160        has_fenced_blocks,
161    }
162}
163
164fn looks_like_lumen_source(source: &str) -> bool {
165    for line in source.lines() {
166        let trimmed = line.trim();
167        if trimmed.is_empty() {
168            continue;
169        }
170        if trimmed.starts_with('@') {
171            return true;
172        }
173        if let Some(first) = trimmed.split_whitespace().next() {
174            if is_lumen_code_starter(first) {
175                return true;
176            }
177        }
178    }
179    false
180}
181
/// Returns true when `first` is exactly (case-sensitively) one of the
/// keywords that can begin a line of Lumen code.
fn is_lumen_code_starter(first: &str) -> bool {
    const STARTERS: &[&str] = &[
        // declarations
        "record", "enum", "cell", "agent", "effect", "handler", "import", "use", "grant",
        "type", "const", "pub", "async", "trait", "impl",
        // statements
        "let", "if", "for", "while", "loop", "match", "return", "halt", "break", "continue",
        "emit",
    ];
    STARTERS.contains(&first)
}
213
/// Length of the run of backticks at the very start of `trimmed`.
///
/// Returns `None` when the text does not begin with a backtick. The caller is
/// expected to have stripped leading whitespace already; none is skipped here.
fn count_leading_backticks(trimmed: &str) -> Option<usize> {
    // A backtick is a single byte, so the byte difference equals the run length.
    let after_run = trimmed.trim_start_matches('`');
    match trimmed.len() - after_run.len() {
        0 => None,
        run => Some(run),
    }
}
223
/// Unit tests for Markdown block and directive extraction.
#[cfg(test)]
mod tests {
    use super::*;

    /// Directives and two fenced blocks are all picked up from one document.
    #[test]
    fn test_extract_simple() {
        let src = r#"@lumen 1
@package "test"

# Hello

```lumen
record Foo
  x: Int
end
```

Some prose here.

```lumen
cell main() -> Int
  return 42
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.directives.len(), 2);
        assert_eq!(result.directives[0].name, "lumen");
        assert_eq!(result.directives[0].value, Some("1".to_string()));
        assert_eq!(result.directives[1].name, "package");
        assert_eq!(result.directives[1].value, Some("test".to_string()));

        assert_eq!(result.code_blocks.len(), 2);
        assert!(result.code_blocks[0].code.contains("record Foo"));
        assert!(result.code_blocks[1].code.contains("cell main"));
        assert!(result.has_fenced_blocks);
    }

    /// Fenced blocks tagged with another language are not extracted.
    #[test]
    fn test_extract_non_lumen_blocks_ignored() {
        let src = r#"
```python
print("hello")
```

```lumen
cell greet() -> String
  return "hello"
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell greet"));
        assert!(result.has_fenced_blocks);
    }

    /// A four-backtick fence may contain three-backtick text without closing.
    #[test]
    fn test_nested_code_fences() {
        let src = r#"
````lumen
record Example
  code: String
end

cell demo() -> String
  let x = "```lumen\ncell foo()\nend\n```"
  return x
end
````
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("```lumen"));
        assert!(result.code_blocks[0].code.contains("cell foo"));
        assert!(result.has_fenced_blocks);
    }

    /// `lm` is accepted as an alias and reported as the block's language.
    #[test]
    fn test_language_alias_lm() {
        let src = r#"
```lm
cell test() -> Int
  42
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert_eq!(result.code_blocks[0].language, "lm");
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.has_fenced_blocks);
    }

    /// The language tag is matched case-insensitively.
    #[test]
    fn test_case_insensitive_language() {
        let src = r#"
```Lumen
cell test() -> Int
  42
end
```

```LUMEN
cell test2() -> Int
  84
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 2);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.code_blocks[1].code.contains("cell test2"));
        assert!(result.has_fenced_blocks);
    }

    /// An empty fenced block yields a block with empty code.
    #[test]
    fn test_empty_code_block() {
        let src = r#"
```lumen
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert_eq!(result.code_blocks[0].code, "");
        assert!(result.has_fenced_blocks);
    }

    /// Trailing spaces after the opening tag and the closing fence are tolerated.
    #[test]
    fn test_trailing_whitespace_on_fence() {
        // Written with escapes so the trailing spaces are explicit and cannot be
        // stripped by an editor or formatter.
        let src = "\n```lumen   \ncell test() -> Int\n  42\nend\n```  \n";
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.has_fenced_blocks);
    }

    /// CRLF line endings are handled.
    #[test]
    fn test_windows_line_endings() {
        let src = "```lumen\r\ncell test() -> Int\r\n  42\r\nend\r\n```\r\n";
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.code_blocks[0].code.contains("42"));
        assert!(result.has_fenced_blocks);
    }

    /// A closing fence on the last line needs no trailing newline.
    #[test]
    fn test_no_final_newline() {
        let src = "```lumen\ncell test() -> Int\n  42\nend\n```";
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.has_fenced_blocks);
    }

    /// `code_start_line` points at the first line after the opening fence.
    #[test]
    fn test_multiple_blocks_line_tracking() {
        let src = r#"First line

```lumen
cell first() -> Int
  1
end
```

Middle prose here.

```lumen
cell second() -> Int
  2
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 2);
        // The opening fence is on line 3, so the code content starts on line 4
        assert_eq!(result.code_blocks[0].code_start_line, 4);
        // Second block starts after first block + prose
        assert!(result.code_blocks[1].code_start_line > result.code_blocks[0].code_start_line);
        assert!(result.has_fenced_blocks);
    }

    /// Markdown indented code blocks are not treated as Lumen code.
    #[test]
    fn test_indented_code_blocks_ignored() {
        let src = r#"
Regular text.

    This is an indented code block
    It should be ignored

```lumen
cell test() -> Int
  42
end
```
"#;
        let result = extract_blocks(src);
        // Only the fenced block should be extracted
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(!result.code_blocks[0].code.contains("indented code block"));
        assert!(result.has_fenced_blocks);
    }

    /// Backticks in the middle of a code line do not close the fence.
    #[test]
    fn test_backticks_inside_code() {
        let src = r#"
```lumen
cell demo() -> String
  let msg = "Use ``` for code fences"
  return msg
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0]
            .code
            .contains("Use ``` for code fences"));
        assert!(result.has_fenced_blocks);
    }

    /// Without any fence, Lumen-looking text is returned as a single block.
    #[test]
    fn test_unfenced_source_fallback_extracts_code() {
        let src = r#"
@doc_mode true

cell main() -> Int
  return 42
end
"#;
        let result = extract_blocks(src);
        assert_eq!(result.directives.len(), 1);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell main"));
        assert!(!result.has_fenced_blocks);
    }

    /// Plain prose with no fences and no Lumen keywords yields no code.
    #[test]
    fn test_prose_only_markdown_does_not_fallback_to_code() {
        let src = r#"# Heading

This is documentation only.
"#;
        let result = extract_blocks(src);
        assert!(result.code_blocks.is_empty());
        assert!(!result.has_fenced_blocks);
    }
}