use crate::compiler::tokens::Span;
/// A block of Lumen source code produced by `extract_blocks`.
#[derive(Debug, Clone)]
pub struct CodeBlock {
/// Text of the block, lines joined with `\n`; the fence lines themselves are not included.
pub code: String,
/// Language tag: the lowercased fence info string (`"lumen"` or `"lm"`), or
/// `"lumen"` for the block built by the unfenced fallback.
pub language: String,
/// Span of the whole block. For a fenced block it starts at the opening fence
/// line and ends at the end of the closing fence line.
pub span: Span,
/// Byte offset at which the code text starts, as computed by `extract_blocks`
/// (the line after the opening fence, or 0 for the fallback block).
pub code_offset: usize,
/// 1-based line number of the first code line (the line after the opening
/// fence, or 1 for the fallback block).
pub code_start_line: usize,
}
/// A `@name value` directive line found by `extract_blocks` outside of Lumen fences.
#[derive(Debug, Clone)]
pub struct DirectiveLine {
/// Text following `@`, up to the first whitespace character.
pub name: String,
/// Remainder of the line after the name, trimmed and with leading/trailing `"`
/// characters removed; `None` when nothing but the name follows `@`.
pub value: Option<String>,
/// Span covering the whole directive line (1-based line number, column 1).
pub span: Span,
}
/// Everything `extract_blocks` found in one source text.
#[derive(Debug, Clone)]
pub struct ExtractResult {
/// Extracted code blocks in source order; holds a single whole-file block
/// when the unfenced fallback was used.
pub code_blocks: Vec<CodeBlock>,
/// Directive lines in source order.
pub directives: Vec<DirectiveLine>,
/// `true` when at least one fenced Lumen block was extracted; stays `false`
/// when the code came from the unfenced fallback.
pub has_fenced_blocks: bool,
}
pub fn extract_blocks(source: &str) -> ExtractResult {
let mut code_blocks = Vec::new();
let mut directives = Vec::new();
let mut has_fenced_blocks = false;
let mut in_fence = false;
let mut fence_lang = String::new();
let mut fence_code = String::new();
let mut fence_start_offset: usize = 0;
let mut fence_start_line: usize = 0;
let mut code_start_line: usize = 0;
let mut code_start_offset: usize = 0;
let mut fence_backtick_count: usize = 0;
let mut byte_offset: usize = 0;
let normalized = source.replace("\r\n", "\n");
let lines: Vec<&str> = normalized.split('\n').collect();
for (line_idx, line) in lines.iter().enumerate() {
let line_num = line_idx + 1; let trimmed = line.trim();
if !in_fence {
if let Some(backtick_count) = count_leading_backticks(trimmed) {
if backtick_count >= 3 {
let rest = &trimmed[backtick_count..];
let lang = rest.trim().to_lowercase();
if lang == "lumen" || lang == "lm" {
in_fence = true;
fence_lang = lang;
fence_code.clear();
fence_start_offset = byte_offset;
fence_start_line = line_num;
code_start_line = line_num + 1;
code_start_offset = byte_offset + line.len() + 1; fence_backtick_count = backtick_count;
}
}
} else if let Some(stripped) = trimmed.strip_prefix('@') {
let directive_text = stripped.trim();
let (name, value) =
if let Some(space_idx) = directive_text.find(|c: char| c.is_whitespace()) {
let n = directive_text[..space_idx].to_string();
let v = directive_text[space_idx..]
.trim()
.trim_matches('"')
.to_string();
(n, Some(v))
} else {
(directive_text.to_string(), None)
};
directives.push(DirectiveLine {
name,
value,
span: Span::new(byte_offset, byte_offset + line.len(), line_num, 1),
});
}
} else {
if let Some(backtick_count) = count_leading_backticks(trimmed) {
let rest = &trimmed[backtick_count..];
if backtick_count >= fence_backtick_count && rest.trim().is_empty() {
in_fence = false;
code_blocks.push(CodeBlock {
code: fence_code.clone(),
language: fence_lang.clone(),
span: Span::new(
fence_start_offset,
byte_offset + line.len(),
fence_start_line,
1,
),
code_offset: code_start_offset,
code_start_line,
});
has_fenced_blocks = true;
fence_code.clear();
continue;
}
}
if !fence_code.is_empty() {
fence_code.push('\n');
}
fence_code.push_str(line);
}
byte_offset += line.len() + 1; }
if code_blocks.is_empty() && looks_like_lumen_source(&normalized) {
let mut fallback_lines = Vec::new();
for line in normalized.split('\n') {
if line.trim().starts_with('@') {
fallback_lines.push(String::new());
} else {
fallback_lines.push(line.to_string());
}
}
code_blocks.push(CodeBlock {
code: fallback_lines.join("\n"),
language: "lumen".to_string(),
span: Span::new(0, normalized.len(), 1, 1),
code_offset: 0,
code_start_line: 1,
});
}
ExtractResult {
code_blocks,
directives,
has_fenced_blocks,
}
}
fn looks_like_lumen_source(source: &str) -> bool {
for line in source.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
if trimmed.starts_with('@') {
return true;
}
if let Some(first) = trimmed.split_whitespace().next() {
if is_lumen_code_starter(first) {
return true;
}
}
}
false
}
/// Reports whether `first` is exactly one of the keywords treated as the start
/// of a Lumen code line. The comparison is case-sensitive.
fn is_lumen_code_starter(first: &str) -> bool {
    const STARTERS: &[&str] = &[
        // Declarations and item-level keywords.
        "record", "enum", "cell", "agent", "effect", "handler", "import", "use", "grant", "type",
        "const", "pub", "async", "trait", "impl",
        // Statement-level keywords.
        "let", "if", "for", "while", "loop", "match", "return", "halt", "break", "continue",
        "emit",
    ];
    STARTERS.contains(&first)
}
/// Counts the run of backticks at the very start of `trimmed`.
///
/// Returns `None` when the text does not begin with a backtick, otherwise
/// `Some(n)` with `n >= 1`.
fn count_leading_backticks(trimmed: &str) -> Option<usize> {
    // A backtick is one byte, so the byte-length difference equals the number removed.
    let run = trimmed.len() - trimmed.trim_start_matches('`').len();
    match run {
        0 => None,
        n => Some(n),
    }
}
// Unit tests for `extract_blocks`. The fixtures are raw strings, so their
// exact line layout and whitespace are part of each test.
#[cfg(test)]
mod tests {
use super::*;
// Two directives followed by two `lumen` fences separated by prose: both
// directives and both blocks are extracted.
#[test]
fn test_extract_simple() {
let src = r#"@lumen 1
@package "test"
# Hello
```lumen
record Foo
x: Int
end
```
Some prose here.
```lumen
cell main() -> Int
return 42
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.directives.len(), 2);
assert_eq!(result.directives[0].name, "lumen");
assert_eq!(result.directives[0].value, Some("1".to_string()));
assert_eq!(result.directives[1].name, "package");
assert_eq!(result.directives[1].value, Some("test".to_string()));
assert_eq!(result.code_blocks.len(), 2);
assert!(result.code_blocks[0].code.contains("record Foo"));
assert!(result.code_blocks[1].code.contains("cell main"));
assert!(result.has_fenced_blocks);
}
// A `python` fence yields no block; only the `lumen` fence is extracted.
#[test]
fn test_extract_non_lumen_blocks_ignored() {
let src = r#"
```python
print("hello")
```
```lumen
cell greet() -> String
return "hello"
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("cell greet"));
assert!(result.has_fenced_blocks);
}
// A four-backtick fence whose code mentions a three-backtick fence inside a
// string literal is extracted as one block.
#[test]
fn test_nested_code_fences() {
let src = r#"
````lumen
record Example
code: String
end
cell demo() -> String
let x = "```lumen\ncell foo()\nend\n```"
return x
end
````
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("```lumen"));
assert!(result.code_blocks[0].code.contains("cell foo"));
assert!(result.has_fenced_blocks);
}
// The `lm` info string is accepted and reported as the block language.
#[test]
fn test_language_alias_lm() {
let src = r#"
```lm
cell test() -> Int
42
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert_eq!(result.code_blocks[0].language, "lm");
assert!(result.code_blocks[0].code.contains("cell test"));
assert!(result.has_fenced_blocks);
}
// `Lumen` and `LUMEN` info strings both open a block.
#[test]
fn test_case_insensitive_language() {
let src = r#"
```Lumen
cell test() -> Int
42
end
```
```LUMEN
cell test2() -> Int
84
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 2);
assert!(result.code_blocks[0].code.contains("cell test"));
assert!(result.code_blocks[1].code.contains("cell test2"));
assert!(result.has_fenced_blocks);
}
// A fence that is closed immediately produces one block with empty code.
#[test]
fn test_empty_code_block() {
let src = r#"
```lumen
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert_eq!(result.code_blocks[0].code, "");
assert!(result.has_fenced_blocks);
}
// Per its name, this covers fence lines followed by trailing whitespace.
// NOTE(review): no trailing whitespace is visible in the fixture as shown —
// confirm it has not been stripped by an editor or formatter.
#[test]
fn test_trailing_whitespace_on_fence() {
let src = r#"
```lumen
cell test() -> Int
42
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("cell test"));
assert!(result.has_fenced_blocks);
}
// A source using `\r\n` line endings is extracted like one using `\n`.
#[test]
fn test_windows_line_endings() {
let src = "```lumen\r\ncell test() -> Int\r\n 42\r\nend\r\n```\r\n";
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("cell test"));
assert!(result.code_blocks[0].code.contains("42"));
assert!(result.has_fenced_blocks);
}
// The closing fence is recognised even when it is the last line with no
// trailing newline.
#[test]
fn test_no_final_newline() {
let src = "```lumen\ncell test() -> Int\n 42\nend\n```";
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("cell test"));
assert!(result.has_fenced_blocks);
}
// `code_start_line` is reported per block and increases from the first block
// to the second.
// NOTE(review): as the fixture is written here the opening fence is on line 2,
// which would make the first code line 3 rather than the asserted 4 — confirm
// that blank lines have not been lost from the fixture.
#[test]
fn test_multiple_blocks_line_tracking() {
let src = r#"First line
```lumen
cell first() -> Int
1
end
```
Middle prose here.
```lumen
cell second() -> Int
2
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 2);
assert_eq!(result.code_blocks[0].code_start_line, 4);
assert!(result.code_blocks[1].code_start_line > result.code_blocks[0].code_start_line);
assert!(result.has_fenced_blocks);
}
// Prose before the fence must not end up in the extracted code.
// NOTE(review): the name refers to indented code blocks, but no indentation
// is visible in the fixture as shown — confirm it has not been stripped.
#[test]
fn test_indented_code_blocks_ignored() {
let src = r#"
Regular text.
This is an indented code block
It should be ignored
```lumen
cell test() -> Int
42
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("cell test"));
assert!(!result.code_blocks[0].code.contains("indented code block"));
assert!(result.has_fenced_blocks);
}
// Backticks in the middle of a code line do not close the fence.
#[test]
fn test_backticks_inside_code() {
let src = r#"
```lumen
cell demo() -> String
let msg = "Use ``` for code fences"
return msg
end
```
"#;
let result = extract_blocks(src);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0]
.code
.contains("Use ``` for code fences"));
assert!(result.has_fenced_blocks);
}
// With no fences at all, Lumen-looking source is returned as a single block
// via the fallback, the directive is still collected, and
// `has_fenced_blocks` stays false.
#[test]
fn test_unfenced_source_fallback_extracts_code() {
let src = r#"
@doc_mode true
cell main() -> Int
return 42
end
"#;
let result = extract_blocks(src);
assert_eq!(result.directives.len(), 1);
assert_eq!(result.code_blocks.len(), 1);
assert!(result.code_blocks[0].code.contains("cell main"));
assert!(!result.has_fenced_blocks);
}
// Plain Markdown prose with no fences and no Lumen keywords produces no
// code block.
#[test]
fn test_prose_only_markdown_does_not_fallback_to_code() {
let src = r#"# Heading
This is documentation only.
"#;
let result = extract_blocks(src);
assert!(result.code_blocks.is_empty());
assert!(!result.has_fenced_blocks);
}
}