lumen_compiler/markdown/
extract.rs1use crate::compiler::tokens::Span;
4
/// A Lumen code block produced by `extract_blocks`.
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// Text of the block; for fenced blocks the fence lines themselves are excluded.
    pub code: String,
    /// Language tag: the lowercased fence info string (`lumen` or `lm`), or
    /// `lumen` for the unfenced fallback block.
    pub language: String,
    /// Span of the whole block. For fenced blocks it runs from the start of the
    /// opening fence line to the end of the closing fence line.
    pub span: Span,
    /// Byte offset, within the `\n`-normalized source, of the first code line.
    pub code_offset: usize,
    /// 1-based line number of the first code line.
    pub code_start_line: usize,
}
19
/// A top-level `@name value` line found outside any Lumen fence.
#[derive(Debug, Clone)]
pub struct DirectiveLine {
    /// Text between the leading `@` and the first whitespace character.
    pub name: String,
    /// Remainder of the line after the name, trimmed and with surrounding `"`
    /// characters removed; `None` when nothing follows the name.
    pub value: Option<String>,
    /// Span covering the full directive line.
    pub span: Span,
}
27
/// Everything `extract_blocks` collects from one document.
#[derive(Debug, Clone)]
pub struct ExtractResult {
    /// Extracted code blocks, in document order.
    pub code_blocks: Vec<CodeBlock>,
    /// Directive lines, in document order.
    pub directives: Vec<DirectiveLine>,
    /// `true` when at least one fenced Lumen block was opened and closed;
    /// stays `false` when the only block comes from the unfenced fallback.
    pub has_fenced_blocks: bool,
}
36
37pub fn extract_blocks(source: &str) -> ExtractResult {
42 let mut code_blocks = Vec::new();
43 let mut directives = Vec::new();
44 let mut has_fenced_blocks = false;
45
46 let mut in_fence = false;
47 let mut fence_lang = String::new();
48 let mut fence_code = String::new();
49 let mut fence_start_offset: usize = 0;
50 let mut fence_start_line: usize = 0;
51 let mut code_start_line: usize = 0;
52 let mut code_start_offset: usize = 0;
53 let mut fence_backtick_count: usize = 0;
54
55 let mut byte_offset: usize = 0;
56
57 let normalized = source.replace("\r\n", "\n");
59 let lines: Vec<&str> = normalized.split('\n').collect();
60
61 for (line_idx, line) in lines.iter().enumerate() {
62 let line_num = line_idx + 1; let trimmed = line.trim();
64
65 if !in_fence {
66 if let Some(backtick_count) = count_leading_backticks(trimmed) {
68 if backtick_count >= 3 {
69 let rest = &trimmed[backtick_count..];
71 let lang = rest.trim().to_lowercase();
72 if lang == "lumen" || lang == "lm" {
74 in_fence = true;
75 fence_lang = lang;
76 fence_code.clear();
77 fence_start_offset = byte_offset;
78 fence_start_line = line_num;
79 code_start_line = line_num + 1;
80 code_start_offset = byte_offset + line.len() + 1; fence_backtick_count = backtick_count;
82 }
83 }
84 } else if let Some(stripped) = trimmed.strip_prefix('@') {
85 let directive_text = stripped.trim();
87 let (name, value) =
88 if let Some(space_idx) = directive_text.find(|c: char| c.is_whitespace()) {
89 let n = directive_text[..space_idx].to_string();
90 let v = directive_text[space_idx..]
91 .trim()
92 .trim_matches('"')
93 .to_string();
94 (n, Some(v))
95 } else {
96 (directive_text.to_string(), None)
97 };
98 directives.push(DirectiveLine {
99 name,
100 value,
101 span: Span::new(byte_offset, byte_offset + line.len(), line_num, 1),
102 });
103 }
104 } else {
105 if let Some(backtick_count) = count_leading_backticks(trimmed) {
107 let rest = &trimmed[backtick_count..];
108 if backtick_count >= fence_backtick_count && rest.trim().is_empty() {
109 in_fence = false;
111 code_blocks.push(CodeBlock {
112 code: fence_code.clone(),
113 language: fence_lang.clone(),
114 span: Span::new(
115 fence_start_offset,
116 byte_offset + line.len(),
117 fence_start_line,
118 1,
119 ),
120 code_offset: code_start_offset,
121 code_start_line,
122 });
123 has_fenced_blocks = true;
124 fence_code.clear();
125 continue;
126 }
127 }
128 if !fence_code.is_empty() {
130 fence_code.push('\n');
131 }
132 fence_code.push_str(line);
133 }
134
135 byte_offset += line.len() + 1; }
137
138 if code_blocks.is_empty() && looks_like_lumen_source(&normalized) {
139 let mut fallback_lines = Vec::new();
141 for line in normalized.split('\n') {
142 if line.trim().starts_with('@') {
143 fallback_lines.push(String::new());
144 } else {
145 fallback_lines.push(line.to_string());
146 }
147 }
148 code_blocks.push(CodeBlock {
149 code: fallback_lines.join("\n"),
150 language: "lumen".to_string(),
151 span: Span::new(0, normalized.len(), 1, 1),
152 code_offset: 0,
153 code_start_line: 1,
154 });
155 }
156
157 ExtractResult {
158 code_blocks,
159 directives,
160 has_fenced_blocks,
161 }
162}
163
164fn looks_like_lumen_source(source: &str) -> bool {
165 for line in source.lines() {
166 let trimmed = line.trim();
167 if trimmed.is_empty() {
168 continue;
169 }
170 if trimmed.starts_with('@') {
171 return true;
172 }
173 if let Some(first) = trimmed.split_whitespace().next() {
174 if is_lumen_code_starter(first) {
175 return true;
176 }
177 }
178 }
179 false
180}
181
/// Returns `true` when `first` is exactly (case-sensitively) one of the words
/// treated as the start of a line of Lumen code.
fn is_lumen_code_starter(first: &str) -> bool {
    const STARTERS: &[&str] = &[
        "record", "enum", "cell", "agent", "effect", "handler", "import", "use", "grant",
        "type", "const", "pub", "async", "trait", "impl", "let", "if", "for", "while", "loop",
        "match", "return", "halt", "break", "continue", "emit",
    ];
    STARTERS.iter().any(|&keyword| keyword == first)
}
213
/// Counts the run of backticks at the very start of `trimmed`.
///
/// Returns `None` when the string does not begin with a backtick.
fn count_leading_backticks(trimmed: &str) -> Option<usize> {
    // A backtick is one byte, so the byte-length difference equals the character count.
    let run = trimmed.len() - trimmed.trim_start_matches('`').len();
    match run {
        0 => None,
        n => Some(n),
    }
}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Directives and two fenced blocks are all picked up from one document.
    #[test]
    fn test_extract_simple() {
        let src = r#"@lumen 1
@package "test"

# Hello

```lumen
record Foo
  x: Int
end
```

Some prose here.

```lumen
cell main() -> Int
  return 42
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.directives.len(), 2);
        assert_eq!(result.directives[0].name, "lumen");
        assert_eq!(result.directives[0].value, Some("1".to_string()));
        assert_eq!(result.directives[1].name, "package");
        assert_eq!(result.directives[1].value, Some("test".to_string()));

        assert_eq!(result.code_blocks.len(), 2);
        assert!(result.code_blocks[0].code.contains("record Foo"));
        assert!(result.code_blocks[1].code.contains("cell main"));
        assert!(result.has_fenced_blocks);
    }

    /// Fences tagged with another language are skipped.
    #[test]
    fn test_extract_non_lumen_blocks_ignored() {
        let src = r#"
```python
print("hello")
```

```lumen
cell greet() -> String
  return "hello"
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell greet"));
        assert!(result.has_fenced_blocks);
    }

    /// A four-backtick fence may contain three-backtick sequences.
    #[test]
    fn test_nested_code_fences() {
        let src = r#"
````lumen
record Example
  code: String
end

cell demo() -> String
  let x = "```lumen\ncell foo()\nend\n```"
  return x
end
````
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("```lumen"));
        assert!(result.code_blocks[0].code.contains("cell foo"));
        assert!(result.has_fenced_blocks);
    }

    /// `lm` is accepted as a language tag and reported as written.
    #[test]
    fn test_language_alias_lm() {
        let src = r#"
```lm
cell test() -> Int
  42
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert_eq!(result.code_blocks[0].language, "lm");
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.has_fenced_blocks);
    }

    /// The language tag is matched case-insensitively.
    #[test]
    fn test_case_insensitive_language() {
        let src = r#"
```Lumen
cell test() -> Int
  42
end
```

```LUMEN
cell test2() -> Int
  84
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 2);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.code_blocks[1].code.contains("cell test2"));
        assert!(result.has_fenced_blocks);
    }

    /// An opening fence immediately followed by a closing fence yields an empty block.
    #[test]
    fn test_empty_code_block() {
        let src = r#"
```lumen
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert_eq!(result.code_blocks[0].code, "");
        assert!(result.has_fenced_blocks);
    }

    /// Trailing spaces after the opening tag and after the closing fence are tolerated.
    #[test]
    fn test_trailing_whitespace_on_fence() {
        // Written with escapes so the trailing spaces are visible and cannot be
        // removed by whitespace-stripping editors or formatters.
        let src = "\n```lumen   \ncell test() -> Int\n  42\nend\n```  \n";
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.has_fenced_blocks);
    }

    /// `\r\n` line endings are handled.
    #[test]
    fn test_windows_line_endings() {
        let src = "```lumen\r\ncell test() -> Int\r\n  42\r\nend\r\n```\r\n";
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.code_blocks[0].code.contains("42"));
        assert!(result.has_fenced_blocks);
    }

    /// A closing fence on the last line without a trailing newline still closes the block.
    #[test]
    fn test_no_final_newline() {
        let src = "```lumen\ncell test() -> Int\n  42\nend\n```";
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(result.has_fenced_blocks);
    }

    /// `code_start_line` points at the first code line of each block.
    #[test]
    fn test_multiple_blocks_line_tracking() {
        let src = r#"First line

```lumen
cell first() -> Int
  1
end
```

Middle prose here.

```lumen
cell second() -> Int
  2
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 2);
        // The opening fence is on line 3, so the code starts on line 4.
        assert_eq!(result.code_blocks[0].code_start_line, 4);
        assert!(result.code_blocks[1].code_start_line > result.code_blocks[0].code_start_line);
        // The second opening fence is on line 11.
        assert_eq!(result.code_blocks[1].code_start_line, 12);
        assert!(result.has_fenced_blocks);
    }

    /// Indented (non-fenced) Markdown code is not extracted.
    #[test]
    fn test_indented_code_blocks_ignored() {
        let src = r#"
Regular text.

    This is an indented code block
    It should be ignored

```lumen
cell test() -> Int
  42
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell test"));
        assert!(!result.code_blocks[0].code.contains("indented code block"));
        assert!(result.has_fenced_blocks);
    }

    /// Backticks in the middle of a code line do not close the fence.
    #[test]
    fn test_backticks_inside_code() {
        let src = r#"
```lumen
cell demo() -> String
  let msg = "Use ``` for code fences"
  return msg
end
```
"#;
        let result = extract_blocks(src);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0]
            .code
            .contains("Use ``` for code fences"));
        assert!(result.has_fenced_blocks);
    }

    /// Raw Lumen source without fences is returned through the fallback path.
    #[test]
    fn test_unfenced_source_fallback_extracts_code() {
        let src = r#"
@doc_mode true

cell main() -> Int
  return 42
end
"#;
        let result = extract_blocks(src);
        assert_eq!(result.directives.len(), 1);
        assert_eq!(result.code_blocks.len(), 1);
        assert!(result.code_blocks[0].code.contains("cell main"));
        assert!(!result.has_fenced_blocks);
    }

    /// Plain prose must not be mistaken for Lumen source.
    #[test]
    fn test_prose_only_markdown_does_not_fallback_to_code() {
        let src = r#"# Heading

This is documentation only.
"#;
        let result = extract_blocks(src);
        assert!(result.code_blocks.is_empty());
        assert!(!result.has_fenced_blocks);
    }
}