markdown_code_runner/
parser.rs

1//! State machine for processing Markdown files.
2
3use anyhow::{bail, Result};
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashMap;
7use std::path::PathBuf;
8
9use crate::executor::{execute_code, Language};
10use crate::markers::{
11    get_indent, is_code_backticks_end, is_code_backticks_start, is_code_comment_bash_start,
12    is_code_comment_end, is_code_comment_python_start, is_output_end, is_output_start, is_skip,
13    remove_md_comment, WARNING,
14};
15
16/// Pattern to extract key=value options from a backtick line.
17static OPTION_PATTERN: Lazy<Regex> =
18    Lazy::new(|| Regex::new(r"(?P<key>\w+)=(?P<value>\S+)").unwrap());
19
/// The kind of region the line-by-line processor is currently inside.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Section {
    /// Outside any code block or output region.
    Normal,
    /// Between an output start marker and the matching output end marker.
    Output,
    /// Inside a Python code block introduced by a code-comment start marker.
    CodeCommentPython,
    /// Inside a Bash code block introduced by a code-comment start marker.
    CodeCommentBash,
    /// Inside a backtick-fenced code block.
    CodeBackticks,
}
29
/// Settings parsed from the opening line of a backtick code fence.
#[derive(Clone, Debug, Default)]
pub struct BacktickOptions {
    /// Language tag that follows the opening backticks; empty when none was found.
    pub language: String,
    /// Value of the `filename` option, if one was given.
    pub filename: Option<PathBuf>,
    /// Every other `key=value` option, keyed by option name.
    pub other: HashMap<String, String>,
}
37
38impl BacktickOptions {
39    /// Extract backtick options from a code fence line.
40    pub fn from_line(line: &str) -> Self {
41        let mut options = BacktickOptions::default();
42
43        // Extract language
44        static LANG_PATTERN: Lazy<Regex> =
45            Lazy::new(|| Regex::new(r"```(?P<language>\w+)").unwrap());
46
47        if let Some(caps) = LANG_PATTERN.captures(line) {
48            options.language = caps["language"].to_string();
49        }
50
51        // Extract key=value options after markdown-code-runner
52        if line.contains("markdown-code-runner") {
53            for caps in OPTION_PATTERN.captures_iter(line) {
54                let key = caps["key"].to_string();
55                let value = caps["value"].to_string();
56                if key == "filename" {
57                    options.filename = Some(PathBuf::from(value));
58                } else {
59                    options.other.insert(key, value);
60                }
61            }
62        }
63
64        options
65    }
66}
67
68/// Processing state for the Markdown file.
69pub struct ProcessingState {
70    /// Current section being processed.
71    pub section: Section,
72    /// Code lines collected from the current code block.
73    pub code: Vec<String>,
74    /// Original output lines (preserved when skipping).
75    pub original_output: Vec<String>,
76    /// Whether to skip the next code block.
77    pub skip_code_block: bool,
78    /// Output from the last executed code block.
79    pub output: Option<Vec<String>>,
80    /// New lines being built.
81    pub new_lines: Vec<String>,
82    /// Options from the current backtick code block.
83    pub backtick_options: BacktickOptions,
84    /// Whether to standardize backticks (remove markdown-code-runner).
85    pub backtick_standardize: bool,
86    /// Indentation of the current code block.
87    pub indent: String,
88    /// Verbose mode for debugging.
89    pub verbose: bool,
90    /// All Python code blocks seen so far (for context sharing).
91    python_blocks: Vec<Vec<String>>,
92}
93
94impl ProcessingState {
95    /// Create a new processing state.
96    pub fn new(backtick_standardize: bool, verbose: bool) -> Self {
97        Self {
98            section: Section::Normal,
99            code: Vec::new(),
100            original_output: Vec::new(),
101            skip_code_block: false,
102            output: None,
103            new_lines: Vec::new(),
104            backtick_options: BacktickOptions::default(),
105            backtick_standardize,
106            indent: String::new(),
107            verbose,
108            python_blocks: Vec::new(),
109        }
110    }
111
112    /// Process a single line of the Markdown file.
113    pub fn process_line(&mut self, line: &str) -> Result<()> {
114        if is_skip(line) {
115            self.skip_code_block = true;
116            self.new_lines.push(line.to_string());
117        } else if is_output_start(line).is_some() {
118            self.process_output_start(line);
119        } else if is_output_end(line) {
120            self.process_output_end(line);
121        } else {
122            match self.section {
123                Section::CodeCommentPython | Section::CodeCommentBash => {
124                    self.process_comment_code(line)?;
125                }
126                Section::CodeBackticks => {
127                    self.process_backtick_code(line)?;
128                }
129                Section::Output => {
130                    self.original_output.push(line.to_string());
131                }
132                Section::Normal => {
133                    let processed_line = self.process_start_markers(line);
134                    self.new_lines
135                        .push(processed_line.unwrap_or_else(|| line.to_string()));
136                    return Ok(());
137                }
138            }
139            if self.section != Section::Output {
140                self.new_lines.push(line.to_string());
141            }
142        }
143        Ok(())
144    }
145
146    /// Process start markers (code block starts).
147    fn process_start_markers(&mut self, line: &str) -> Option<String> {
148        // Check for Python code comment start
149        if is_code_comment_python_start(line).is_some() {
150            self.output = None;
151            self.section = Section::CodeCommentPython;
152            self.indent = get_indent(line);
153            return Some(line.to_string());
154        }
155
156        // Check for Bash code comment start
157        if is_code_comment_bash_start(line).is_some() {
158            self.output = None;
159            self.section = Section::CodeCommentBash;
160            self.indent = get_indent(line);
161            return Some(line.to_string());
162        }
163
164        // Check for backtick code block start
165        if let Some(caps) = is_code_backticks_start(line) {
166            self.output = None;
167            self.backtick_options = BacktickOptions::from_line(line);
168            self.section = Section::CodeBackticks;
169            self.indent = caps.name("spaces").map_or("", |m| m.as_str()).to_string();
170
171            // Standardize backticks if needed
172            if self.backtick_standardize && line.contains("markdown-code-runner") {
173                static STRIP_PATTERN: Lazy<Regex> =
174                    Lazy::new(|| Regex::new(r"\s+markdown-code-runner.*").unwrap());
175                return Some(STRIP_PATTERN.replace(line, "").to_string());
176            }
177            return Some(line.to_string());
178        }
179
180        None
181    }
182
183    /// Process output start marker.
184    fn process_output_start(&mut self, line: &str) {
185        self.section = Section::Output;
186        if !self.skip_code_block {
187            // Get the output, panicking if it's None (this is a programming error)
188            let output = self.output.as_ref().unwrap_or_else(|| {
189                panic!("Output must be set before OUTPUT:START, line: {}", line)
190            });
191            let indent = get_indent(line);
192
193            // Add the output start marker
194            self.new_lines.push(line.to_string());
195
196            // Add the warning comment with indentation
197            self.new_lines.push(format!("{}{}", indent, WARNING));
198
199            // Add each output line with proper indentation and trailing whitespace trimmed
200            for ol in output {
201                let trimmed = ol.trim_end();
202                if trimmed.is_empty() {
203                    self.new_lines.push(String::new());
204                } else {
205                    self.new_lines.push(format!("{}{}", indent, trimmed));
206                }
207            }
208        } else {
209            self.original_output.push(line.to_string());
210        }
211    }
212
213    /// Process output end marker.
214    fn process_output_end(&mut self, line: &str) {
215        self.section = Section::Normal;
216        if self.skip_code_block {
217            self.new_lines.append(&mut self.original_output);
218            self.skip_code_block = false;
219        }
220        self.new_lines.push(line.to_string());
221        self.original_output.clear();
222        self.output = None;
223    }
224
225    /// Strip the code block's indentation prefix from a line.
226    fn strip_indent(&self, line: &str) -> String {
227        if !self.indent.is_empty() && line.starts_with(&self.indent) {
228            line[self.indent.len()..].to_string()
229        } else {
230            line.to_string()
231        }
232    }
233
234    /// Process code inside a comment block.
235    fn process_comment_code(&mut self, line: &str) -> Result<()> {
236        if is_code_comment_end(line) {
237            if !self.skip_code_block {
238                let language = match self.section {
239                    Section::CodeCommentPython => Language::Python,
240                    Section::CodeCommentBash => Language::Bash,
241                    _ => unreachable!(),
242                };
243                self.execute_current_block(language)?;
244            }
245            self.section = Section::Normal;
246            self.code.clear();
247            self.backtick_options = BacktickOptions::default();
248            self.indent.clear();
249        } else {
250            // Remove markdown comment and add to code
251            if let Some(code_line) = remove_md_comment(line) {
252                self.code.push(code_line);
253            }
254        }
255        Ok(())
256    }
257
258    /// Process code inside a backtick block.
259    fn process_backtick_code(&mut self, line: &str) -> Result<()> {
260        if is_code_backticks_end(line) {
261            if !self.skip_code_block {
262                let language = Language::parse(&self.backtick_options.language);
263                // Clone the filename to avoid borrow issues
264                let output_file = self.backtick_options.filename.clone();
265
266                if language.is_none() && output_file.is_none() {
267                    bail!("Specify 'output_file' for non-Python/Bash languages.");
268                }
269
270                if let Some(lang) = language {
271                    self.execute_current_block_with_file(lang, output_file.as_deref())?;
272                } else {
273                    // Write to file for non-executable languages
274                    let code = self.code.clone();
275                    let verbose = self.verbose;
276                    self.output = Some(execute_code(
277                        &code,
278                        Language::Python,
279                        output_file.as_deref(),
280                        verbose,
281                    )?);
282                }
283            }
284            self.section = Section::Normal;
285            self.code.clear();
286            self.backtick_options = BacktickOptions::default();
287            self.indent.clear();
288        } else {
289            // Strip the block indentation from the code line
290            let stripped = self.strip_indent(line);
291            self.code.push(stripped);
292        }
293        Ok(())
294    }
295
296    /// Execute the current code block for Python with context sharing.
297    fn execute_current_block(&mut self, language: Language) -> Result<()> {
298        self.execute_current_block_with_file(language, None)
299    }
300
301    /// Execute the current code block with optional output file.
302    fn execute_current_block_with_file(
303        &mut self,
304        language: Language,
305        output_file: Option<&std::path::Path>,
306    ) -> Result<()> {
307        if output_file.is_some() {
308            // Write to file, no execution
309            self.output = Some(execute_code(
310                &self.code,
311                language,
312                output_file,
313                self.verbose,
314            )?);
315        } else if language == Language::Python {
316            // For Python, we need to share context between blocks
317            // Add current block to the list of Python blocks
318            self.python_blocks.push(self.code.clone());
319
320            // We need to capture only the output from the current block
321            // The Python version uses exec() with a shared context, but since we're
322            // shelling out, we need to use a different approach.
323            //
324            // Strategy: Add a marker before the current block's output
325            let marker = format!("__MCR_MARKER_{}__", self.python_blocks.len());
326            let mut code_with_marker: Vec<String> = Vec::new();
327
328            // Add all previous blocks
329            for (i, block) in self.python_blocks.iter().enumerate() {
330                if i == self.python_blocks.len() - 1 {
331                    // Current block - add marker before it
332                    code_with_marker.push(format!("print('{}')", marker));
333                }
334                code_with_marker.extend(block.iter().cloned());
335            }
336
337            let output = execute_code(&code_with_marker, Language::Python, None, self.verbose)?;
338
339            // Extract only the output after our marker
340            let mut in_current_block = false;
341            let mut current_output: Vec<String> = Vec::new();
342            for line in output {
343                if line == marker {
344                    in_current_block = true;
345                } else if in_current_block {
346                    current_output.push(line);
347                }
348            }
349
350            self.output = Some(current_output);
351        } else {
352            // Bash doesn't need context sharing
353            self.output = Some(execute_code(&self.code, language, None, self.verbose)?);
354        }
355        Ok(())
356    }
357}
358
359/// Process markdown content and return the modified lines.
360pub fn process_markdown(
361    content: &[String],
362    verbose: bool,
363    backtick_standardize: bool,
364    execute: bool,
365) -> Result<Vec<String>> {
366    if !execute {
367        return Ok(content.to_vec());
368    }
369
370    let mut state = ProcessingState::new(backtick_standardize, verbose);
371
372    for (i, line) in content.iter().enumerate() {
373        if verbose {
374            eprintln!("\x1b[1mline {:4}\x1b[0m: {}", i, line);
375        }
376        state.process_line(line)?;
377    }
378
379    Ok(state.new_lines)
380}
381
#[cfg(test)]
mod tests {
    use super::*;

    /// Turn string literals into the owned lines `process_markdown` expects.
    fn to_lines(raw: &[&str]) -> Vec<String> {
        raw.iter().map(|text| text.to_string()).collect()
    }

    /// Fence lines with and without options are parsed into language/filename.
    #[test]
    fn test_backtick_options_from_line() {
        let with_file = BacktickOptions::from_line("```python markdown-code-runner filename=test.py");
        assert_eq!(with_file.language, "python");
        assert_eq!(with_file.filename, Some(PathBuf::from("test.py")));

        let bash = BacktickOptions::from_line("```bash markdown-code-runner");
        assert_eq!(bash.language, "bash");
        assert_eq!(bash.filename, None);

        let plain = BacktickOptions::from_line("```python");
        assert_eq!(plain.language, "python");
        assert_eq!(plain.filename, None);
    }

    /// Executing a block replaces the old output with the fresh output.
    #[test]
    fn test_process_simple_python() {
        let input = to_lines(&[
            "Some text",
            "```python markdown-code-runner",
            "print('Hello, world!')",
            "```",
            "<!-- OUTPUT:START -->",
            "old output",
            "<!-- OUTPUT:END -->",
        ]);

        let output = process_markdown(&input, false, false, true).unwrap();
        assert!(output.contains(&"Hello, world!".to_string()));
        assert!(!output.contains(&"old output".to_string()));
    }

    /// A skip marker keeps the existing output in place.
    #[test]
    fn test_process_with_skip() {
        let input = to_lines(&[
            "<!-- CODE:SKIP -->",
            "```python markdown-code-runner",
            "print('Hello, world!')",
            "```",
            "<!-- OUTPUT:START -->",
            "old output",
            "<!-- OUTPUT:END -->",
        ]);

        let output = process_markdown(&input, false, false, true).unwrap();
        assert!(output.contains(&"old output".to_string()));
    }

    /// With `execute` disabled the content comes back untouched.
    #[test]
    fn test_process_execute_false() {
        let input = to_lines(&[
            "```python markdown-code-runner",
            "print('Hello')",
            "```",
        ]);

        let output = process_markdown(&input, false, false, false).unwrap();
        assert_eq!(input, output);
    }
}
445}