// Inscribe 0.0.3 - Docs.rs

//! The core processing engine for Inscribe.
//!
//! This module is responsible for parsing the input markdown, executing the
//! embedded code blocks, and substituting the results back into the document.
//! The process is designed as a three-pass system for efficiency and correctness.

use crate::config::{Runner, DELIMITER};
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashMap;
use std::env;
use std::io::Write;
use std::path::Path;
use std::process::{Command, Stdio};

// This single, comprehensive Regex is used to find all relevant parts of the markdown in one pass.
// It uses named capture groups to distinguish between different types of inscribe directives.
// - `runner_def`: A runner definition comment, e.g., `<!-- inscribe python command="python3" -->`
// - `inscribe_block`: A fenced code block marked for execution, e.g., `<!-- inscribe -->```python...```
// - `inscribe_inline`: An inline code snippet marked for execution, e.g., `<!-- inscribe -->`...``
// The `(?s)` flag (DOTALL) allows `.` to match newlines, which is crucial for multiline code blocks.
static UNIFIED_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
r#"(?s)(?P<runner_def><!--\s*inscribe\s+(?P<runner_lang>\w+)[^>]*?-->)|(?P<inscribe_block><!--\s*inscribe\s*-->\s*```(?P<lang_fenced>\w+)\r?\n(?P<code_fenced>.*?)```(?:\r?\n)?)|(?P<inscribe_inline><!--\s*inscribe\s*-->`(?P<code_inline>[^`]+)`)"#
    )
    .unwrap()
});

// A simpler Regex to parse key-value attributes from within a runner definition comment.
// e.g., `command="..."`
static ATTR_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"(?P<key>\w+)="(?P<value>[^"]+)""#).unwrap());

/// Processes a markdown string, executing code blocks and embedding the output.
///
/// This function implements a three-pass algorithm:
/// 1. **Collection Pass**: It iterates through the markdown, parsing runner definitions
///    and collecting all code blocks into batches, grouped by their respective `Runner`.
/// 2. **Execution Pass**: It executes each batch of code. All code for a single runner
///    is sent to one process to minimize overhead. Outputs are captured.
/// 3. **Replacement Pass**: It iterates through the markdown again, replacing the original
///    inscribe blocks with the captured output from the execution pass.
///
/// # Arguments
/// * `markdown_input` - The raw markdown content to process.
/// * `default_runners` - A map of built-in language configurations.
/// * `input_path` - An optional path to the input file, used to set the
///   working directory for executed code, allowing relative path access.
///
/// # Returns
/// A `Result` containing the processed markdown string or an error string on failure.
pub fn process_markdown(
    markdown_input: &str,
    default_runners: &HashMap<String, Runner>,
    input_path: Option<&Path>,
) -> Result<String, String> {
    // === PASS 1: COLLECT CODE INTO BATCHES ===
    // This pass identifies all code to be run and groups it by the language runner.
    // This is efficient as it allows us to start one process per runner for all its blocks.
    let mut batches: HashMap<Runner, Vec<&str>> = HashMap::new();
    let mut active_runners = default_runners.clone();
    let mut last_lang = String::new(); // Tracks the last language for context-sensitive inline blocks.

    for caps in UNIFIED_RE.captures_iter(markdown_input) {
        // First, check for a runner definition and update the active runners.
        if let Some(def_text) = caps.name("runner_def") {
            let lang = caps.name("runner_lang").unwrap().as_str().to_string();
            let mut command = None;
            let mut delimiter = None;

            for attr_caps in ATTR_RE.captures_iter(def_text.as_str()) {
                match attr_caps.name("key").unwrap().as_str() {
                    "command" => {
                        command = Some(attr_caps.name("value").unwrap().as_str().to_string())
                    }
                    "delimiter" => {
                        delimiter = Some(attr_caps.name("value").unwrap().as_str().to_string())
                    }
                    _ => {} // Ignore unknown attributes.
                }
            }
            let runner = Runner {
                command: command.ok_or(format!("Runner for '{}' is missing a 'command'", lang))?,
                delimiter_command: delimiter.unwrap_or_else(|| format!("echo {}", DELIMITER)),
            };
            active_runners.insert(lang, runner);
        // Next, check for a runnable code block (fenced or inline).
        } else if caps.name("inscribe_block").is_some() || caps.name("inscribe_inline").is_some() {
            let lang = if let Some(fenced_lang) = caps.name("lang_fenced") {
                let l = fenced_lang.as_str().to_string();
                last_lang = l.clone(); // Remember this language for subsequent inline blocks.
                l
            } else {
                // An inline block was found; use the language of the last fenced block.
                if last_lang.is_empty() {
                    return Err(
                        "Found an inline inscribe block before any language was specified.".into(),
                    );
                }
                last_lang.clone()
            };

            let code = caps
                .name("code_fenced")
                .or_else(|| caps.name("code_inline"))
                .unwrap()
                .as_str();

            let runner = active_runners
                .get(&lang)
                .ok_or_else(|| format!("No runner configured for '{}'", lang))?;

            // Add the code snippet to the appropriate batch.
            batches.entry(runner.clone()).or_default().push(code);
        }
    }

    // === PASS 2: EXECUTE BATCHES ===
    // Now that all code is collected, run each batch.
    let mut results_map: HashMap<Runner, Vec<String>> = HashMap::new();
    for (runner, code_blocks) in batches {
        let output = execute_batch(&runner, &code_blocks, input_path)?;
        // Split the combined stdout by the delimiter to get individual results.
        let results: Vec<String> = output.split(DELIMITER).map(|s| s.to_string()).collect();
        results_map.insert(runner, results);
    }

    // === PASS 3: REPLACE INSCRIBE BLOCKS WITH RESULTS ===
    // Rebuild the markdown string, substituting placeholders with the execution results.
    let mut final_output = String::new();
    let mut last_end = 0;
    let mut result_counters: HashMap<Runner, usize> = HashMap::new();
    let mut active_runners = default_runners.clone(); // Re-initialize for a consistent state.
    let mut last_lang = String::new();

    for caps in UNIFIED_RE.captures_iter(markdown_input) {
        let match_start = caps.get(0).unwrap().start();
        let match_end = caps.get(0).unwrap().end();

        // Append the text between the last match and this one.
        final_output.push_str(&markdown_input[last_end..match_start]);

        if caps.name("runner_def").is_some() {
            // Runner definitions are consumed and not written to the final output.
            // We still need to parse them to ensure we use the correct runner for subsequent blocks.
            let lang = caps.name("runner_lang").unwrap().as_str().to_string();
            let def_text = caps.name("runner_def").unwrap().as_str();
            let mut command = None;
            let mut delimiter = None;

            for attr_caps in ATTR_RE.captures_iter(def_text) {
                match attr_caps.name("key").unwrap().as_str() {
                    "command" => {
                        command = Some(attr_caps.name("value").unwrap().as_str().to_string())
                    }
                    "delimiter" => {
                        delimiter = Some(attr_caps.name("value").unwrap().as_str().to_string())
                    }
                    _ => {}
                }
            }
            let runner = Runner {
                command: command.unwrap(), // Should be safe due to pass 1 validation.
                delimiter_command: delimiter.unwrap_or_else(|| format!("echo {}", DELIMITER)),
            };
            active_runners.insert(lang, runner);
        } else {
            // This is an inscribe block that needs to be replaced with its result.
            let is_inline = caps.name("inscribe_inline").is_some();

            let lang = if let Some(fenced_lang) = caps.name("lang_fenced") {
                let l = fenced_lang.as_str().to_string();
                last_lang = l.clone();
                l
            } else {
                last_lang.clone()
            };

            let runner = active_runners.get(&lang).unwrap();
            let counter = result_counters.entry(runner.clone()).or_insert(0);

            if let Some(results) = results_map.get(runner) {
                if let Some(result) = results.get(*counter) {
                    if is_inline {
                        final_output.push_str(
                            result
                                .replace("\r\n", "\n")
                                .replace('\r', "\n")
                                .trim_matches('\n'),
                        );
                    } else {
                        // For fenced blocks, trim only the final newline(s) from the output
                        // This allows the markdown's structural newlines to control spacing
                        let normalized_result = result.replace("\r\n", "\n").replace('\r', "\n");
                        let normalized_result = normalized_result
                            .trim_start_matches('\n')
                            .trim_start()
                            .trim_start_matches('\n');

                        if normalized_result.is_empty() || normalized_result.ends_with('\n') {
                            final_output.push_str(&normalized_result);
                        } else {
                            final_output.push_str(&normalized_result);
                            final_output.push('\n');
                        }
                    }
                }
            }
            *counter += 1;
        }

        last_end = match_end;
    }

    // Append any remaining content after the last match.
    final_output.push_str(&markdown_input[last_end..]);

    Ok(final_output)
}

/// Executes a batch of code snippets for a single runner.
///
/// It combines all code blocks into a single script, with each block followed by
/// a command to print the delimiter. This script is then piped to the standard
/// input of the runner's command.
fn execute_batch(
    runner: &Runner,
    code_blocks: &[&str],
    input_path: Option<&Path>,
) -> Result<String, String> {
    let mut full_script = String::new();
    for code_snippet in code_blocks {
        full_script.push_str(code_snippet);
        full_script.push('\n');
        full_script.push_str(&runner.delimiter_command);
        full_script.push('\n');
    }
    run_script_via_stdin(&runner.command, &full_script, input_path)
}

/// Spawns a process, writes a script to its stdin, and captures its output.
///
/// This function is the lowest-level execution primitive. It sets the working
/// directory to the parent directory of the input file, which allows scripts
/// to use relative paths to access other files.
fn run_script_via_stdin(
    runner_cmd: &str,
    script_content: &str,
    input_path: Option<&Path>,
) -> Result<String, String> {
    // Determine the working directory for the child process.
    // If an input file is present, use its parent directory. Otherwise, use the current directory.
    let working_dir = match input_path.and_then(|p| p.parent()) {
        Some(dir) if !dir.as_os_str().is_empty() => dir.to_path_buf(),
        _ => env::current_dir().map_err(|e| e.to_string())?,
    };

    // Parse the runner command into a program and its arguments.
    let cleaned_runner_cmd = runner_cmd.trim();
    let mut parts = cleaned_runner_cmd.split_whitespace();
    let command = parts.next().ok_or("Empty runner command")?;
    let args: Vec<&str> = parts.collect();

    let mut child = Command::new(command)
        .args(&args)
        .current_dir(working_dir)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .map_err(|e| format!("Failed to spawn command '{}': {}", command, e))?;

    // Write the full script to the child process's stdin.
    if let Some(mut stdin) = child.stdin.take() {
        stdin
            .write_all(script_content.as_bytes())
            .map_err(|e| format!("Failed to write to script stdin: {}", e))?;
    } // stdin is closed when it goes out of scope, signaling EOF to the child.

    let output = child
        .wait_with_output()
        .map_err(|e| format!("Failed to wait for command '{}': {}", command, e))?;

    // Check if the command executed successfully. If not, return an error with stderr.
    if !output.status.success() {
        return Err(format!(
            "Execution failed for command '{}'.\nStderr:\n{}",
            command,
            String::from_utf8_lossy(&output.stderr)
        ));
    }

    Ok(String::from_utf8_lossy(&output.stdout).to_string())
}