qala-compiler 0.1.1

Compiler and bytecode VM for the Qala programming language
Documentation
//! the assembly-text builder for the ARM64 backend.
//!
//! [`Asm`] accumulates emitted instructions and labels into a `.text` section
//! buffer and `printf` format strings into a `.data` section buffer.
//! [`Asm::finish`] prepends the fixed m4 `define` preamble, the `.data`
//! section (omitted when empty), and the `.text` section header, producing
//! the complete CPSC 355-dialect `.asm` text.
//!
//! the dialect, fixed: lowercase mnemonics and directives throughout, the m4
//! `define(fp, x29)` / `define(lr, x30)` register-alias preamble, GAS sections,
//! column-0 labels and directives, 8-space-indented instructions. matches the
//! course tutorial files under `docs/references/`.

/// the standard CPSC 355 instruction indentation: eight spaces. labels and
/// directives sit at column 0; instructions are indented under them. the
/// `.data` / `.text` section headers written by [`Asm::finish`] are the
/// exception: they carry this indent too.
///
/// [`Asm::emit_insn`] and [`Asm::emit_insn_commented`] prefix every
/// instruction with it.
const INDENT: &str = "        ";

/// the assembly-text builder. holds the `.text` section buffer (where every
/// function's directives, prologue, body, and epilogue land) and the `.data`
/// section buffer of `printf` format strings.
///
/// both buffers hold finished lines, each already terminated by `\n`;
/// [`Asm::finish`] consumes the builder and places them under the m4 preamble
/// and the section headers.
///
/// crate-private: only the `arm64` module builds assembly text.
pub(crate) struct Asm {
    /// the `.data` section buffer. the `printf` output path is the only
    /// writer: a `print` / `println` lowering interns its built format string
    /// here via [`Asm::intern_format`]. empty for a program with no output.
    /// each entry is one `.Lfmt_N:  .string "<escaped>"` line.
    data: String,
    /// the `.text` section buffer: per-function directives, labels, and
    /// instructions, in emission order. written only by the `emit_*` methods.
    text: String,
    /// the format strings already interned, in interning order. the index of
    /// a string in this list is the `N` in its `.Lfmt_N` label;
    /// [`Asm::intern_format`] searches it so a repeated format string reuses
    /// its first label rather than emitting a second `.data` entry.
    /// the strings are stored unescaped, exactly as they were passed in, so
    /// the lookup compares raw content rather than `.string` source text.
    format_strings: Vec<String>,
}

impl Asm {
    /// build a builder with both section buffers and the intern table empty.
    ///
    /// nothing is pre-seeded: the m4 preamble and the section headers are
    /// added only by [`Asm::finish`], so until then the builder contains just
    /// what the caller emitted.
    pub(crate) fn new() -> Self {
        Self {
            data: String::new(),
            text: String::new(),
            format_strings: Vec::new(),
        }
    }

    /// register a `printf` format string in the `.data` section and hand back
    /// the label that names it.
    ///
    /// `content` is the raw format string as the interpolation lowering built
    /// it (real newline bytes, `%lld` conversions, `%%` for a literal
    /// percent). strings are interned by content: if an equal string was
    /// interned before, its `.Lfmt_N` label is returned and `.data` is left
    /// untouched. otherwise the string takes the next index `N`, is recorded
    /// in the intern table, and a `.Lfmt_N:  .string "<escaped>"` line is
    /// appended to `.data`, with the body produced by
    /// [`escape_for_string_directive`].
    pub(crate) fn intern_format(&mut self, content: &str) -> String {
        // a hit yields the earlier index; a miss takes the next free one.
        let known = self.format_strings.iter().position(|seen| seen == content);
        let slot = known.unwrap_or(self.format_strings.len());
        let label = format!(".Lfmt_{slot}");
        if known.is_none() {
            let escaped = escape_for_string_directive(content);
            self.format_strings.push(content.to_owned());
            self.data
                .push_str(&format!("{label}:  .string \"{escaped}\"\n"));
        }
        label
    }

    /// write `line` to the `.text` buffer exactly as given, followed by a
    /// newline.
    ///
    /// no indent is added, so the caller decides the column. this is the
    /// escape hatch for blank separator lines and for anything that is not a
    /// plain label or a plain instruction.
    pub(crate) fn emit_line(&mut self, line: &str) {
        self.text.extend([line, "\n"]);
    }

    /// write a label definition, `<label>:`, at column 0 of the `.text`
    /// buffer.
    ///
    /// `label` is used as-is, so a local assembler label must already carry
    /// its `.L` prefix (`.Lmain_epilogue`); a global function label (`main`)
    /// has none.
    pub(crate) fn emit_label(&mut self, label: &str) {
        self.text.extend([label, ":\n"]);
    }

    /// write one instruction to the `.text` buffer behind the standard
    /// eight-space CPSC 355 indent.
    pub(crate) fn emit_insn(&mut self, insn: &str) {
        self.text.extend([INDENT, insn, "\n"]);
    }

    /// write one indented instruction followed by a `// comment` on the same
    /// line, in the worked-example style (`str x0, [fp, -8]  // a`).
    ///
    /// two spaces separate the instruction from the `//`. the comment says
    /// what a stack slot or value is, like the `// a` and `// spill lhs`
    /// annotations in the reference assembly.
    pub(crate) fn emit_insn_commented(&mut self, insn: &str, comment: &str) {
        self.text.extend([INDENT, insn, "  // ", comment, "\n"]);
    }

    /// consume the builder and assemble the complete `.asm` text: the m4
    /// preamble, then the `.data` section if it has any content, then the
    /// `.text` section.
    ///
    /// the preamble is the two `define` lines plus one blank line. a `.data`
    /// header, the data buffer, and a blank separator are written only when
    /// the data buffer is non-empty, so a program that never interned a
    /// format string gets preamble + `.text` alone. all line endings written
    /// here are a bare `\n`.
    pub(crate) fn finish(self) -> String {
        let Self { data, text, .. } = self;
        let mut assembled = String::new();
        // the fixed m4 register-alias preamble and its trailing blank line.
        assembled.extend(["define(fp, x29)\n", "define(lr, x30)\n", "\n"]);
        // .data appears only when something was interned into it.
        if !data.is_empty() {
            assembled.extend([INDENT, ".data\n", data.as_str(), "\n"]);
        }
        // .text: the section header, then everything the caller emitted.
        assembled.extend([INDENT, ".text\n", text.as_str()]);
        assembled
    }
}

/// escape a literal string for a GAS `.string` directive body.
///
/// `.string` interprets C-style backslash escapes, so a real byte that would
/// be misread inside the directive's double quotes is rewritten to its escape
/// form: a backslash to `\\`, a double quote to `\"`, and the whitespace
/// control bytes to `\n` / `\t` / `\r`. any other ASCII control byte (NUL,
/// ESC, DEL, ...) is written as a fixed three-digit octal escape (`\000`,
/// `\033`, `\177`), so no raw control byte ever reaches the assembly source;
/// the assembled bytes are the same. three digits are always emitted so a
/// digit that follows in the string cannot be absorbed into the escape.
///
/// everything else -- printable ASCII, the `%` of a `%lld` conversion, the
/// doubled `%%` of a literal percent, non-ASCII characters -- is copied
/// through unchanged. the printf lowering builds a format string with real
/// newline bytes and `%lld` conversions; this turns that into the `.string`
/// source form the assembler reads.
///
/// returns the escaped body only; the surrounding quotes are the caller's.
///
/// NOTE(review): the finished text carries an m4 `define(fp, x29)` /
/// `define(lr, x30)` preamble, so it is presumably run through m4, which would
/// also expand a bare `fp` or `lr` word inside a format string. nothing here
/// guards against that -- confirm against the build pipeline.
fn escape_for_string_directive(content: &str) -> String {
    let mut out = String::with_capacity(content.len());
    for ch in content.chars() {
        match ch {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\t' => out.push_str("\\t"),
            '\r' => out.push_str("\\r"),
            // the remaining ASCII control bytes have no safe raw spelling in
            // assembly source; emit the zero-padded octal character code.
            other if other.is_ascii_control() => {
                out.push_str(&format!("\\{:03o}", u32::from(other)));
            }
            other => out.push(other),
        }
    }
    out
}

// unit tests for the builder: each one drives `Asm` through its crate-private
// methods and checks the exact text `finish` returns.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn finish_on_an_empty_builder_is_the_preamble_and_text_header() {
        // the deterministic baseline: a builder with nothing emitted produces
        // exactly the two define lines, a blank line, and the .text header.
        // the `\x20` run spells the eight-space indent explicitly, because a
        // `\` line continuation would swallow literal leading spaces.
        let asm = Asm::new();
        let expected = "define(fp, x29)\n\
                        define(lr, x30)\n\
                        \n\
                        \x20\x20\x20\x20\x20\x20\x20\x20.text\n";
        assert_eq!(asm.finish(), expected);
    }

    #[test]
    fn finish_omits_the_data_section_when_the_buffer_is_empty() {
        // the integer core never writes .data; finish must not emit a bare
        // ".data" header for an empty buffer.
        let asm = Asm::new();
        assert!(!asm.finish().contains(".data"));
    }

    #[test]
    fn emit_label_writes_the_label_at_column_zero() {
        // the leading "\n" in each needle pins the label to the start of a
        // line; both a global label and a `.L` local label are covered.
        let mut asm = Asm::new();
        asm.emit_label("main");
        asm.emit_label(".Lmain_epilogue");
        let out = asm.finish();
        assert!(out.contains("\nmain:\n"));
        assert!(out.contains("\n.Lmain_epilogue:\n"));
    }

    #[test]
    fn emit_insn_indents_with_eight_spaces() {
        // the needle starts at a line break, so exactly eight spaces must sit
        // between the start of the line and the mnemonic.
        let mut asm = Asm::new();
        asm.emit_insn("mov x0, 0");
        let out = asm.finish();
        assert!(
            out.contains("\n        mov x0, 0\n"),
            "missing 8-space indent: {out:?}"
        );
    }

    #[test]
    fn emit_insn_commented_appends_a_trailing_comment() {
        // indent, instruction, two spaces, `// `, the comment, newline.
        let mut asm = Asm::new();
        asm.emit_insn_commented("str x0, [fp, -8]", "a");
        let out = asm.finish();
        assert!(out.contains("        str x0, [fp, -8]  // a\n"), "{out:?}");
    }

    #[test]
    fn emit_line_writes_the_line_verbatim() {
        let mut asm = Asm::new();
        asm.emit_line("");
        let out = asm.finish();
        // a blank emit_line produces an empty line inside .text.
        assert!(out.ends_with(".text\n\n"));
    }

    #[test]
    fn finish_output_is_lf_only() {
        // every writer appends '\n'; the result must carry no carriage return.
        let mut asm = Asm::new();
        asm.emit_label("f");
        asm.emit_insn("ret");
        assert!(!asm.finish().contains('\r'));
    }

    #[test]
    fn intern_format_registers_the_format_string_and_returns_a_unique_label() {
        // the first interned string gets index 0.
        let mut asm = Asm::new();
        assert_eq!(asm.intern_format("done\n"), ".Lfmt_0");
        let out = asm.finish();
        // the label+directive lands inside a .data section.
        assert!(
            out.contains("        .data\n"),
            "missing .data section: {out:?}"
        );
        // the real newline byte shows up as the two-character `\n` escape.
        assert!(
            out.contains(".Lfmt_0:  .string \"done\\n\"\n"),
            "missing the interned format string: {out:?}"
        );
    }

    #[test]
    fn intern_format_numbers_distinct_strings_monotonically() {
        // two different format strings get .Lfmt_0 and .Lfmt_1, both emitted.
        let mut asm = Asm::new();
        assert_eq!(asm.intern_format("a"), ".Lfmt_0");
        assert_eq!(asm.intern_format("b"), ".Lfmt_1");
        let out = asm.finish();
        assert!(out.contains(".Lfmt_0:  .string \"a\"\n"), "{out:?}");
        assert!(out.contains(".Lfmt_1:  .string \"b\"\n"), "{out:?}");
    }

    #[test]
    fn intern_format_deduplicates_by_content() {
        // interning the same content twice returns the first label and emits
        // exactly one .data entry -- one entry per distinct format string.
        let mut asm = Asm::new();
        let first = asm.intern_format("fib(%lld) = %lld\n");
        let second = asm.intern_format("fib(%lld) = %lld\n");
        assert_eq!(
            first, second,
            "an identical format string must reuse its label"
        );
        assert_eq!(first, ".Lfmt_0");
        let out = asm.finish();
        // counting the label definition (`.Lfmt_0:`) counts .data entries.
        assert_eq!(
            out.matches(".Lfmt_0:").count(),
            1,
            "a repeated format string must be interned exactly once: {out:?}"
        );
    }

    #[test]
    fn intern_format_escapes_quotes_backslashes_and_newlines() {
        // a format string with a real newline, a quote, and a backslash must
        // reach the .string directive in escaped form.
        let mut asm = Asm::new();
        asm.intern_format("a\"b\\c\n");
        let out = asm.finish();
        assert!(
            out.contains(".Lfmt_0:  .string \"a\\\"b\\\\c\\n\"\n"),
            "the format string was not escaped for the .string directive: {out:?}"
        );
    }

    #[test]
    fn intern_format_passes_percent_conversions_through_unescaped() {
        // `.string` does not interpret `%`, so a `%lld` conversion and a
        // doubled `%%` literal-percent are copied through verbatim.
        let mut asm = Asm::new();
        asm.intern_format("100%% done: %lld\n");
        let out = asm.finish();
        assert!(
            out.contains(".Lfmt_0:  .string \"100%% done: %lld\\n\"\n"),
            "percent conversions must pass through unescaped: {out:?}"
        );
    }

    #[test]
    fn finish_still_omits_the_data_section_when_no_format_string_is_interned() {
        // the empty-.data behaviour is unchanged when intern_format is unused.
        let asm = Asm::new();
        assert!(!asm.finish().contains(".data"));
    }

    #[test]
    fn escape_for_string_directive_leaves_plain_text_untouched() {
        // printable ASCII with no escape-worthy byte passes through unchanged.
        assert_eq!(
            escape_for_string_directive("fib(%lld) = %lld"),
            "fib(%lld) = %lld"
        );
    }
}