1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
//! String and Symbol Global Handling
//!
//! This module handles deduplication of string and symbol literals
//! as LLVM IR global constants.
use super::{CodeGen, CodeGenError};
use std::fmt::Write as _;
impl CodeGen {
/// Render raw bytes as the body of an LLVM IR `c"..."` string constant.
///
/// Every byte is handled on its own, so the input does not need to be
/// valid UTF-8:
///
/// * printable ASCII (0x20–0x7E) other than `"` and `\` is copied through
///   unchanged;
/// * every other byte — `"`, `\`, NUL, control characters, 0x7F and
///   0x80–0xFF — is written as a backslash followed by exactly two
///   uppercase hex digits (for example `\22`, `\5C`, `\00`, `\FF`).
///
/// The surrounding quotes and any terminator are not added here; the
/// caller is responsible for those.
///
/// # Errors
///
/// Propagates any `std::fmt::Error` reported while formatting a hex
/// escape into the output `String`.
pub(super) fn escape_llvm_string(bytes: &[u8]) -> Result<String, std::fmt::Error> {
    let mut escaped = String::new();
    for byte in bytes.iter().copied() {
        // `"` would end the literal and `\` would start an escape, so both
        // take the two-digit hex form even though they are printable. Using
        // the hex form for `\` (rather than `\\`) keeps the output limited
        // to the single escape syntax this function emits everywhere else.
        let passes_through = matches!(byte, 0x20..=0x7E) && byte != b'"' && byte != b'\\';
        if passes_through {
            escaped.push(char::from(byte));
        } else {
            write!(&mut escaped, r"\{:02X}", byte)?;
        }
    }
    Ok(escaped)
}
/// Return the name of the global constant holding `bytes`, defining it on
/// first use.
///
/// Lookups go through `string_constants`, so asking for the same byte
/// sequence twice yields the same `@.str.N` name and emits only one
/// definition. A new definition is appended to `string_globals` as
/// `[len+1 x i8]`: the escaped bytes followed by a single `\00`.
///
/// The trailing NUL is there so that consumers which treat the data as a
/// C string (the existing notes mention variant-tag comparison and symbol
/// push) keep working, while byte literals containing embedded NULs stay
/// intact because their codegen passes an explicit length to the runtime
/// and ignores the terminator.
///
/// # Errors
///
/// Returns an error if escaping the bytes or appending the definition to
/// the globals buffer reports a formatting failure.
pub(super) fn get_string_global(&mut self, bytes: &[u8]) -> Result<String, CodeGenError> {
    // Fast path: this exact byte sequence already has a global.
    if let Some(cached) = self.string_constants.get(bytes).cloned() {
        return Ok(cached);
    }

    // Reserve the next sequential name before emitting anything.
    let name = format!("@.str.{}", self.string_counter);
    self.string_counter += 1;

    let body = Self::escape_llvm_string(bytes)?;
    // The array type counts the payload plus the NUL appended below.
    let storage_len = bytes.len() + 1;
    writeln!(
        &mut self.string_globals,
        "{} = private unnamed_addr constant [{} x i8] c\"{}\\00\"",
        name, storage_len, body
    )?;

    // Remember the mapping so later requests reuse this definition.
    self.string_constants.insert(bytes.to_vec(), name.clone());
    Ok(name)
}
/// Return the name of the interned-symbol global for `symbol_name`,
/// defining it on first use (Issue #166).
///
/// Each distinct symbol gets exactly one `@.sym.N` constant, cached in
/// `symbol_constants`. Because every occurrence of a symbol resolves to
/// the same global, symbol equality can be decided by comparing pointers.
///
/// The constant is a static SeqString record pointing at the symbol's
/// byte-string global (obtained through `get_string_global`), with the
/// capacity field set to 0 to mark it as interned.
///
/// # Errors
///
/// Returns an error if creating the underlying string global fails or if
/// appending the definition to the symbol buffer reports a formatting
/// failure.
pub(super) fn get_symbol_global(&mut self, symbol_name: &str) -> Result<String, CodeGenError> {
    // Already interned: hand back the existing global.
    if let Some(cached) = self.symbol_constants.get(symbol_name).cloned() {
        return Ok(cached);
    }

    // The record points at the shared byte-string global for this name.
    let data_global = self.get_string_global(symbol_name.as_bytes())?;

    // Allocate the next sequential symbol name.
    let name = format!("@.sym.{}", self.symbol_counter);
    self.symbol_counter += 1;

    // Length in bytes of the symbol text; the NUL terminator that the
    // string global carries is not counted here.
    let byte_len = symbol_name.len();

    // SeqString layout: { ptr, i64 len, i64 capacity, i8 global }
    //   capacity = 0 -> interned symbol (never freed)
    //   global   = 1 -> static data
    writeln!(
        &mut self.symbol_globals,
        "{} = private unnamed_addr constant {{ ptr, i64, i64, i8 }} {{ ptr {}, i64 {}, i64 0, i8 1 }}",
        name, data_global, byte_len
    )?;

    self.symbol_constants
        .insert(symbol_name.to_owned(), name.clone());
    Ok(name)
}
/// Append the accumulated string and symbol global definitions to `ir`.
///
/// The contents of `string_globals` are written first, followed by
/// `symbol_globals` (the interned-symbol records used for O(1) equality).
/// Each non-empty section is followed by one extra newline; an empty
/// section contributes nothing, not even the separator.
///
/// # Errors
///
/// Returns an error if writing the separator newline reports a formatting
/// failure.
pub(super) fn emit_string_and_symbol_globals(
    &self,
    ir: &mut String,
) -> Result<(), CodeGenError> {
    // Emission order: string constants, then symbol constants.
    let sections: [&str; 2] = [&self.string_globals, &self.symbol_globals];
    for section in sections {
        if section.is_empty() {
            continue;
        }
        ir.push_str(section);
        // Separator line after the section's definitions.
        writeln!(ir)?;
    }
    Ok(())
}
}