Skip to main content

ras/
parser.rs

//! Assembly text parser.
//!
//! Parses gas-compatible assembly: `.text`, `.data`, `.global`/`.globl`, other
//! directives, labels, and instructions. The resulting `lines` list preserves
//! label/instruction order for two-pass assembly (jmp/call with label resolution).
6
7use crate::encoder::traits::ParsedInstruction;
8use crate::error::RasError;
9use std::collections::HashSet;
10
/// A section announced by a section directive in the assembly source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// Section name as written in the source, e.g. `.text` or `.data`.
    pub name: String,
    /// Attributes the parser assigned to this section.
    pub flags: SectionFlags,
}

/// Attribute flags attached to a [`Section`].
///
/// The parser in this file sets `alloc` + `exec` for `.text` and
/// `alloc` + `write` for `.data`.
// NOTE(review): these presumably map onto the ELF SHF_ALLOC / SHF_EXECINSTR /
// SHF_WRITE bits — confirm against the object writer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SectionFlags {
    /// Set for both `.text` and `.data`.
    pub alloc: bool,
    /// Set for `.text` only.
    pub exec: bool,
    /// Set for `.data` only.
    pub write: bool,
}
23
/// A label defined in the assembly source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Symbol {
    /// Label text without the trailing `:`.
    pub name: String,
    /// Whether the name was declared with `.global`/`.globl`.
    pub global: bool,
    /// Name of the section that was current when the label was defined.
    pub section: String,
}
30
31#[derive(Debug, Clone)]
32pub enum Line {
33    Label(Symbol),
34    Instruction(ParsedInstruction),
35    Data(Vec<u8>),
36}
37
38pub struct ParsedAssembly {
39    pub sections: Vec<Section>,
40    pub symbols: Vec<Symbol>,
41    pub instructions: Vec<ParsedInstruction>,
42    pub lines: Vec<Line>,
43}
44
45pub struct AssemblyParser {
46    current_section: String,
47    sections: Vec<Section>,
48    symbols: Vec<Symbol>,
49    instructions: Vec<ParsedInstruction>,
50    lines: Vec<Line>,
51    pending_globals: HashSet<String>,
52}
53
54impl Default for AssemblyParser {
55    fn default() -> Self {
56        Self::new()
57    }
58}
59
60impl AssemblyParser {
61    pub fn new() -> Self {
62        Self {
63            current_section: ".text".to_string(),
64            sections: Vec::new(),
65            symbols: Vec::new(),
66            instructions: Vec::new(),
67            lines: Vec::new(),
68            pending_globals: HashSet::new(),
69        }
70    }
71
72    pub fn parse(&mut self, text: &str) -> Result<ParsedAssembly, RasError> {
73        for line in text.lines() {
74            let line = line.trim();
75            if line.is_empty() || line.starts_with('#') || line.starts_with("//") {
76                continue;
77            }
78
79            // Handle "label:" and "label: directive/instruction" forms.
80            // A label token has no internal whitespace and ends with ':'.
81            if let Some(colon_pos) = line.find(':') {
82                let before = &line[..colon_pos];
83                if !before.is_empty() && !before.contains(char::is_whitespace) {
84                    let label = before.trim();
85                    let global = self.pending_globals.remove(label);
86                    let sym = Symbol {
87                        name: label.to_string(),
88                        global,
89                        section: self.current_section.clone(),
90                    };
91                    self.symbols.push(sym.clone());
92                    self.lines.push(Line::Label(sym));
93                    let rest = line[colon_pos + 1..].trim();
94                    if !rest.is_empty() {
95                        if rest.starts_with('.') {
96                            self.parse_directive(rest)?;
97                        } else {
98                            self.parse_instruction(rest)?;
99                        }
100                    }
101                    continue;
102                }
103            }
104
105            if line.starts_with('.') {
106                self.parse_directive(line)?;
107            } else {
108                self.parse_instruction(line)?;
109            }
110        }
111
112        Ok(ParsedAssembly {
113            sections: self.sections.clone(),
114            symbols: self.symbols.clone(),
115            instructions: self.instructions.clone(),
116            lines: self.lines.clone(),
117        })
118    }
119
120    fn parse_directive(&mut self, line: &str) -> Result<(), RasError> {
121        let parts: Vec<&str> = line.split_whitespace().collect();
122        if parts.is_empty() {
123            return Ok(());
124        }
125
126        match parts[0] {
127            ".text" => {
128                self.current_section = ".text".to_string();
129                self.sections.push(Section {
130                    name: ".text".to_string(),
131                    flags: SectionFlags {
132                        alloc: true,
133                        exec: true,
134                        write: false,
135                    },
136                });
137            }
138            ".data" => {
139                self.current_section = ".data".to_string();
140                self.sections.push(Section {
141                    name: ".data".to_string(),
142                    flags: SectionFlags {
143                        alloc: true,
144                        exec: false,
145                        write: true,
146                    },
147                });
148            }
149            ".global" | ".globl" => {
150                if parts.len() < 2 {
151                    return Err(RasError::ParseError(
152                        ".global requires a symbol name".to_string(),
153                    ));
154                }
155                for name in &parts[1..] {
156                    self.pending_globals.insert((*name).to_string());
157                }
158            }
159            ".asciz" | ".string" => {
160                let rest = line[parts[0].len()..].trim();
161                if let Some(bytes) = parse_quoted_string_bytes(rest, true) {
162                    self.lines.push(Line::Data(bytes));
163                }
164            }
165            ".ascii" => {
166                let rest = line[parts[0].len()..].trim();
167                if let Some(bytes) = parse_quoted_string_bytes(rest, false) {
168                    self.lines.push(Line::Data(bytes));
169                }
170            }
171            ".section" | ".align" | ".balign" | ".p2align" | ".byte" | ".short" | ".int"
172            | ".long" | ".quad" | ".zero" | ".space"
173            | ".skip" | ".cfi_startproc" | ".cfi_endproc" | ".cfi_def_cfa" | ".size" | ".type"
174            | ".ident" | ".file" => {}
175            _ => {}
176        }
177
178        Ok(())
179    }
180
181    fn parse_instruction(&mut self, line: &str) -> Result<(), RasError> {
182        let parts: Vec<&str> = line.split_whitespace().collect();
183        if parts.is_empty() {
184            return Ok(());
185        }
186
187        let opcode = parts[0].to_string();
188        let operands: Vec<String> = if parts.len() > 1 {
189            parts[1..]
190                .join(" ")
191                .split(',')
192                .map(|s| s.trim().to_string())
193                .collect()
194        } else {
195            Vec::new()
196        };
197
198        let inst = ParsedInstruction { opcode, operands };
199        self.instructions.push(inst.clone());
200        self.lines.push(Line::Instruction(inst));
201        Ok(())
202    }
203}
204
/// Decodes a double-quoted assembler string literal into the bytes it denotes.
///
/// `s` is the directive argument; surrounding whitespace is ignored and the
/// remainder must start and end with `"`. Characters are emitted as their
/// UTF-8 encoding. Recognised escapes:
///
/// * `\b`, `\f`, `\n`, `\r`, `\t`, `\"`, `\\`
/// * `\NNN` — one to three octal digits (so `\0` is NUL and `\033` is ESC)
/// * `\xHH…` / `\XHH…` — all following hex digits are combined and the low
///   eight bits are kept
///
/// Any other escaped character stands for itself. When `null_terminate` is
/// set, a trailing `0` byte is appended.
///
/// Returns `None` if the quotes are missing or the literal ends in a lone
/// backslash.
fn parse_quoted_string_bytes(s: &str, null_terminate: bool) -> Option<Vec<u8>> {
    let inner = s.trim().strip_prefix('"')?.strip_suffix('"')?;
    let mut out = Vec::with_capacity(inner.len() + usize::from(null_terminate));
    let mut utf8 = [0u8; 4];
    let mut chars = inner.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            // Emit the character's UTF-8 bytes; a plain `as u8` cast would
            // truncate anything outside ASCII.
            out.extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
            continue;
        }

        let escape = chars.next()?;
        match escape {
            'b' => out.push(0x08),
            'f' => out.push(0x0c),
            'n' => out.push(b'\n'),
            'r' => out.push(b'\r'),
            't' => out.push(b'\t'),
            '"' => out.push(b'"'),
            '\\' => out.push(b'\\'),
            '0'..='7' => {
                // Octal escape: the digit just read plus up to two more.
                let mut value = escape.to_digit(8)?;
                for _ in 0..2 {
                    match chars.peek().and_then(|next| next.to_digit(8)) {
                        Some(digit) => {
                            value = value * 8 + digit;
                            chars.next();
                        }
                        None => break,
                    }
                }
                // Three octal digits can exceed one byte; keep the low 8 bits.
                out.push((value & 0xff) as u8);
            }
            'x' | 'X' => {
                let mut value: u32 = 0;
                let mut saw_digit = false;
                while let Some(digit) = chars.peek().and_then(|next| next.to_digit(16)) {
                    // Mask each step so arbitrarily long digit runs cannot overflow.
                    value = ((value << 4) | digit) & 0xff;
                    saw_digit = true;
                    chars.next();
                }
                if saw_digit {
                    out.push(value as u8);
                } else {
                    // `\x` with no digits: keep the letter itself.
                    out.extend_from_slice(escape.encode_utf8(&mut utf8).as_bytes());
                }
            }
            other => out.extend_from_slice(other.encode_utf8(&mut utf8).as_bytes()),
        }
    }

    if null_terminate {
        out.push(0);
    }
    Some(out)
}
230
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a fresh parser over `src`, panicking if parsing fails.
    fn parse(src: &str) -> ParsedAssembly {
        let mut parser = AssemblyParser::new();
        parser.parse(src).expect("parse failed")
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let parsed = parse("");
        assert!(parsed.lines.is_empty());
        assert!(parsed.symbols.is_empty());
        assert!(parsed.instructions.is_empty());
    }

    #[test]
    fn comments_and_blank_lines_are_skipped() {
        let parsed = parse("# this is a comment\n// also a comment\n\n");
        assert!(parsed.lines.is_empty());
    }

    #[test]
    fn label_is_parsed_and_recorded_in_symbols() {
        let parsed = parse("main:");
        assert_eq!(parsed.symbols.len(), 1);
        let symbol = &parsed.symbols[0];
        assert_eq!(symbol.name, "main");
        assert!(!symbol.global);
        assert!(matches!(&parsed.lines[0], Line::Label(s) if s.name == "main"));
    }

    #[test]
    fn global_directive_marks_following_label_as_global() {
        let parsed = parse(".globl main\nmain:");
        assert_eq!(parsed.symbols.len(), 1);
        assert!(parsed.symbols[0].global, "symbol should be global");
    }

    #[test]
    fn global_without_label_does_not_panic() {
        let parsed = parse(".global _foo");
        assert!(parsed.symbols.is_empty(), "no label defined yet");
    }

    #[test]
    fn global_missing_name_returns_error() {
        let mut parser = AssemblyParser::new();
        assert!(parser.parse(".global").is_err());
    }

    #[test]
    fn instruction_with_no_operands() {
        let parsed = parse("ret");
        assert_eq!(parsed.instructions.len(), 1);
        let inst = &parsed.instructions[0];
        assert_eq!(inst.opcode, "ret");
        assert!(inst.operands.is_empty());
    }

    #[test]
    fn instruction_with_two_operands() {
        let parsed = parse("mov rax, rbx");
        let inst = &parsed.instructions[0];
        assert_eq!(inst.opcode, "mov");
        assert_eq!(inst.operands, &["rax", "rbx"]);
    }

    #[test]
    fn instruction_with_three_operands() {
        let parsed = parse("add x0, x1, x2");
        assert_eq!(parsed.instructions[0].operands, &["x0", "x1", "x2"]);
    }

    #[test]
    fn text_and_data_directives_push_sections() {
        let parsed = parse(".text\n.data");
        assert_eq!(parsed.sections.len(), 2);

        let text = &parsed.sections[0];
        assert_eq!(text.name, ".text");
        assert!(text.flags.exec);
        assert!(!text.flags.write);

        let data = &parsed.sections[1];
        assert_eq!(data.name, ".data");
        assert!(data.flags.write);
        assert!(!data.flags.exec);
    }

    #[test]
    fn label_section_tracks_current_section() {
        let parsed = parse(".data\nvar:");
        assert_eq!(parsed.symbols[0].section, ".data");
    }

    #[test]
    fn known_no_op_directives_are_silently_ignored() {
        let parsed = parse(".align 16\n.size foo, 4\n.type foo, @function");
        assert!(parsed.lines.is_empty());
    }

    #[test]
    fn mixed_labels_and_instructions_preserve_order() {
        let parsed = parse("foo:\n  mov rax, 0\n  ret");
        assert_eq!(parsed.lines.len(), 3);
        assert!(matches!(&parsed.lines[0], Line::Label(_)));
        assert!(matches!(&parsed.lines[1], Line::Instruction(_)));
        assert!(matches!(&parsed.lines[2], Line::Instruction(_)));
    }

    #[test]
    fn multiple_globals_on_one_directive() {
        // `.global` takes a list of names; every one of them must be queued.
        let parsed = parse(".global a b\na:\nb:");
        assert!(parsed.symbols.iter().all(|s| s.global));
        assert_eq!(parsed.symbols.len(), 2);
    }

    #[test]
    fn inline_label_with_asciz_emits_label_and_data() {
        let parsed = parse(".L_fmt: .asciz \"%lld\\n\"");
        assert_eq!(parsed.symbols.len(), 1);
        assert_eq!(parsed.symbols[0].name, ".L_fmt");
        assert!(matches!(&parsed.lines[0], Line::Label(s) if s.name == ".L_fmt"));

        let data_line = &parsed.lines[1];
        match data_line {
            Line::Data(bytes) => {
                assert_eq!(bytes, b"%lld\n\0");
            }
            _ => panic!("expected Line::Data, got {:?}", data_line),
        }
    }

    #[test]
    fn asciz_standalone_emits_data() {
        let parsed = parse(".asciz \"hello\"");
        assert_eq!(parsed.lines.len(), 1);
        match &parsed.lines[0] {
            Line::Data(bytes) => assert_eq!(bytes, b"hello\0"),
            _ => panic!("expected Line::Data"),
        }
    }
}