1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
pub mod assembler_errors;
pub mod comment_parsers;
pub mod directive_parsers;
pub mod instruction_parsers;
pub mod label_parsers;
pub mod opcode_parsers;
pub mod operand_parsers;
pub mod program_parsers;
pub mod register_parsers;
pub mod symbols;
use byteorder::{LittleEndian, WriteBytesExt};
use nom::types::CompleteStr;
use assembler::assembler_errors::AssemblerError;
use assembler::instruction_parsers::AssemblerInstruction;
use assembler::program_parsers::{program, Program};
use assembler::symbols::{Symbol, SymbolTable, SymbolType};
use instruction::Opcode;
/// Magic bytes that open every bytecode file: the ASCII codes for `EPIE`
/// (`0x45 0x50 0x49 0x45`).
pub const PIE_HEADER_PREFIX: [u8; 4] = *b"EPIE";
/// Length in bytes that the header is padded to. The 4-byte prefix is followed by 60 zero
/// bytes that are reserved for later use.
pub const PIE_HEADER_LENGTH: usize = 64;
/// The lexical elements an assembly instruction is built from.
///
/// Each `AssemblerInstruction` stores its opcode, label, directive and operands as
/// optional `Token`s.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    /// An operation, carrying the VM opcode it stands for.
    Op { code: Opcode },
    /// A register operand, identified by its number.
    Register { reg_num: u8 },
    /// An integer literal operand.
    IntegerOperand { value: i32 },
    /// A floating point literal operand.
    FloatOperand { value: f64 },
    /// The declaration of a label, carrying the label's name.
    LabelDeclaration { name: String },
    /// A reference to a label, carrying the referenced name.
    LabelUsage { name: String },
    /// A directive, carrying the directive's name.
    Directive { name: String },
    /// A string operand; `name` presumably holds the string's text (confirm against the
    /// operand parsers).
    IrString { name: String },
    /// A comment; it carries no data.
    Comment,
}
/// State for the two-pass assembler.
///
/// The first pass collects labels, section headers and read-only constants; the second
/// pass turns the instructions into bytecode.
#[derive(Debug, Default)]
pub struct Assembler {
    /// Which of the two phases the assembler is currently in.
    phase: AssemblerPhase,
    /// Symbol table holding the labels of constants and code locations.
    pub symbols: SymbolTable,
    /// Bytes of the read-only data section that constants are written into.
    pub ro: Vec<u8>,
    /// The compiled bytecode generated from the assembly instructions.
    pub bytecode: Vec<u8>,
    /// Current write offset into the read-only section.
    ro_offset: u32,
    /// Every section header encountered so far, in order of appearance.
    sections: Vec<AssemblerSection>,
    /// The section the assembler is currently inside, if any header has been seen.
    current_section: Option<AssemblerSection>,
    /// Index of the instruction currently being processed.
    current_instruction: u32,
    /// Errors collected along the way; they are handed back to the caller at the end.
    errors: Vec<AssemblerError>,
    /// Scratch buffer.
    buf: [u8; 4],
}
impl Assembler {
/// Creates an assembler in its initial state: first phase, empty symbol table, no
/// sections seen, no errors, and empty read-only and bytecode buffers.
pub fn new() -> Assembler {
    Assembler {
        phase: AssemblerPhase::First,
        symbols: SymbolTable::new(),
        ro: Vec::new(),
        bytecode: Vec::new(),
        ro_offset: 0,
        sections: Vec::new(),
        current_section: None,
        current_instruction: 0,
        errors: Vec::new(),
        buf: [0; 4],
    }
}
pub fn assemble(&mut self, raw: &str) -> Result<Vec<u8>, Vec<AssemblerError>> {
match program(CompleteStr(raw)) {
Ok((_remainder, mut program)) => {
// Start processing the AssembledInstructions
self.process_first_phase(&mut program);
if !self.errors.is_empty() {
// TODO: Can we avoid a clone here?
error!(
"Errors were found in the first parsing phase: {:?}",
self.errors
);
return Err(self.errors.clone());
};
debug!("First parsing phase complete");
debug!("Phase 1 program: {:#?}", program);
// Make sure that we have at least one data section and one code section
if self.sections.len() != 2 {
// TODO: Detail out which one(s) are missing
error!("Did not find at least two sections.");
self.errors.push(AssemblerError::InsufficientSections);
// TODO: Can we avoid a clone here?
return Err(self.errors.clone());
}
// Run the second pass, which translates opcodes and associated operands into the bytecode
let mut body = self.process_second_phase(&program);
debug!("Phase 2 program: {:#?}", program);
// Get the header so we can smush it into the bytecode letter
let mut assembled_program = self.write_pie_header();
debug!("Length of header is: {}", assembled_program.len());
// Merge the header with the populated body vector
assembled_program.append(&mut body);
debug!("Complete program is: {:#?}", assembled_program);
Ok(assembled_program)
}
// If there were parsing errors, bad syntax, etc, this arm is run
Err(e) => {
error!("There was an error parsing the code: {:?}", e);
Err(vec![AssemblerError::ParseError {
error: e.to_string(),
}])
}
}
}
/// Runs the first pass of the two-pass assembling process.
///
/// The pass does two things:
/// 1. Every instruction flagged by `is_integer_needs_splitting` is split in two: the
///    instruction keeps one byte of its 16-bit integer operand and a new `LUI`
///    instruction carrying the other byte is inserted directly after it.
/// 2. Every instruction is scanned for labels (which go into the symbol table) and
///    directives (section headers and constants).
///
/// Errors are accumulated in `self.errors`; on return the assembler is in the second phase.
fn process_first_phase(&mut self, p: &mut Program) {
    info!("Beginning search for LOAD instructions that need to be split up");
    let mut inserts_to_do = Vec::new();
    for (idx, i) in p.instructions.iter_mut().enumerate() {
        if !i.is_integer_needs_splitting() {
            continue;
        }
        let value = match i.get_integer_value() {
            Some(value) => value,
            None => {
                // Nothing to split without an integer operand; skip instead of panicking
                error!(
                    "Instruction {} was marked for splitting but has no integer operand",
                    idx
                );
                continue;
            }
        };
        let mut wtr = vec![];
        wtr.write_i16::<LittleEndian>(value)
            .expect("writing to a Vec<u8> cannot fail");
        // Little-endian layout: wtr[0] is the low byte, wtr[1] the high byte.
        // NOTE(review): the original instruction keeps the high byte and the LUI gets the
        // low byte -- confirm this matches how the VM executes LOAD and LUI.
        i.operand2 = Some(Token::IntegerOperand {
            value: wtr[1].into(),
        });
        let new_instruction = AssemblerInstruction {
            opcode: Some(Token::Op { code: Opcode::LUI }),
            label: None,
            directive: None,
            operand1: i.operand1.clone(),
            operand2: Some(Token::IntegerOperand {
                value: wtr[0].into(),
            }),
            operand3: None,
        };
        inserts_to_do.push((idx + 1, new_instruction));
    }
    // The recorded positions refer to the vector *before* any insertion. Applying them
    // from the back to the front keeps every not-yet-applied position valid; applying
    // them front to back would shift the later targets and misplace the new instructions.
    for (position, instruction) in inserts_to_do.into_iter().rev() {
        p.instructions.insert(position, instruction);
    }

    info!("Beginning first parsing phase");
    // Iterate over every instruction, even though in the first phase we only care about labels and directives
    for i in &p.instructions {
        debug!("Parsing instruction: {}", i);
        if i.is_label() {
            // TODO: Factor this out into another function? Put it in `process_label_declaration` maybe?
            if self.current_section.is_some() {
                // If we have hit a segment header already (e.g., `.code`) then we are ok
                debug!(
                    "Parsing label declaration in first phase: {:?} with offset {:?}",
                    i.get_label_name(),
                    self.current_instruction * 4
                );
                self.process_label_declaration(i);
            } else {
                // If we have *not* hit a segment header yet, then we have a label outside of a segment, which is not allowed
                error!(
                    "Label found outside of a section in first phase: {:?}",
                    i.get_label_name()
                );
                self.errors.push(AssemblerError::NoSegmentDeclarationFound {
                    instruction: self.current_instruction,
                });
            }
        }
        if i.is_directive() {
            self.process_directive(i);
        }
        // This is used to keep track of which instruction we hit an error on
        self.current_instruction += 1;
    }
    self.phase = AssemblerPhase::Second;
}
/// Runs the second pass of the assembler and returns the encoded instruction bytes.
///
/// Directives are skipped, and skipped directives do not advance
/// `self.current_instruction`. Every instruction that has an opcode is encoded with the
/// symbol table built during the first pass.
fn process_second_phase(&mut self, p: &Program) -> Vec<u8> {
    info!("Beginning second parsing phase");
    self.current_instruction = 0;
    // The executable bytes are collected separately so the caller can merge them with
    // the header afterwards
    let mut encoded = Vec::new();
    for instruction in &p.instructions {
        if instruction.is_directive() {
            debug!(
                "Found a directive in second phase {:?}, bypassing",
                instruction.directive
            );
            continue;
        }
        if instruction.is_opcode() {
            encoded.extend(instruction.to_bytes(&self.symbols));
        }
        self.current_instruction += 1;
    }
    encoded
}
/// Registers the label declared by instruction `i` in the symbol table.
///
/// Records `StringConstantDeclaredWithoutLabel` when the instruction yields no label
/// name and `SymbolAlreadyDeclared` when the name is already in the table. Otherwise a
/// `SymbolType::Label` symbol is added with offset `current_instruction * 4 + 60`.
///
/// NOTE(review): the constant 60 presumably accounts for header bytes in front of the
/// code -- confirm against the VM's addressing.
fn process_label_declaration(&mut self, i: &AssemblerInstruction) {
    // Without a label name there is nothing to register
    let label = if let Some(label) = i.get_label_name() {
        label
    } else {
        self.errors
            .push(AssemblerError::StringConstantDeclaredWithoutLabel {
                instruction: self.current_instruction,
            });
        return;
    };
    debug!(
        "Found label declaration: {} on line {}",
        label, self.current_instruction
    );

    if self.symbols.has_symbol(&label) {
        // The label already has an entry in the symbol table
        self.errors.push(AssemblerError::SymbolAlreadyDeclared);
    } else {
        // First time we see this name, so stick it in the table
        let offset = (self.current_instruction * 4) + 60;
        let symbol = Symbol::new_with_offset(label, SymbolType::Label, offset);
        debug!(
            "Added new symbol to table: {:?} with offset {:?}",
            symbol, offset
        );
        self.symbols.add_symbol(symbol);
    }
}
/// Dispatches the directive carried by instruction `i`.
///
/// A directive without operands is treated as a section header. A directive with
/// operands must be `asciiz` or `integer`; any other name is recorded as
/// `UnknownDirectiveFound`. A directive whose name cannot be read is only logged.
fn process_directive(&mut self, i: &AssemblerInstruction) {
    // First let's make sure we have a parseable name
    let directive_name = if let Some(name) = i.get_directive_name() {
        name
    } else {
        error!("Directive has an invalid name: {:?}", i);
        return;
    };

    // No operands means this is a section header such as `.code`
    if !i.has_operands() {
        self.process_section_header(&directive_name);
        return;
    }

    // With operands, the name decides which kind of constant is being declared
    match directive_name.as_ref() {
        "asciiz" => self.handle_asciiz(i),
        "integer" => self.handle_integer(i),
        _ => self.errors.push(AssemblerError::UnknownDirectiveFound {
            directive: directive_name.clone(),
        }),
    }
}
/// Handles a declaration of a null-terminated string:
/// hello: .asciiz 'Hello!'
fn handle_asciiz(&mut self, i: &AssemblerInstruction) {
// Being a constant declaration, this is only meaningful in the first pass
if self.phase != AssemblerPhase::First {
return;
}
// In this case, operand1 will have the entire string we need to read in to RO memory
match i.get_string_constant() {
Some(s) => {
match i.get_label_name() {
Some(name) => {
self.symbols.set_symbol_offset(&name, self.ro_offset);
}
None => {
// This would be someone typing:
// .asciiz 'Hello'
println!("Found a string constant with no associated label!");
return;
}
};
// We'll read the string into the read-only section byte-by-byte
for byte in s.as_bytes() {
self.ro.push(*byte);
self.ro_offset += 1;
}
// This is the null termination bit we are using to indicate a string has ended
self.ro.push(0);
self.ro_offset += 1;
}
None => {
// This just means someone typed `.asciiz` for some reason
println!("String constant following an .asciiz was empty");
}
}
}
/// Handles a declaration of an integer numerical constant:
/// total_cats: .integer #500
///
/// Only acts during the first phase. The label's symbol is pointed at the current
/// read-only offset, then the value is appended to the read-only section as four
/// little-endian bytes.
///
/// A constant without a label, or a directive without an integer operand, is reported
/// on stdout and nothing is written.
/// NOTE(review): no `AssemblerError` variant for an unlabeled integer constant is
/// visible from here, so that case is not added to `self.errors`.
fn handle_integer(&mut self, i: &AssemblerInstruction) {
    // Being a constant declaration, this is only meaningful in the first pass
    if self.phase != AssemblerPhase::First {
        return;
    }
    let value = match i.get_i32_constant() {
        Some(value) => value,
        None => {
            // This just means someone typed `.integer` without a value
            println!("integer constant following an .integer was empty");
            return;
        }
    };
    let name = match i.get_label_name() {
        Some(name) => name,
        None => {
            // This would be someone typing:
            // .integer 50
            println!("Found an integer constant with no associated label!");
            return;
        }
    };
    self.symbols.set_symbol_offset(&name, self.ro_offset);

    // Append the four little-endian bytes straight to the read-only section
    self.ro
        .write_i32::<LittleEndian>(value)
        .expect("writing to a Vec<u8> cannot fail");
    self.ro_offset += 4;
}
/// Handles a declaration of a section header, such as:
/// .code
fn process_section_header(&mut self, header_name: &str) {
let mut new_section: AssemblerSection = header_name.into();
// Only specific section names are allowed
if new_section == AssemblerSection::Unknown {
println!(
"Found an section header that is unknown: {:#?}",
header_name
);
return;
}
match new_section {
AssemblerSection::Code {
ref mut starting_instruction,
} => {
debug!("Code section starts at: {}", self.current_instruction);
*starting_instruction = Some(self.current_instruction)
}
AssemblerSection::Data {
ref mut starting_instruction,
} => {
debug!("Data section starts at: {}", self.current_instruction);
*starting_instruction = Some(self.current_instruction)
}
AssemblerSection::Unknown => {
error!("Found a section header that is unknown: {:?}", new_section)
}
};
// TODO: Check if we really need to keep a list of all sections seen
self.sections.push(new_section.clone());
self.current_section = Some(new_section);
}
/// Convenience function that builds the executable header.
///
/// Layout: `PIE_HEADER_PREFIX`, zero padding up to `PIE_HEADER_LENGTH` bytes, then the
/// length of the read-only section as a little-endian `u32`.
fn write_pie_header(&self) -> Vec<u8> {
    let mut header = PIE_HEADER_PREFIX.to_vec();
    // Pad the rest of the bytecode header with zeros
    if header.len() < PIE_HEADER_LENGTH {
        header.resize(PIE_HEADER_LENGTH, 0);
    }
    // Record the size of the RO section so that the VM knows where it ends
    header
        .write_u32::<LittleEndian>(self.ro.len() as u32)
        .unwrap();
    header
}
}
/// The pass of the two-pass assembling process that is currently running.
#[derive(Clone, Debug, PartialEq)]
pub enum AssemblerPhase {
    /// The first pass; this is the phase a new assembler starts in.
    First,
    /// The second pass.
    Second,
}

/// An assembler phase defaults to the first pass.
impl Default for AssemblerPhase {
    fn default() -> AssemblerPhase {
        AssemblerPhase::First
    }
}
/// A section of an assembly program, as introduced by a section header.
#[derive(Clone, Debug, PartialEq)]
pub enum AssemblerSection {
    /// The data section; `starting_instruction` is the index of the instruction at which
    /// the section begins, once known.
    Data { starting_instruction: Option<u32> },
    /// The code section; `starting_instruction` is the index of the instruction at which
    /// the section begins, once known.
    Code { starting_instruction: Option<u32> },
    /// Any section name that is neither `data` nor `code`.
    Unknown,
}

/// A section is `Unknown` until proven otherwise.
impl Default for AssemblerSection {
    fn default() -> AssemblerSection {
        AssemblerSection::Unknown
    }
}

/// Maps a section name (without the leading dot) to its section. The match is exact and
/// case-sensitive; the starting instruction is left unset.
impl<'a> From<&'a str> for AssemblerSection {
    fn from(name: &str) -> AssemblerSection {
        let starting_instruction = None;
        match name {
            "code" => AssemblerSection::Code {
                starting_instruction,
            },
            "data" => AssemblerSection::Data {
                starting_instruction,
            },
            _ => AssemblerSection::Unknown,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use vm::VM;

    /// Assembles a small but correct program and checks the size of the output, both as
    /// returned by the assembler and once loaded into a VM.
    #[test]
    fn test_assemble_program() {
        let source = r"
.data
.code
load $0 #100
load $1 #1
load $2 #0
test: inc $0
neq $0 $2
jmpe @test
hlt
";
        let mut asm = Assembler::new();
        let bytes = asm.assemble(source).unwrap();
        let mut vm = VM::new();
        assert_eq!(bytes.len(), 96);
        vm.add_bytes(bytes);
        assert_eq!(vm.program.len(), 96);
    }

    /// Checks that the size of the read-only data (`Hello` plus its terminating zero
    /// byte, so 6) is written at byte 64 of the assembled output.
    #[test]
    fn test_code_start_offset_written() {
        let source = r"
.data
test1: .asciiz 'Hello'
.code
load $0 #100
load $1 #1
load $2 #0
test: inc $0
neq $0 $2
jmpe @test
hlt
";
        let mut asm = Assembler::new();
        let result = asm.assemble(source);
        assert!(result.is_ok());
        let bytes = result.unwrap();
        assert_eq!(bytes[64], 6);
    }

    /// Tests that symbols can be added to the symbol table and looked up by name.
    #[test]
    fn test_symbol_table() {
        let mut table = SymbolTable::new();
        table.add_symbol(Symbol::new_with_offset(
            "test".to_string(),
            SymbolType::Label,
            12,
        ));
        assert_eq!(table.symbols.len(), 1);

        let found = table.symbol_value("test");
        assert!(found.is_some());
        assert_eq!(found.unwrap(), 12);

        let missing = table.symbol_value("does_not_exist");
        assert!(!missing.is_some());
    }

    /// A program that declares a string constant in its data section assembles.
    #[test]
    fn test_ro_data_asciiz() {
        let source = r"
.data
test: .asciiz 'This is a test'
.code
";
        let mut asm = Assembler::new();
        let result = asm.assemble(source);
        assert!(result.is_ok());
    }

    /// A program that declares an integer constant in its data section assembles.
    #[test]
    fn test_ro_data_i32() {
        let source = r"
.data
test: .integer #300
.code
";
        let mut asm = Assembler::new();
        let result = asm.assemble(source);
        assert!(result.is_ok());
    }

    /// A program with an unrecognised `.wrong` section header and no `.data` section
    /// must fail to assemble.
    #[test]
    fn test_bad_ro_data() {
        let source = r"
.code
test: .asciiz 'This is a test'
.wrong
";
        let mut asm = Assembler::new();
        let result = asm.assemble(source);
        assert!(!result.is_ok());
    }

    /// A label that appears before any section header produces exactly one error in the
    /// first phase.
    #[test]
    fn test_first_phase_no_segment() {
        let source = "hello: .asciiz 'Fail'";
        let parsed = program(CompleteStr(source));
        assert!(parsed.is_ok());
        let (_, mut parsed_program) = parsed.unwrap();

        let mut asm = Assembler::new();
        asm.process_first_phase(&mut parsed_program);
        assert_eq!(asm.errors.len(), 1);
    }

    /// A labeled constant inside a proper section passes the first phase without errors.
    #[test]
    fn test_first_phase_inside_segment() {
        let source = r"
.data
test: .asciiz 'Hello'
";
        let parsed = program(CompleteStr(source));
        assert!(parsed.is_ok());
        let (_, mut parsed_program) = parsed.unwrap();

        let mut asm = Assembler::new();
        asm.process_first_phase(&mut parsed_program);
        assert_eq!(asm.errors.len(), 0);
    }
}