lc3_ensemble/asm/
encoding.rs

1//! Formatters which can read and write memory object files into disk.
2//! 
3//! The [`ObjFileFormat`] trait describes an implementation of reading/writing object files into disk.
4//! This module provides an implementation of the trait:
5//! - [`BinaryFormat`]: A binary representation of object file data
6//! - [`TextFormat`]: A text representation of object file data
7
8use std::collections::{BTreeMap, HashMap};
9use std::fmt::Write;
10
11use super::{DebugSymbols, ObjectFile, SymbolData, SymbolTable};
12
13/// A trait defining object file formats.
14// This trait might be an abuse of notation/namespacing, so oops.
15pub trait ObjFileFormat {
16    /// Representation of the serialized format.
17    /// 
18    /// For binary formats, `[u8]` should be used.
19    /// For text-based formats,`str` should be used.
20    type Stream: ToOwned + ?Sized;
21    /// Serializes into the stream format.
22    fn serialize(o: &ObjectFile) -> <Self::Stream as ToOwned>::Owned;
23    /// Deserializes from the stream format, returning `None`
24    /// if an error occurred during deserialization.
25    fn deserialize(i: &Self::Stream) -> Option<ObjectFile>;
26}
27
28// BINARY!
29/// A binary format of object file data.
30pub struct BinaryFormat;
31
32const BFMT_MAGIC: &[u8] = b"obj\x21\x10";
33const BFMT_VER: &[u8] = b"\x00\x01";
34impl ObjFileFormat for BinaryFormat {
35    type Stream = [u8];
36
37    fn serialize(o: &ObjectFile) -> <Self::Stream as ToOwned>::Owned {
38        // Object file specification:
39        //
40        // Object file consists of a header and an arbitrary number of data blocks.
41        // 
42        // The header consists of:
43        // - The magic number (b"obj\x21\x10")
44        //      Coincidentally, `x21` is `!`, so opening this file with read "obj!"
45        //      That's fun.
46        // - The version (2 bytes)
47        //      Note that this is really arbitrary and backwards-incompatible changes
48        //      may occur without version upgrades.
49        //      The version will likely only upgrade if for some extenuating circumstance,
50        //      the exact object file format of a previous iteration must be maintained (never).
51        //
52        // Data is divided into discrete chunks, which start with one of:
53        // - 0x00: assembled bytecode segment
54        // - 0x01: label symbol table entry
55        // - 0x02: line symbol table entry
56        // - 0x03: source code information
57        // - 0x04: relocation map entry
58        //
59        // Block 0x00 consists of:
60        // - the identifier byte 0x00 (1 byte)
61        // - address where block starts (2 bytes)
62        // - length of the block (2 bytes)
63        // - the .orig span (16 bytes)
64        // - the array of words (3n bytes)
65        //    - each word is either 0xFF???? (initialized data) or 0x000000 (uninitialized data)
66        //
67        // Block 0x01 consists of:
68        // - the identifier byte 0x01 (1 byte)
69        // - address of the label (2 bytes)
70        // - whether the label is external (1 byte)
71        // - the start of the label in source (8 bytes)
72        // - the length of the label's name (8 bytes)
73        // - the label (n bytes)
74        //
75        // Block 0x02 consists of:
76        // - the identifier byte 0x02 (1 byte)
77        // - the source line number (8 bytes)
78        // - length of contiguous block (2 bytes)
79        // - the contiguous block (2n bytes)
80        // 
81        // Block 0x03 consists of:
82        // - the identifier byte 0x03 (1 byte)
83        // - the length of the source code (8 bytes)
84        // - the source code (n bytes)
85        //
86        // Block 0x04 consists of:
87        // - the identifier byte 0x04 (1 byte)
88        // - the address to replace at (2 bytes)
89        // - the length of the label's name (8 bytes)
90        // - the label (n bytes)
91
92        let mut bytes = BFMT_MAGIC.to_vec();
93        bytes.extend_from_slice(BFMT_VER);
94
95        for (addr, data) in o.block_iter() {
96            bytes.push(0x00);
97            bytes.extend(u16::to_le_bytes(addr));
98            bytes.extend(u16::to_le_bytes(data.len() as u16));
99            for &word in data {
100                if let Some(val) = word {
101                    bytes.push(0xFF);
102                    bytes.extend(u16::to_le_bytes(val));
103                } else {
104                    bytes.extend([0x00; 3]);
105                }
106            }
107        }
108
109        if let Some(sym) = &o.sym {
110            for (label, &super::SymbolData { addr, src_start, external }) in sym.label_map.iter() {
111                bytes.push(0x01);
112                bytes.extend(u16::to_le_bytes(addr));
113                bytes.push(u8::from(external));
114                bytes.extend(u64::to_le_bytes(src_start as u64));
115                bytes.extend(u64::to_le_bytes(label.len() as u64));
116                bytes.extend_from_slice(label.as_bytes());
117            }
118
119            if let Some(debug_sym) = &sym.debug_symbols {
120                for (lno, data) in debug_sym.line_map.block_iter() {
121                    bytes.push(0x02);
122                    bytes.extend(u64::to_le_bytes(lno as u64));
123                    bytes.extend(u16::to_le_bytes(data.len() as u16));
124                    for &word in data {
125                        bytes.extend(u16::to_le_bytes(word));
126                    }
127                }
128
129                let src = &debug_sym.src_info.src;
130                bytes.push(0x03);
131                bytes.extend(u64::to_le_bytes(src.len() as u64));
132                bytes.extend_from_slice(src.as_bytes());
133            }
134
135            for (&addr, label) in &sym.rel_map {
136                bytes.push(0x04);
137                bytes.extend(u16::to_be_bytes(addr));
138                bytes.extend(u64::to_le_bytes(label.len() as u64));
139                bytes.extend_from_slice(label.as_bytes());
140            }
141        }
142
143        bytes
144    }
145
146    fn deserialize(mut vec: &Self::Stream) -> Option<ObjectFile> {
147        let mut block_map  = BTreeMap::new();
148        let mut label_map  = HashMap::new();
149        let mut rel_map    = HashMap::new();
150        let mut debug_sym  = None::<(BTreeMap<_, _>, String)>;
151
152        vec = vec.strip_prefix(BFMT_MAGIC)?
153            .strip_prefix(BFMT_VER)?;
154
155        while let Some((ident_byte, rest)) = vec.split_first() {
156            vec = rest;
157            match ident_byte {
158                0x00 => {
159                    let addr     = u16::from_le_bytes(take::<2>(&mut vec)?);
160                    let data_len = u16::from_le_bytes(take::<2>(&mut vec)?);
161
162                    let data = map_chunks::<_, 3>(take_slice(&mut vec, 3 * usize::from(data_len))?, 
163                        |[init, rest @ ..]| (init == 0xFF).then(|| u16::from_le_bytes(rest))
164                    );
165
166                    block_map.insert(addr, data);
167                },
168                0x01 => {
169                    let addr      = u16::from_le_bytes(take::<2>(&mut vec)?);
170                    let external  = u8::from_le_bytes(take::<1>(&mut vec)?) != 0;
171                    let src_start = u64::from_le_bytes(take::<8>(&mut vec)?) as usize;
172                    let str_len   = u64::from_le_bytes(take::<8>(&mut vec)?) as usize;
173                    let string    = String::from_utf8(take_slice(&mut vec, str_len)?.to_vec()).ok()?;
174
175                    label_map.insert(string, SymbolData { addr, src_start, external });
176                },
177                0x02 => {
178                    let (line_map, _) = debug_sym.get_or_insert_with(Default::default);
179                    let lno      = u64::from_le_bytes(take::<8>(&mut vec)?) as usize;
180                    let data_len = u16::from_le_bytes(take::<2>(&mut vec)?);
181                    let data     = map_chunks::<_, 2>(take_slice(&mut vec, 2 * usize::from(data_len))?, u16::from_le_bytes);
182                    
183                    // Assert line map has sorted data without duplicates,
184                    // as LineSymbolMap depends on the block being sorted
185                    // and assumes no duplicates (since no two lines map to the same address)
186                    assert_sorted_no_dup(&data)?;
187                    line_map.insert(lno, data);
188                },
189                0x03 => {
190                    let (_, src) = debug_sym.get_or_insert_with(Default::default);
191
192                    let src_len = u64::from_le_bytes(take::<8>(&mut vec)?) as usize;
193                    let obj_src = std::str::from_utf8(take_slice(&mut vec, src_len)?).ok()?;
194                    src.push_str(obj_src);
195                },
196                0x04 => {
197                    let addr = u16::from_le_bytes(take::<2>(&mut vec)?);
198                    let label_len = u64::from_le_bytes(take::<8>(&mut vec)?) as usize;
199                    let label = String::from_utf8(take_slice(&mut vec, label_len)?.to_vec()).ok()?;
200
201                    rel_map.insert(addr, label);
202                },
203                _ => return None
204            }
205        }
206
207        let debug_symbols = match debug_sym {
208            Some((line_map, src)) => Some(DebugSymbols {
209                // Error should propagate to deser
210                line_map: super::LineSymbolMap::from_blocks(line_map)?,
211                src_info: super::SourceInfo::from_string(src),
212            }),
213            None => None,
214        };
215        let sym = (!label_map.is_empty() || debug_symbols.is_some())
216            .then_some(SymbolTable { label_map, debug_symbols, rel_map });
217        Some(ObjectFile {
218            block_map,
219            sym,
220        })
221    }
222}
223
224fn take<const N: usize>(data: &mut &[u8]) -> Option<[u8; N]> {
225    take_slice(data, N)
226        .map(|slice| <[_; N]>::try_from(slice).unwrap())
227}
228fn take_slice<'a>(data: &mut &'a [u8], n: usize) -> Option<&'a [u8]> {
229    let (left, right) = try_split_at(data, n)?;
230    *data = right;
231    Some(left)
232}
233fn try_split_at(data: &[u8], n: usize) -> Option<(&[u8], &[u8])> {
234    if n > data.len() { return None; }
235    Some(data.split_at(n))
236}
237fn map_chunks<T, const N: usize>(data: &[u8], f: impl FnMut([u8; N]) -> T) -> Vec<T> {
238    assert_eq!(data.len() % N, 0);
239    
240    data.chunks_exact(N)
241        .map(|c| <[_; N]>::try_from(c).unwrap())
242        .map(f)
243        .collect()
244}
245fn assert_sorted_no_dup<T: Ord>(data: &[T]) -> Option<()> {
246    data.windows(2)
247        .map(|w| <&[_; 2]>::try_from(w).unwrap())
248        .all(|[l, r]| l < r)
249        .then_some(())
250}
251
252// TEXT!
253/// A text-based format of object file data.
254pub struct TextFormat;
255
256const TFMT_MAGIC: &str = "LC-3 OBJ FILE";
257const TFMT_UNINIT: &str = "????";
258const TABLE_DIV: &str = " | ";
259
260impl ObjFileFormat for TextFormat {
261    type Stream = str;
262
263    fn serialize(o: &ObjectFile) -> <Self::Stream as ToOwned>::Owned {
264        // Text format specification.
265        //
266        // ```text
267        // LC-3 OBJ FILE
268        // 
269        // .TEXT
270        // <start address in hex>
271        // <length of segment in dec>
272        // <instruction in hex>
273        // <...>
274        //
275        // .SYMBOL
276        // ADDR | EXT | LABEL
277        // 0000 |   0 | FOO  
278        // 0001 |   0 | BAR  
279        // 0002 |   1 | BAZ  
280        // ...
281        //
282        // .LINKER_INFO
283        // ADDR | LABEL
284        // 0002 | BAZ
285        // 0003 | BAZ
286        // 0005 | BAZ
287        // 
288        // .DEBUG
289        // LABEL | INDEX
290        // FOO   | 35
291        // BAR   | 94
292        // ====================
293        // LINE | ADDR | SOURCE
294        // 0    | 9090 | ......
295        // 1    | 9091 | ......
296        // ====================
297        // ...
298        // # Support for comments, as well.
299        // ```
300        //
301        // For a given `instruction in hex`, it prints the ASCII-hex encoding, returning ???? if uninitialized.
302        fn _ser(o: &ObjectFile) -> Result<String, std::fmt::Error> {
303            use std::fmt::Write;
304            let mut buf = String::new();
305
306            writeln!(buf, "{TFMT_MAGIC}")?;
307            writeln!(buf)?;
308            
309            writeln!(buf, ".TEXT")?;
310            for (addr, block) in o.block_iter() {
311                writeln!(buf, "{addr:04X}")?;
312                writeln!(buf, "{}", block.len())?;
313                for &m_instr in block {
314                    match m_instr {
315                        Some(instr) => writeln!(buf, "{instr:04X}")?,
316                        None => writeln!(buf, "{TFMT_UNINIT}")?,
317                    }
318                }
319            }
320            writeln!(buf)?;
321
322            if let Some(sym) = &o.sym {
323                writeln!(buf, ".SYMBOL")?;
324                if !sym.label_map.is_empty() {
325                    let mut entries: Vec<_> = sym.label_iter().collect();
326                    entries.sort_by_key(|&(name, addr, ext)| (addr, name, ext));
327
328                    writeln!(buf, "ADDR{0}EXT{0}LABEL", TABLE_DIV)?;
329                    for (label, addr, external) in entries {
330                        writeln!(buf, "{addr:04X}{0}{1:3}{0}{label}", TABLE_DIV, u8::from(external))?;
331                    }
332                }
333                writeln!(buf)?;
334                
335                writeln!(buf, ".LINKER_INFO")?;
336                if !sym.rel_map.is_empty() {
337                    let mut entries: Vec<_> = sym.rel_map.iter().collect();
338                    entries.sort();
339
340                    writeln!(buf, "ADDR{0}LABEL", TABLE_DIV)?;
341                    for (addr, label) in entries {
342                        writeln!(buf, "{addr:04X}{0}{label}", TABLE_DIV)?;
343                    }
344                }
345                writeln!(buf)?;
346
347                writeln!(buf, ".DEBUG")?;
348                writeln!(buf, "# DEBUG SYMBOLS FOR LC3TOOLS")?;
349                writeln!(buf)?;
350
351                // Display label to index mapping
352                const LABEL: &str = "LABEL";
353                const INDEX: &str = "INDEX";
354                
355                if !sym.label_map.is_empty() {
356                    let mut entries: Vec<_> = sym.label_map.iter()
357                        .map(|(label, sym_data)| (label, sym_data.src_start))
358                        .collect();
359                    entries.sort_by_key(|&(label, index)| (index, label));
360
361                    // Calculate label & index column lengths
362                    let (label_col, index_col) = entries.iter()
363                        .map(|&(label, index)| (label.len(), count_digits(index)))
364                        .fold(
365                            (LABEL.len(), INDEX.len()), 
366                            |(lc, ic), (lx, ix)| (lc.max(lx), ic.max(ix))
367                        );
368
369                    // Display!
370                    writeln!(buf, "{LABEL:1$}{0}{INDEX:2$}", TABLE_DIV, label_col, index_col)?;
371                    for (label, index) in entries {
372                        writeln!(buf, "{label:1$}{0}{index:2$}", TABLE_DIV, label_col, index_col)?;
373                    }
374                }
375                writeln!(buf, "====================")?;
376
377                if let Some(DebugSymbols { line_map, src_info }) = &sym.debug_symbols {
378                    // Create line table
379                    let mut line_table = BTreeMap::from_iter({
380                        src_info.nl_indices.iter().enumerate()
381                            .map(|(lno, &idx)| (lno, (Some(idx), None)))
382                    });
383                    
384                    for (start_line, block) in line_map.block_iter() {
385                        for (i, &addr) in block.iter().enumerate() {
386                            let (_, entry_addr) = line_table.entry(start_line.wrapping_add(i)).or_default();
387                            entry_addr.replace(addr);
388                        }
389                    }
390    
391                    // Display line table
392                    const LINE: &str = "LINE";
393                    if !line_table.is_empty() {
394                        // Compute line & index column length
395                        let (mut last_line, mut last_index) = (None, None);
396                        for (&line, &(index, _)) in line_table.iter().rev() {
397                            if last_line.is_none() { last_line.replace(line); }
398                            if last_index.is_none() { last_index = index; }
399    
400                            if last_line.is_some() && last_index.is_some() {
401                                break;
402                            }
403                        }
404                        let line_col = LINE.len().max(count_digits(last_line.unwrap_or(0)));
405    
406                        // Display!
407                        writeln!(buf, "{LINE:1$}{0}ADDR{0}SOURCE", TABLE_DIV, line_col)?;
408                        for (line, (_, m_addr)) in line_table {
409                            write!(buf, "{line:0$}", line_col)?;
410                            write!(buf, "{TABLE_DIV}")?;
411                            match m_addr {
412                                Some(addr) => write!(buf, "{addr:04X}"),
413                                None => write!(buf, "{TFMT_UNINIT}")
414                            }?;
415                            write!(buf, "{TABLE_DIV}")?;
416    
417                            // Line:
418                            let src_line = src_info.raw_line_span(line)
419                                .and_then(|r| src_info.source().get(r))
420                                .unwrap_or("")
421                                .escape_default();
422                            write!(buf, "{src_line}")?;
423    
424                            writeln!(buf)?;
425                        }
426                    }
427    
428                    writeln!(buf, "====================")?;
429                }
430            }
431
432
433            Ok(buf)
434        }
435
436        _ser(o).unwrap()
437    }
438
439    fn deserialize(string: &Self::Stream) -> Option<ObjectFile> {
440        // Warning: spaghetti.
441
442        let mut block_map  = BTreeMap::new();
443        let mut label_map  = HashMap::<_, SymbolData>::new();
444        let mut rel_map    = HashMap::new();
445        let mut debug_sym  = None::<(Vec<_>, String)>;
446
447        // Read all of the non-empty lines:
448        let mut lines = string.trim().lines()
449            .filter(|l| !l.starts_with('#')) // remove comment lines
450            .filter(|&l| !l.trim().is_empty()); // remove empty lines
451        if lines.next() != Some(TFMT_MAGIC) { return None };
452
453        let mut line_groups = vec![];
454        for line in lines {
455            if line.starts_with('.') {
456                line_groups.push(vec![line]);
457            } else {
458                line_groups.last_mut()?.push(line);
459            }
460        }
461        for group in line_groups {
462            let [header, rest @ ..] = &*group else { return None };
463            match *header {
464                ".TEXT" => {
465                    let mut it = rest.iter();
466                    while let Some(orig_hex) = it.next() {
467                        let orig = hex2u16(orig_hex)?;
468                        let block_len = it.next()?.parse::<u16>().ok()?;
469
470                        // Get and parse block of hex:
471                        let block: Vec<_> = it.by_ref().take(usize::from(block_len))
472                            .copied()
473                            .map(maybe_hex2u16)
474                            .collect::<Option<_>>()?;
475
476                        if block.len() != usize::from(block_len) { return None; }
477                        match block_map.entry(orig) {
478                            std::collections::btree_map::Entry::Vacant(e) => e.insert(block),
479                            std::collections::btree_map::Entry::Occupied(_) => return None,
480                        };
481                    }
482                },
483                ".SYMBOL" => {
484                    let table = parse_table(rest, ["ADDR", "EXT", "LABEL"], |[addr_hex, ext, label], _| {
485                        let addr = hex2u16(addr_hex)?;
486                        let ext = ext.parse::<u8>().ok()? != 0;
487                        Some((addr, ext, label))
488                    }, true)?;
489
490                    for (addr, ext, label) in table {
491                        // TODO: what happens if .SYMBOL label + .DEBUG label mismatch
492                        let entry = label_map.entry(label.to_string()).or_default();
493                        
494                        entry.addr = addr;
495                        entry.external = ext;
496                    }
497                },
498                ".LINKER_INFO" => {
499                    let table = parse_table(rest, ["ADDR", "LABEL"], |[addr_hex, label], _| {
500                        let addr = hex2u16(addr_hex)?;
501                        Some((addr, label.to_string()))
502                    }, true)?;
503
504                    rel_map.extend(table);
505                },
506                ".DEBUG" => if !rest.is_empty() {
507                    let split_pos = rest.iter().position(|l| l.starts_with('='))?;
508                    if !rest.last()?.starts_with('=') { return None; }
509                    let (label_src, [_, line_src @ .., _]) = rest.split_at(split_pos) else { unreachable!("divider should be present") };
510
511                    let label_table = parse_table(label_src, ["LABEL", "INDEX"], |[label, index_str], _| {
512                        let index = index_str.parse().ok()?;
513                        Some((label, index))
514                    }, true)?;
515                    for (label, index) in label_table {
516                        label_map.entry(label.to_string()).or_default().src_start = index;
517                    }
518
519                    let line_table = parse_table(line_src, ["LINE", "ADDR", "SOURCE"], |cols, i| {
520                        let [rest @ .., source_str] = cols;
521                        let [line_str, addr_str] = rest.map(str::trim);
522                        
523                        if line_str.parse::<usize>().ok()? != i { return None; }
524                        let m_addr = maybe_hex2u16(addr_str)?;
525                        Some((m_addr, source_str))
526                    }, false)?;
527
528                    if !line_table.is_empty() {
529                        let (line_map, src) = debug_sym.get_or_insert_with(Default::default);
530                        for (m_addr, line) in line_table {
531                            line_map.push(m_addr);
532                            src.write_str(line).unwrap();
533                        }
534                        *src = unescaper::unescape(src).ok()?;
535                    }
536                },
537                _ => return None
538            }
539        }
540
541        let debug_symbols = match debug_sym {
542            Some((line_map, src)) => Some(DebugSymbols {
543                // propagate error to deser
544                line_map: super::LineSymbolMap::new(line_map)?,
545                src_info: super::SourceInfo::from_string(src),
546            }),
547            None => None,
548        };
549        let sym = (!label_map.is_empty() || debug_symbols.is_some())
550            .then_some(SymbolTable { label_map, debug_symbols, rel_map });
551        Some(ObjectFile {
552            block_map,
553            sym,
554        })
555    }
556}
557
558fn count_digits(n: usize) -> usize {
559    (n.checked_ilog10().unwrap_or(0) + 1) as usize
560}
561fn hex2u16(s: &str) -> Option<u16> {
562    match s.len() == 4 {
563        true => u16::from_str_radix(s, 16).ok(),
564        false => None
565    }
566}
567fn maybe_hex2u16(s: &str) -> Option<Option<u16>> {
568    match s {
569        TFMT_UNINIT => Some(None),
570        s => hex2u16(s).map(Some)
571    }
572}
573
574fn parse_header(line: &str, columns: &[&str]) -> Option<()> {
575    line.splitn(columns.len(), TABLE_DIV)
576        .map(str::trim)
577        .eq(columns.iter().copied())
578        .then_some(())
579}
580fn parse_row<'a, T, const N: usize>(line: &'a str, f: impl FnOnce([&'a str; N]) -> Option<T>) -> Option<T> {
581    let mut segments: Vec<_> = line
582        .splitn(N, TABLE_DIV)
583        .collect();
584    segments.resize(N, "");
585    let segments = *<Box<[_; N]>>::try_from(segments).ok()?;
586    f(segments)
587}
588fn parse_table<'a, T, const N: usize>(
589    contents: &[&'a str],
590    columns: [&str; N], 
591    mut row_parser: impl FnMut([&'a str; N], usize) -> Option<T>,
592    trim: bool
593) -> Option<Vec<T>> {
594    // Accept empty tables:
595    let Some((header, body)) = contents.split_first() else {
596        return Some(vec![])
597    };
598
599    let trimfn = |s| match trim {
600        true  => str::trim(s),
601        false => s,
602    };
603    parse_header(header, &columns)?;
604    body.iter()
605        .enumerate()
606        .map(|(i, l)| parse_row(l, |r| row_parser(r.map(trimfn), i)))
607        .collect()
608}