1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
//! Bytecode
//! ========
//!
//! This module contains functions for encoding and decoding Seax bytecode.
//!
//! Seax Bytecode Format
//! ====================
//!
//! Seax Bytecode Standard Revision 0, June 11th, 2015
//!
//! I: The preamble
//! ---------------
//!
//! All Seax Bytecode files begin with a preamble. This preamble consists of the following:
//!
//! 1. The identifying bytes 0x5ECD. These bytes, chosen based on a poorly-advised attempt to
//!    spell out the abbreviation SECD in hexadecimal, identify the file as a Seax bytecode file.
//! 2. A 16-bit unsigned integer that represents the version of the Seax bytecode format that the
//!    file was encoded with. This number is used to determine how the remainder of the file should
//!    be decoded. This document is Revision 0 of the Seax Bytecode format, so the version
//!    should be 0x0000.
//!
//! Future revisions of this standard will allow additional metadata, such as the author's
//! cryptographic signature, checksums for ensuring the executable's integrity, and directives to
//! the virtual machine, to be placed in the preamble as well.
//!
//! II: Instructions
//! ----------------
//!
//! All Seax VM instructions are encoded using single bytes. The Seax opcodes occupy the
//! space 0x00 to 0x30, with the bytes 0x1E through 0x30 being reserved for future use.
//!
//! The following table shows all of the currently available SVM opcodes.
//!
//! | Value | Name          | Description
//! +-------+---------------+--------------------------------------------------------------------
//!   0x00  | NIL           | Pushes an empty list (nil) onto `$s`.
//!   0x01  | LD (a . b)    | Pushes the variable at `$e[a][b]` onto the stack.
//!   0x02  | LDF f         | Constructs a closure from the list `f` and the current environment,
//!                           and pushes it to `$s`.
//!   0x03  | AP c          | Applies the function closure `c`.
//!   0x04  | APCC c        | Applies the function closure `c` and pushes a continuation on `$d`.
//!   0x05  | JOIN          | Returns control to the calling scope at the end of a `SEL`.
//!   0x06  | RAP c         | Applies the recursive closure `c`.
//!   0x07  | RET           | Returns control from a function to the calling function.
//!   0x08  | DUM           | Pushes a dummy environment to `$e` for applying a recursive function.
//!   0x09  | SEL a         | Applies the first list of instructions on `$c` if `a` is non-nil,
//!                           or the second list if it is nil.
//!   0x0A  | ADD a b       |
//!   0x0B  | SUB a b       |
//!   0x0C  | MUL a b       |
//!   0x0D  | DIV a b       |
//!   0x0E  | MOD a b       |
//!   0x0F  | FDIV a b      |
//!   0x10  | EQ a b        |
//!   0x11  | GT a b        |
//!   0x12  | GTE a b       |
//!   0x13  | LT a b        |
//!   0x14  | LTE a b       |
//!   0x15  | ATOM a        |
//!   0x16  | NULL a        |
//!   0x17  | READC         |
//!   0x18  | WRITEC        |
//!   0x19  | CONS a b      |
//!   0x1A  | CAR (a . b)   |
//!   0x1B  | CDR (a . b)   |
//!   0x1C  | LDC           |
//!   0x1D  | STOP          |
//!   0x1E  | reserved      |
//!         |     ...       |
//!   0x30  | reserved      |
//!
//! III: Constants
//! --------------
//!
//! Constants are identified by a preceding constant-identification byte. Constant-identification
//! bytes comprise the range of bytes between 0xC0 and 0xCF, inclusive, and the NIL byte, 0x00.
//!
//! Constants are expected in two places: following either an LDC (LoaD Constant) instruction,
//! or following an instruction that expects a list on $c, such as a SEL instruction.
//!
//! 1. CONS Cells (0xC0)
//!
//!    0xC0 identifies the beginning of a CONS cell constant. It may then be followed by the
//!    identification byte denoting any other constant, which will be interpreted as the CAR part
//!    of the CONS cell. This is followed by the actual bytecode data for the CAR part of the CONS
//!    cell, which is of the length specified by the type identified by the identification byte.
//!
//!    After the constant for the CAR part, there must be an additional identification byte, which
//!    identifies the type of the CONS cell's CDR part. Unlike the CAR part, this may be any
//!    constant type, including another CONS cell. This identification byte is again followed by
//!    the bytecode for the data stored in the CONS cell's CDR part, whose length is determined
//!    by the type identified by the identification byte.
//!
//!    Alternatively, the CAR or CDR parts of a CONS cell may also contain a Seax instruction. In
//!    such a case, the identification byte is replaced by that instruction's opcode. The opcode
//!    comprises the entirety of the CAR or CDR part, and any further data is interpreted as
//!    appropriate (i.e., if the opcode is the CAR part, another opcode or identifying byte will
//!    be expected, while if the opcode is in the CDR part, a new instruction or constant will
//!    be expected.)
//!
//! 2. Atom constants (0xC1 ... 0xCF)
//!
//!    Any constants that are not CONS cells are atom constants. Atom constants are identified by
//!    bytes in the range between 0xC1 and 0xCF, inclusive. Currently, 0xC1, 0xC2, 0xC3, and 0xC4
//!    identify extant atom types, while 0xC5 ... 0xCE are reserved for future use.
//!
//!    Once an atom constant identifying byte is read, the bytes that follow it will be read as
//!    that type of atom. The number of bytes read depends on the length of the atom type, which is
//!    determined using the identifying bytes. The following identifying bytes correspond to the
//!    following atom types:
//!
//! + 0xC1: uint atom (64-bit unsigned integer)
//! + 0xC2: sint atom (64-bit signed integer)
//! + 0xC3: char atom (32-bit Unicode scalar value)
//! + 0xC4: float atom (64-bit double-precision floating point number)
//!
//!    If additional primitive data types are added to the Seax VM, the bytes 0xC5 to 0xCF will
//!    be used to identify those types.
//!
//!    Note that the type tag identifying a constant may be extracted by byte-masking the
//!    identifying byte with the number 0x0F.
//!

extern crate byteorder;

use self::byteorder::{ByteOrder, BigEndian, ReadBytesExt, WriteBytesExt};

use std::error::Error;
use std::io::Read;
use std::fmt;
use std::char;
use std::mem::transmute;

use super::List;
use super::List::*;
use super::{SVMCell,Atom,Inst};
use super::SVMCell::*;
use super::Atom::*;

#[cfg(test)]
mod tests;

// Appends a copy of every item yielded by `$other` to the vector named
// `$vec`. This variant is compiled when the "unstable" feature is disabled.
#[cfg(not(feature = "unstable"))]
macro_rules! push_all {
    ( $vec:ident, $other:expr ) => {
        $vec.extend($other.into_iter().map(|item| *item));
    }
}
// Variant of `push_all!` compiled when the "unstable" feature is enabled.
// It forwards `$other` to the `push_all` method of `$vec`.
// NOTE(review): presumably this is the formerly-unstable `Vec::push_all` —
// confirm the targeted toolchain still provides it.
#[cfg(feature = "unstable")]
macro_rules! push_all {
    ( $vec:ident, $other:expr ) => { $vec.push_all($other); }
}

/// Identifying bytes for a Seax bytecode file
///
/// `Decoder::check_ident_bytes()` compares the 16-bit big-endian value it
/// reads against this constant.
#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since = "0.1.0") )]
pub const IDENT_BYTES: u16 = 0x5ECD;
/// Identifying bytes for this version of the Seax bytecode standard.
///
/// `Decoder::check_version()` compares the 16-bit big-endian value it reads
/// against this constant.
#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since="0.3.0") )]
pub const VERSION: u16     = 0x0000;

/// First byte of the block reserved for future opcodes. Bytes below this
/// value are decoded as instructions by `decode_inst()`.
const RESERVED_START: u8  = 0x1E;
/// Length of the reserved opcode block; `decode_inst()` reports every byte
/// up to and including `RESERVED_START + RESERVED_LEN` (0x30) as reserved.
const RESERVED_LEN: u8    = 0x12;
/// First byte of the block reserved for atom type tags.
const CONST_START: u8     = 0xC1;
/// Length of the type tag block; `next_cell()` treats bytes in the
/// half-open range `CONST_START .. CONST_START + CONST_LEN` as constants.
const CONST_LEN: u8       = 0x0E;
/// Byte that introduces a CONS cell.
const BYTE_CONS: u8       = 0xC0;
/// Byte for NIL; `decode_cons()` also accepts it as the terminator of a list.
const BYTE_NIL: u8        = 0x00;

/// Decode a whole program
///
/// Decodes a whole program, including the identifying and version bytes.
///
/// The identifying bytes must match, otherwise decoding stops with an error.
/// A failed version check is only logged with `warn!` and decoding continues.
///
/// # Errors
///
/// Returns `Err` with a description when the identifying bytes cannot be
/// read or do not match, or when any cell in the body fails to decode.
#[cfg_attr(feature = "unstable",
    unstable(feature = "decode", issue = "94") )]
pub fn decode_program<R>(source: &mut R) -> Result<List<SVMCell>, String>
where R: Read {
    let mut decoder = Decoder::new(source);

    if let Err(why) = decoder.check_ident_bytes() {
        return Err(why);
    }

    // a failed version check is reported but is not fatal
    if let Err(why) = decoder.check_version() {
        warn!("{}", why);
    }

    // Pull cells one at a time through `next_cell()` so that a decoding
    // error is handed back to the caller instead of panicking.
    let mut cells = Vec::new();
    loop {
        match decoder.next_cell() {
            Ok(Some(cell)) => cells.push(cell),
            Ok(None)       => break, // end of input
            Err(why)       => return Err(why)
        }
    }
    Ok(cells.into_iter().collect::<List<SVMCell>>())
}

/// Decoder for Seax bytecode
///
/// Wraps a mutable borrow of a byte source and keeps a running count of the
/// bytes consumed from it. Create one with `Decoder::new()`.
#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since="0.1.0"))]
pub struct Decoder<'a, R: 'a> {
    // the source the bytecode is read from
    source: &'a mut R,
    // byte counter, starts at 0 and is advanced by the decoding methods
    num_read: usize
}

/// Decode a Seax instruction from a byte
///
/// Bytes below `RESERVED_START` are turned into an `Inst`. Bytes in the
/// reserved block and bytes above it produce an error message instead.
#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since="0.1.0"))]
fn decode_inst(byte: &u8) -> Result<Inst, String> {
    let opcode = *byte;
    let last_reserved = RESERVED_START + RESERVED_LEN;

    if opcode < RESERVED_START {
        // SAFETY: assumes `Inst` is a one-byte, field-less enum with a
        // variant for every value below `RESERVED_START` — TODO confirm
        // against the definition of `Inst`.
        Ok(unsafe { transmute(opcode) })
    } else if opcode <= last_reserved {
        Err(format!("Unimplemented: reserved byte {:#X}", opcode))
    } else {
        Err(String::from("byte too high"))
    }
}


#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since="0.1.0"))]
impl<'a, R> Decoder<'a, R>
where R: Read {
    /// Check the identifying bytes
    ///
    /// The two-byte sequence `0x5ECD` is expected at the beginning of all
    /// Seax bytecode files. Those bytes identify that file as a Seax bytecode
    /// file. If the bytes don't match, an error is returned, as the file is
    /// invalid.
    ///
    /// Consumes two bytes.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0"))]
    pub fn check_ident_bytes(&mut self) -> Result<(), String> {
        self.source
            .read_u16::<BigEndian>()
            .map_err(|why| String::from(why.description()))
            .and_then(|ident| {
                self.num_read += 2;
                match ident {
                    IDENT_BYTES => Ok(()),
                    other_bytes => Err(
                        format!("invalid identifying bytes {:#06x}", other_bytes)
                    )
                }
            })
    }

    /// Checks the version bytes
    ///
    /// Checks the version bytes of a Seax bytecode file against the VERSION
    /// of the bytecode standard used by this code. If the versions are not
    /// the same, an error is returned.
    ///
    /// Consumes two bytes.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    pub fn check_version(&mut self) -> Result<(), String> {
        self.source
            .read_u16::<BigEndian>()
            .map_err(|why| String::from(why.description()))
            .and_then(|version| {
                self.num_read += 2;
                match version {
                    VERSION => Ok(()),
                    bytes   => Err( // I expect this will generate a warning
                                    // at the call site...
                        format!("mismatched version {}, expected {}",
                            bytes, version)
                    )
                }
            })
    }

    /// Creates a new decoder from a type implementing `std::io::Read`
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    pub fn new(src: &'a mut R) -> Decoder<'a, R> {
        Decoder {
            source: src,
            num_read: 0
        }
    }

    /// Returns the number of bytes read by the decoder
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    pub fn num_read(&self) -> usize {
        self.num_read
    }

    /// Decode a constant
    ///
    /// This method is passed a constant-identifying byte. It extracts
    /// the type tag from that byte, and then consumes the appropriate
    /// number of bytes from the reader and decodes them to the
    /// expected constant.
    ///
    /// Consums a varying number of bytes depending on the constant being
    /// decoded.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    fn decode_const(&mut self, byte: &u8) -> Result<Atom, String> {
        match *byte & 0x0F { // extract the type tag
            1 => {
                self.num_read += 8; // this should be more FP i guess
                self.source
                    .read_u64::<BigEndian>()
                    .map(Atom::UInt)
                    .map_err(|why| String::from(why.description()))
                },
            2 => {
                self.num_read += 8;
                self.source
                    .read_i64::<BigEndian>()
                    .map(Atom::SInt)
                    .map_err(|why| String::from(why.description()))
                },
            3 => {
                self.num_read += 4;
                self.source
                    .read_u32::<BigEndian>()
                    .map_err( |why | String::from(why.description()))
                    .and_then(|byte|
                        char::from_u32(byte)
                            .ok_or(String::from("Could not read character."))
                        )
                    .map(Atom::Char)
                },
            4 => {
                self.num_read += 8;
                self.source
                    .read_f64::<BigEndian>()
                    .map(Atom::Float)
                    .map_err(|why| String::from(why.description()))
                },
            _ => unimplemented!()
        }
    }
    // Decodes a CONS cell
    //
    // Consumes a varying number of bytes depending on the length of the list.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    fn decode_cons(&mut self) -> Result<Option<Box<List<SVMCell>>>, String> {
        self.next_cell()
            .and_then(|car|
                car.ok_or(String::from("EOF while decoding CONS cell"))
            )
            .map(|car| {
                debug!("Decoded {:?}, {} bytes read", car, self.num_read);
                car
            })
            .and_then(|car| {
                let mut buf = [0;1];
                try!(self.source.read(&mut buf) // try to get next byte
                         .map_err(|why| String::from(why.description())));
                self.num_read += 1;
                match buf[0] {
                    BYTE_CONS =>
                        self.decode_cons()
                            .and_then(|cdr| cdr.ok_or(
                                String::from("EOF while decoding CONS")) )
                            .map( |cdr| (car, cdr) ),
                    BYTE_NIL  => Ok((car, Box::new(Nil))),
                    b         => Err(
                        format!("Unexpected byte {:#02x} while decoding CONS", b)
                    )
                }
            })
            .map(|(car, cdr)| Some(Box::new( Cons(car, cdr)) ))
    }

    /// Decodes the next cell in the source
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    pub fn next_cell(&mut self) -> Result<Option<SVMCell>,String> {
        let mut buf = [0;1];
        match self.source.read(&mut buf) {
            Ok(1)   => { // a byte was read
                self.num_read += 1;
                debug!("Read {:#X}, {} bytes read", buf[0], self.num_read);
                match buf[0] {
                    b if b < 0x30 => decode_inst(&b)
                                        .map(SVMCell::InstCell)
                                        .map(Some),
                    b if b >= CONST_START &&
                         b < (CONST_START + CONST_LEN) =>
                                    self.decode_const(&b)
                                        .map(SVMCell::AtomCell)
                                        .map(Some),
                    BYTE_CONS    => self.decode_cons()
                                        .map(|cell|
                                              cell.map(SVMCell::ListCell)
                                        ),
                    b            => Err(format!("Unsupported byte {:#02x}", b))
                }
            },
            Ok(0)    => Ok(None), //  we're out of bytes - EOF
            Ok(_)    => unreachable!(), //
            Err(why) => Err(String::from(why.description()))
        }
    }

}

#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since="0.1.0") )]
impl<'a, R> Iterator for Decoder<'a, R>
where R: Read {
    /// Every item is one decoded cell.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    type Item = SVMCell;

    /// Yields the result of `next_cell()`: the next cell, or `None` once
    /// the source has no more bytes.
    ///
    /// # Panics
    ///
    /// Panics if `next_cell()` returns an error, because the result is
    /// unwrapped.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    fn next(&mut self) -> Option<SVMCell> {
        let decoded = self.next_cell();
        decoded.unwrap()
    }
}
#[cfg_attr(feature = "unstable",
    stable(feature = "decode", since="0.1.0") )]
impl<'a, R> fmt::Debug for Decoder<'a, R>
where R: fmt::Debug {
    /// Writes the debug form of the source followed by the byte counter.
    #[cfg_attr(feature = "unstable",
        stable(feature = "decode", since="0.1.0") )]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_fmt(format_args!(
            "Decoding from: {:?}, {} bytes read",
            self.source,
            self.num_read
        ))
    }

}

/// Trait for an object that can be encoded to Seax bytecode.
///
/// All types that can be encoded must implement this. This module
/// implements it for `SVMCell`, `Atom`, and `List<T>` where `T: Encode`.
#[cfg_attr(feature = "unstable",
    stable(feature = "encode", since="0.1.0") )]
pub trait Encode {
    /// Encodes this object to a list of bytes.
    ///
    /// The encoded form is returned as a new `Vec<u8>`.
    #[cfg_attr(feature = "unstable",
        stable(feature = "encode", since="0.1.0") )]
    fn emit(&self) -> Vec<u8>;
}
#[cfg_attr(feature = "unstable",
    stable(feature = "encode", since="0.1.0") )]
impl Encode for SVMCell {
    /// Encodes the cell: an instruction becomes its single opcode byte,
    /// atoms and lists are encoded by their own `emit()`.
    #[cfg_attr(feature = "unstable",
        stable(feature = "encode", since="0.1.0") )]
    fn emit(&self) -> Vec<u8> {
        match *self {
            InstCell(opcode)       => vec![opcode as u8],
            AtomCell(ref constant) => constant.emit(),
            ListCell(ref cells)    => cells.emit()
        }
    }
}

#[cfg_attr(feature = "unstable",
    stable(feature = "encode", since="0.1.0") )]
impl Encode for Atom {
    /// Encodes the atom as its type tag byte (`0xC1` to `0xC4`) followed by
    /// the value in big-endian byte order.
    #[cfg_attr(feature = "unstable",
        stable(feature = "encode", since="0.1.0") )]
    fn emit(&self) -> Vec<u8> {
        // one tag byte plus at most eight bytes of payload
        let mut bytes: Vec<u8> = Vec::with_capacity(9);
        match *self {
            UInt(number) => {
                bytes.push(0xC1);
                bytes.write_u64::<BigEndian>(number).unwrap();
            },
            SInt(number) => {
                bytes.push(0xC2);
                bytes.write_i64::<BigEndian>(number).unwrap();
            },
            Char(character) => {
                bytes.push(0xC3);
                bytes.write_u32::<BigEndian>(character as u32).unwrap();
            },
            Float(number) => {
                bytes.push(0xC4);
                bytes.write_f64::<BigEndian>(number).unwrap();
            }
        }
        bytes
    }
}

// #[cfg_attr(feature = "unstable",
//     stable(feature = "encode", since="0.1.0") )]
// impl Encode for Inst {
//     #[cfg_attr(feature = "unstable",
//         stable(feature = "encode", since="0.1.0") )]
//     fn emit(&self) -> Vec<u8> {
//         match *self {
//             NIL     => vec![BYTE_NIL],
//             LD      => vec![0x01],
//             LDF     => vec![0x02],
//             AP      => vec![0x03],
//             APCC    => vec![0x04],
//             JOIN    => vec![0x05],
//             RAP     => vec![0x06],
//             RET     => vec![0x07],
//             DUM     => vec![0x08],
//             SEL     => vec![0x09],
//             ADD     => vec![0x0A],
//             SUB     => vec![0x0B],
//             MUL     => vec![0x0C],
//             DIV     => vec![0x0D],
//             MOD     => vec![0x0E],
//             FDIV    => vec![0x0F],
//             EQ      => vec![0x10],
//             GT      => vec![0x11],
//             GTE     => vec![0x12],
//             LT      => vec![0x13],
//             LTE     => vec![0x14],
//             ATOM    => vec![0x15],
//             NULL    => vec![0x16],
//             READC   => vec![0x17],
//             WRITEC  => vec![0x18],
//             CONS    => vec![0x19],
//             CAR     => vec![0x1A],
//             CDR     => vec![0x1B],
//             LDC     => vec![0x1C],
//             STOP    => vec![0x1D]
//         }
//     }
// }

#[cfg_attr(feature = "unstable",
    stable(feature = "encode", since="0.1.0") )]
impl<T> Encode for List<T>
where T: Encode {
    /// Encodes the list: every element is written as `BYTE_CONS` followed
    /// by the element's own encoding, and the end of the list is written as
    /// `BYTE_NIL`.
    ///
    /// The list is walked with a loop and all bytes go into one buffer, so
    /// the bytes of the tail are not copied again for every element and the
    /// call depth does not grow with the length of the list.
    #[cfg_attr(feature = "unstable",
        stable(feature = "encode", since="0.1.0") )]
    fn emit(&self) -> Vec<u8> {
        let mut result = Vec::new();
        let mut node = self;
        loop {
            match *node {
                Cons(ref it, ref tail) => {
                    result.push(BYTE_CONS);
                    push_all!(result, &it.emit());
                    node = &**tail; // continue with the rest of the list
                },
                Nil => {
                    result.push(BYTE_NIL);
                    return result;
                }
            }
        }
    }
}