esexpr_binary/
lib.rs

1
2use num_bigint::{BigInt, BigUint, Sign};
3
4use derive_more::From;
5
6use std::{collections::HashMap, io::{Read, Write}};
7
8use esexpr::{ESExpr, ESExprCodec};
9
10#[derive(From, Debug)]
11pub enum ParseError {
12    #[from(ignore)]
13    InvalidTokenByte(u8),
14
15    #[from(ignore)]
16    InvalidStringTableIndex,
17
18    #[from(ignore)]
19    InvalidLength,
20
21    #[from(ignore)]
22    UnexpectedKeywordToken,
23
24    #[from(ignore)]
25    UnexpectedConstructorEnd,
26
27    #[from(ignore)]
28    UnexpectedEndOfFile,
29
30    #[from(ignore)]
31    InvalidStringPool(esexpr::DecodeError),
32
33    IOError(std::io::Error),
34    Utf8Error(std::str::Utf8Error),
35
36}
37
38enum VarIntTag {
39    ConstructorStart,
40    NonNegIntValue,
41    NegIntValue,
42    StringLengthValue,
43    StringPoolValue,
44    BytesLengthValue,
45    KeywordArgument,
46}
47
48enum ExprToken {
49    ConstructorStart(usize),
50    ConstructorStartKnown(&'static str),
51    ConstructorEnd,
52    Keyword(usize),
53    IntValue(BigInt),
54    StringValue(String),
55    StringPoolValue(usize),
56    BinaryValue(Vec<u8>),
57    Float32Value(f32),
58    Float64Value(f64),
59    BooleanValue(bool),
60    NullValue(BigUint),
61    AppendStringTable,
62}
63
64
65const TAG_VARINT_MASK: u8 = 0xE0;
66const TAG_VARINT_CONSTRUCTOR_START: u8 = 0x00;
67const TAG_VARINT_NON_NEG_INT: u8 = 0x20;
68const TAG_VARINT_NEG_INT: u8 = 0x40;
69const TAG_VARINT_STRING_LENGTH: u8 = 0x60;
70const TAG_VARINT_STRING_POOL: u8 = 0x80;
71const TAG_VARINT_BYTES_LENGTH: u8 = 0xA0;
72const TAG_VARINT_KEYWORD: u8 = 0xC0;
73
74const TAG_CONSTRUCTOR_END: u8 = 0xE0;
75const TAG_TRUE: u8 = 0xE1;
76const TAG_FALSE: u8 = 0xE2;
77const TAG_NULL0: u8 = 0xE3;
78const TAG_FLOAT32: u8 = 0xE4;
79const TAG_FLOAT64: u8 = 0xE5;
80const TAG_CONSTRUCTOR_START_STRING_TABLE: u8 = 0xE6;
81const TAG_CONSTRUCTOR_START_LIST: u8 = 0xE7;
82const TAG_NULL1: u8 = 0xE8;
83const TAG_NULL2: u8 = 0xE9;
84const TAG_NULLN: u8 = 0xEA;
85const TAG_APPEND_STRING_TABLE: u8 = 0xEB;
86
87
88enum ExprPlus {
89    Expr(ESExpr),
90    Keyword(usize),
91    ConstructorEnd,
92    AppendedToStringTable,
93    EndOfFile,
94}
95
96
97struct TokenReader<R> {
98    read: R,
99}
100
101impl <R: Read> Iterator for TokenReader<R> {
102    type Item = Result<ExprToken, ParseError>;
103
104    fn next(&mut self) -> Option<Self::Item> {
105        read_token_impl(self).transpose()
106    }
107}
108
109fn read_token_impl<R: Read>(reader: &mut TokenReader<R>) -> Result<Option<ExprToken>, ParseError> {
110    let mut b: [u8; 1] = [0];
111
112    if reader.read.read(&mut b)? == 0 {
113        return Ok(None);
114    }
115
116    let b: u8 = b[0];
117
118    Ok(Some(
119        if (b & TAG_VARINT_MASK) == TAG_VARINT_MASK {
120            match b {
121                TAG_CONSTRUCTOR_END => ExprToken::ConstructorEnd,
122                TAG_TRUE => ExprToken::BooleanValue(true),
123                TAG_FALSE => ExprToken::BooleanValue(false),
124                TAG_NULL0 => ExprToken::NullValue(BigUint::ZERO),
125                TAG_NULL1 => ExprToken::NullValue(BigUint::from(1u32)),
126                TAG_NULL2 => ExprToken::NullValue(BigUint::from(2u32)),
127                TAG_NULLN => {
128                    let n = read_int_full(reader)?;
129                    ExprToken::NullValue(n + 3u32)
130                },
131                TAG_FLOAT32 => {
132                    let buffer: [u8; 4] = read_bytes(reader)?;
133                    ExprToken::Float32Value(f32::from_le_bytes(buffer))
134                },
135                TAG_FLOAT64 => {
136                    let buffer: [u8; 8] = read_bytes(reader)?;
137                    ExprToken::Float64Value(f64::from_le_bytes(buffer))
138                },
139                TAG_CONSTRUCTOR_START_STRING_TABLE => ExprToken::ConstructorStartKnown("string-table"),
140                TAG_CONSTRUCTOR_START_LIST => ExprToken::ConstructorStartKnown("list"),
141                TAG_APPEND_STRING_TABLE => ExprToken::AppendStringTable,
142                _ => {
143                    return Err(ParseError::InvalidTokenByte(b));
144                },
145            }
146        }
147        else {
148            let tag = match b & TAG_VARINT_MASK {
149                TAG_VARINT_CONSTRUCTOR_START => VarIntTag::ConstructorStart,
150                TAG_VARINT_NON_NEG_INT => VarIntTag::NonNegIntValue,
151                TAG_VARINT_NEG_INT => VarIntTag::NegIntValue,
152                TAG_VARINT_STRING_LENGTH => VarIntTag::StringLengthValue,
153                TAG_VARINT_STRING_POOL => VarIntTag::StringPoolValue,
154                TAG_VARINT_BYTES_LENGTH => VarIntTag::BytesLengthValue,
155                TAG_VARINT_KEYWORD => VarIntTag::KeywordArgument,
156                _ => panic!("Should not be reachable"),
157            };
158
159            let mut n = read_int(reader, b)?;
160
161            match tag {
162                VarIntTag::ConstructorStart => ExprToken::ConstructorStart(get_string_table_index(n)?),
163                VarIntTag::NonNegIntValue => ExprToken::IntValue(BigInt::from_biguint(Sign::Plus, n)),
164                VarIntTag::NegIntValue => {
165                    n += 1u32;
166                    ExprToken::IntValue(BigInt::from_biguint(Sign::Minus, n))
167                },
168                VarIntTag::StringLengthValue => {
169                    let len = get_length(n)?;
170                    let mut buff = vec![0u8; len];
171                    reader.read.read_exact(&mut buff)?;
172                    ExprToken::StringValue(std::str::from_utf8(&buff)?.to_owned())
173                },
174                VarIntTag::StringPoolValue => ExprToken::StringPoolValue(get_string_table_index(n)?),
175                VarIntTag::BytesLengthValue => {
176                    let len = get_length(n)?;
177                    let mut buff = vec![0u8; len];
178                    reader.read.read_exact(&mut buff)?;
179                    ExprToken::BinaryValue(buff)
180                },
181                VarIntTag::KeywordArgument => ExprToken::Keyword(get_string_table_index(n)?),
182            }
183        }
184    ))
185
186}
187
188fn read_int<R: Read>(reader: &mut TokenReader<R>, initial: u8) -> Result<BigUint, ParseError> {
189    let current = initial & 0x0F;
190    let bit_offset = 4;
191    let has_next = (initial & 0x10) == 0x10;
192
193    read_int_rest(reader, current, bit_offset, has_next)
194}
195
196fn read_int_full<R: Read>(reader: &mut TokenReader<R>) -> Result<BigUint, ParseError> {
197    let current = 0;
198    let bit_offset = 0;
199    let has_next = true;
200
201    read_int_rest(reader, current, bit_offset, has_next)
202}
203
204fn read_int_rest<R: Read>(reader: &mut TokenReader<R>, mut current: u8, mut bit_offset: i32, mut has_next: bool) -> Result<BigUint, ParseError> {
205    let mut buffer = Vec::new();
206
207    while has_next {
208        let b = read_byte(reader)?;
209
210        has_next = (b & 0x80) == 0x80;
211
212        let value = b & 0x7F;
213        let low = value << bit_offset;
214        let high = if bit_offset > 1 { value >> (8 - bit_offset) } else { 0 };
215        
216
217
218        current |= low;
219        bit_offset += 7;
220        if bit_offset >= 8 {
221            bit_offset -= 8;
222            buffer.push(current);
223            current = high;
224        }
225    }
226
227    if bit_offset > 0 {
228        buffer.push(current);
229    }
230    
231    Ok(BigUint::from_bytes_le(&buffer))
232}
233
234
235
236fn read_bytes<R: Read, const N: usize>(reader: &mut TokenReader<R>) -> Result<[u8; N], std::io::Error> {
237    let mut b: [u8; N] = [0; N];
238    reader.read.read_exact(&mut b)?;
239    Ok(b)
240}
241
242fn read_byte<R: Read>(reader: &mut TokenReader<R>) -> Result<u8, std::io::Error> {
243    Ok(read_bytes::<R, 1>(reader)?[0])
244}
245
246fn get_string_table_index(i: BigUint) -> Result<usize, ParseError> {
247    i.try_into().map_err(|_| ParseError::InvalidStringTableIndex)
248}
249
250fn get_length(i: BigUint) -> Result<usize, ParseError> {
251    i.try_into().map_err(|_| ParseError::InvalidLength)
252}
253
254pub trait ExprParser {
255    fn try_read_next_expr(&mut self) -> Result<Option<ESExpr>, ParseError>;
256    fn read_next_expr(&mut self) -> Result<ESExpr, ParseError>;
257}
258
259struct ExprParserImpl<I> {
260    string_pool: Vec<String>,
261    iter: I,
262}
263
264impl <I: Iterator<Item=Result<ExprToken, ParseError>>> ExprParser for ExprParserImpl<I> {
265    fn try_read_next_expr(&mut self) -> Result<Option<ESExpr>, ParseError> {
266        loop {
267            return match self.read_expr_plus()? {
268                ExprPlus::Expr(expr) => Ok(Some(expr)),
269                ExprPlus::Keyword(_) => Err(ParseError::UnexpectedKeywordToken),
270                ExprPlus::ConstructorEnd => Err(ParseError::UnexpectedConstructorEnd),
271                ExprPlus::AppendedToStringTable => continue,
272                ExprPlus::EndOfFile => Ok(None),
273            }
274        }
275    }
276
277    fn read_next_expr(&mut self) -> Result<ESExpr, ParseError> {
278        self.try_read_next_expr()?.ok_or(ParseError::UnexpectedEndOfFile)
279    }
280}
281
282impl <I: Iterator<Item=Result<ExprToken, ParseError>>> ExprParserImpl<I> {
283    fn read_expr_plus(&mut self) -> Result<ExprPlus, ParseError> {
284        let Some(token) = self.iter.next().transpose()? else {
285            return Ok(ExprPlus::EndOfFile)
286        };
287
288        Ok(ExprPlus::Expr(match token {
289            ExprToken::ConstructorStart(index) => {
290                let name = self.get_string(index)?;
291                self.read_expr_constructor(name)?
292            },
293            ExprToken::ConstructorStartKnown(name) => {
294                self.read_expr_constructor(name.to_owned())?
295            }
296            ExprToken::ConstructorEnd => return Ok(ExprPlus::ConstructorEnd),
297            ExprToken::Keyword(index) => return Ok(ExprPlus::Keyword(index)),
298            ExprToken::IntValue(i) => ESExpr::Int(i),
299            ExprToken::StringValue(s) => ESExpr::Str(s),
300            ExprToken::StringPoolValue(index) => ESExpr::Str(self.get_string(index)?),
301            ExprToken::BinaryValue(b) => ESExpr::Binary(b),
302            ExprToken::Float32Value(f) => ESExpr::Float32(f),
303            ExprToken::Float64Value(d) => ESExpr::Float64(d),
304            ExprToken::BooleanValue(b) => ESExpr::Bool(b),
305            ExprToken::NullValue(level) => ESExpr::Null(level),
306            ExprToken::AppendStringTable => {
307                let new_string_table = self.read_next_expr()?;
308                let new_string_table = AppendedStringPool::decode_esexpr(new_string_table)
309                    .map_err(ParseError::InvalidStringPool)?;
310
311                match new_string_table {
312                    AppendedStringPool::Fixed(mut fixed_string_pool) =>
313                        self.string_pool.append(&mut fixed_string_pool.strings),
314
315                    AppendedStringPool::Single(s) =>
316                        self.string_pool.push(s),
317                }
318
319                return Ok(ExprPlus::AppendedToStringTable)
320            }
321        }))
322    }
323
324    fn read_expr_constructor(&mut self, name: String) -> Result<ESExpr, ParseError> {
325        let mut args = Vec::new();
326        let mut kwargs = HashMap::new();
327
328        loop {
329            match self.read_expr_plus()? {
330                ExprPlus::Expr(expr) => args.push(expr),
331                ExprPlus::Keyword(index) => {
332                    let kw = self.get_string(index)?;
333                    let value = self.read_next_expr()?;
334                    kwargs.insert(kw, value);
335                },
336                ExprPlus::ConstructorEnd => break,
337                ExprPlus::AppendedToStringTable => continue,
338                ExprPlus::EndOfFile => return Err(ParseError::UnexpectedEndOfFile),
339            }
340        }
341
342        Ok(ESExpr::Constructor { name, args, kwargs })
343    }
344
345    fn get_string(&self, i: usize) -> Result<String, ParseError> {
346        self.string_pool.get(i)
347            .map(|s| s.as_str().to_owned())
348            .ok_or(ParseError::InvalidStringTableIndex)
349    }
350}
351
352impl <I: Iterator<Item=Result<ExprToken, ParseError>>> Iterator for ExprParserImpl<I> {
353    type Item = Result<ESExpr, ParseError>;
354
355    fn next(&mut self) -> Option<Self::Item> {
356        self.try_read_next_expr().transpose()
357    }
358}
359
360pub fn parse_existing_string_pool<'a, F: Read + 'a>(f: F, string_pool: Vec<String>) -> impl ExprParser + Iterator<Item=Result<ESExpr, ParseError>> + 'a {
361    ExprParserImpl {
362        iter: TokenReader { read: f },
363        string_pool,
364    }
365}
366
367pub fn parse<'a, F: Read + 'a>(f: F) -> impl ExprParser + Iterator<Item=Result<ESExpr, ParseError>> + 'a {
368    parse_existing_string_pool(f, Vec::new())
369}
370
371
372#[derive(From, Debug)]
373pub enum GeneratorError {
374    #[from(ignore)]
375    StringNotInStringPool,
376
377    IOError(std::io::Error),
378}
379
380
381pub trait StringPool {
382    fn lookup(&mut self, s: &str) -> Option<usize>;
383}
384
385
386pub struct ExprGenerator<'a, W> {
387    out: &'a mut W,
388    string_pool: Vec<String>,
389}
390
391impl <'a, W: Write> ExprGenerator<'a, W> {
392    pub fn new(out: &'a mut W) -> Self {
393        ExprGenerator {
394            out,
395            string_pool: Vec::new(),
396        }
397    }
398
399    pub fn new_with_string_pool(out: &'a mut W, string_pool: Vec<String>) -> Self {
400        ExprGenerator {
401            out,
402            string_pool,
403        }
404    }
405
406    pub fn generate(&mut self, expr: &ESExpr) -> Result<(), GeneratorError> {
407        let old_string_pool_end = self.string_pool.len();
408
409        let mut generator = ExprGenerator {
410            out: &mut std::io::sink(),
411            string_pool: Vec::new(),
412        };
413
414        std::mem::swap(&mut self.string_pool, &mut generator.string_pool);
415        // Dummy generator to catch new strings
416        generator.generate_expr(expr)?;
417
418        std::mem::swap(&mut self.string_pool, &mut generator.string_pool);
419    
420        match &self.string_pool[old_string_pool_end..] {
421            [] => {},
422            [ s ] => {
423                let s = s.to_owned();
424                self.write(TAG_APPEND_STRING_TABLE)?;
425                self.generate_expr(&ESExpr::Str(s.to_owned()))?;
426            },
427            new_strings => {
428                let sp_expr = FixedStringPool {
429                    strings: new_strings.to_vec(),
430                }.encode_esexpr();
431
432                self.write(TAG_APPEND_STRING_TABLE)?;
433                self.generate_expr(&sp_expr)?;
434            }
435        }
436        
437        self.generate_expr(expr)?;
438        Ok(())
439    }
440
441    fn generate_expr(&mut self, expr: &ESExpr) -> Result<(), GeneratorError> {
442        match expr {
443            ESExpr::Constructor { name, args, kwargs } => {
444                match name.as_str() {
445                    "string-table" => self.write(TAG_CONSTRUCTOR_START_STRING_TABLE)?,
446                    "list" => self.write(TAG_CONSTRUCTOR_START_LIST)?,
447                    _ => {
448                        let index = self.get_string_pool_index(&name)?;
449                        self.write_int_tag(TAG_VARINT_CONSTRUCTOR_START, &BigUint::from(index))?;
450                    }
451                }
452
453                for arg in args {
454                    self.generate_expr(arg)?;
455                }
456
457                for (kw, value) in kwargs {
458                    let index = self.get_string_pool_index(&kw)?;
459                    self.write_int_tag(TAG_VARINT_KEYWORD, &BigUint::from(index))?;
460                    self.generate_expr(value)?;
461                }
462
463                self.write(TAG_CONSTRUCTOR_END)?;
464            },
465            ESExpr::Bool(true) => {
466                self.write(TAG_TRUE)?;
467            },
468            ESExpr::Bool(false) => {
469                self.write(TAG_FALSE)?;
470            },
471            ESExpr::Int(i) => {
472                let (sign, mut magnitude) = i.clone().into_parts();
473
474                match sign {
475                    Sign::NoSign | Sign::Plus => {
476                        self.write_int_tag(TAG_VARINT_NON_NEG_INT, &magnitude)?;
477                    },
478
479                    Sign::Minus => {
480                        magnitude -= 1usize;
481                        self.write_int_tag(TAG_VARINT_NEG_INT, &magnitude)?;
482                    },
483                }
484            },
485            ESExpr::Str(s) => {
486                self.write_int_tag(TAG_VARINT_STRING_LENGTH, &BigUint::from(s.len()))?;
487                self.out.write_all(&s.as_bytes())?;
488            },
489            ESExpr::Binary(b) => {
490                self.write_int_tag(TAG_VARINT_BYTES_LENGTH, &BigUint::from(b.len()))?;
491                self.out.write_all(&b)?;
492            },
493            ESExpr::Float32(f) => {
494                self.write(TAG_FLOAT32)?;
495                self.out.write_all(&f32::to_le_bytes(*f))?;
496            },
497            ESExpr::Float64(d) => {
498                self.write(TAG_FLOAT64)?;
499                self.out.write_all(&f64::to_le_bytes(*d))?;
500            },
501            ESExpr::Null(level) => {
502                if *level == BigUint::ZERO {
503                    self.write(TAG_NULL0)?;
504                }
505                else if *level == BigUint::from(1u32) {
506                    self.write(TAG_NULL1)?;
507                }
508                else if *level == BigUint::from(2u32) {
509                    self.write(TAG_NULL2)?;
510                }
511                else {
512                    self.write(TAG_NULLN)?;
513                    self.write_int_full(&(level - 3u32))?;
514                }
515            },
516        }
517
518        Ok(())
519    }
520
521    fn get_string_pool_index(&mut self, s: &str) -> Result<usize, GeneratorError> {
522        if let Some(index) = self.string_pool.iter().position(|s2| s2 == s) {
523            return Ok(index);
524        }
525
526        let index = self.string_pool.len();
527        self.string_pool.push(s.to_owned());
528
529        self.write(TAG_APPEND_STRING_TABLE)?;
530        self.generate_expr(&ESExpr::Str(s.to_owned()))?;
531
532        Ok(index)
533    }
534
535    fn write_int_tag(&mut self, tag: u8, i: &BigUint) -> Result<(), GeneratorError> {
536        let buff = i.to_bytes_le();
537
538        let b0 = *buff.get(0).unwrap_or(&0);
539        let mut current = tag | (b0 & 0x0F);
540        if buff.len() < 2 && (b0 & 0xF0) == 0 {
541            self.write(current)?;
542            return Ok(());
543        }
544
545        current |= 0x10;
546        self.write(current)?;
547
548        current = b0 >> 4;
549        let bit_index = 4;
550
551        self.write_int_rest(&buff[1..], current, bit_index)
552    }
553
554    fn write_int_full(&mut self, i: &BigUint) -> Result<(), GeneratorError> {
555        if *i == BigUint::ZERO {
556            self.write(0)?;
557            return Ok(());
558        }
559
560        let buff = i.to_bytes_le();
561        let current = 0;
562        let bit_index = 0;
563
564        self.write_int_rest(&buff, current, bit_index)
565    }
566
567    fn write_int_rest(&mut self, buff: &[u8], mut current: u8, mut bit_index: i32) -> Result<(), GeneratorError> {
568        for (i, b) in buff.iter().copied().enumerate() {            
569            let mut bit_index2 = 0;
570            while bit_index2 < 8 {
571                let written_bits = std::cmp::min(7 - bit_index, 8 - bit_index2);
572                current |= ((b >> bit_index2) & 0x7F) << bit_index;
573
574                
575                bit_index += written_bits;
576                bit_index2 += written_bits;
577                if bit_index >= 7 {
578                    if i < buff.len() - 1 || (bit_index2 < 8 && (b >> bit_index2) != 0) {
579                        current |= 0x80;
580                    }
581
582                    self.write(current)?;
583                    bit_index = 0;
584                    current = 0;
585                }
586            }
587        }
588
589        if current != 0 {
590            self.write(current)?;
591        }
592
593        Ok(())
594    }
595
596    fn write(&mut self, b: u8) -> Result<(), GeneratorError> {
597        Ok(self.out.write_all(std::slice::from_ref(&b))?)
598    }
599}
600
601
602#[derive(ESExprCodec, Debug, PartialEq, Clone)]
603#[constructor = "string-table"]
604pub struct FixedStringPool {
605    #[vararg]
606    pub strings: Vec<String>,
607}
608
609impl StringPool for FixedStringPool {
610    fn lookup(&mut self, s: &str) -> Option<usize> {
611        self.strings.iter().position(|a| a == s)
612    }
613}
614
615#[derive(ESExprCodec, Debug, PartialEq, Clone)]
616pub enum AppendedStringPool {
617    #[inline_value]
618    Fixed(FixedStringPool),
619
620    #[inline_value]
621    Single(String),
622}
623
624
625#[cfg(test)]
626mod test {
627    use std::str::FromStr;
628
629    use num_bigint::BigUint;
630
631    use crate::*;
632
633    #[test]
634    fn encode_int() {
635        fn check(n: &str, enc: &[u8]) {
636            let n = BigUint::from_str(n).unwrap();
637            
638            let mut buff: Vec<u8> = Vec::new();
639            let mut gen = ExprGenerator {
640                out: &mut buff,
641                string_pool: vec![],
642            };
643
644            gen.write_int_tag(TAG_VARINT_NON_NEG_INT, &n).unwrap();
645
646            assert_eq!(enc, &buff);
647
648            let mut reader = TokenReader {
649                read: &enc[1..],
650            };
651            let m = read_int(&mut reader, enc[0]).unwrap();
652            assert_eq!(n, m);
653        }
654
655        check("4", &[0x24]);
656        check("9223372036854775807", &[0x3F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x07]);
657        check("18446744073709551615", &[0x3F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F]);
658        check("12345678901234567890", &[0x32, 0xAD, 0xE1, 0xC7, 0xF5, 0x8C, 0xD3, 0xD2, 0xDA, 0x0A]);
659        check("98765432109876543210", &[0x3A, 0xEE, 0xCF, 0xC9, 0xF2, 0xB8, 0x9A, 0x95, 0xD5, 0x55]);
660    }
661}
662