smol_file/
lib.rs

1//! # smol - 5 bit encoding file format
//! what is smol? smol is a file format that compresses text into 5 bits per letter instead of the normal 8 bits.
//! this is achieved by having a charset of no more than 32 chars, so every letter fits nicely into 5 bits (32 values). 
//! this creates all sorts of problems, such as a byte being 8 bits, so letters overlap each other across byte boundaries,
//! but this is all handled by this library for ease of use.  
6//!   
//! this is not made for any production applications, only as a hobby.  
8//!   
9//! ## how does this work?  
10//! this is the entire charset  
//! ```text
12//! " abcdefghijklmnopqrstuvwxyz.!?12"  
13//!  ^ space
14//! ```  
15//!  
//! the first 30 chars are normal (space, alphabet, `.`, `!` and `?`)  
17//!   
//! the last two (1 and 2) have special functions, which are:  
//! (1): toggles number mode - while it is active, all characters read are interpreted as their number part. [A = 1, B = 2 ... I = 9]  
//! (2): makes the next character uppercase, or triggers a special action in some cases. example: newlines are formatted `"2 "`
21//!   
22//! see also [`SmolBlob`]
23
24use std::io::{ Cursor, Read };
25use anyhow::{ Result, Error };
26use utils::{char_to_index, index_to_char};
27mod utils;
28
// file-format version: stored in every new blob and written into the file header.
// the decoder reads it back to pick the header layout (version 1 has no separate symbol count).
// CHANGE THIS EACH RELEASE
const VERSION: u64 = 2;
31
/// a chunk of smol data: the packed 5-bit payload plus the cursor state that is
/// used while encoding or decoding it.
///
/// build one with [`SmolBlob::encode`], turn it into file bytes with
/// [`SmolBlob::buffer`] and get the text back with [`SmolBlob::decode`] or
/// [`SmolBlob::decode_blob`].
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct SmolBlob {
    /// format version written into the file header
    version: u64,
    /// packed payload, 5 bits per symbol, lowest bits first
    buffer: Vec<u8>,

    /// number of 5-bit symbols pushed so far (control symbols `1` and `2` included)
    size: u64,

    /// encoder state: bits that have not been flushed into `buffer` yet
    current: u16,
    /// encoder state: how many bits of `current` are occupied
    offset: u8,

    /// true while digits are being written or read, toggled by the `1` symbol
    number_mode: bool,
    /// decoder state: set by the `2` symbol, makes the next letter uppercase
    super_mode: bool,

    /// decoder state: bit position inside `buffer[byte_index]` where the next symbol starts
    current_bit: u8,
    /// decoder state: index of the byte that holds the start of the next symbol
    byte_index: usize,
}
48
49impl Default for SmolBlob {
50    fn default() -> Self {
51        Self { 
52            version: VERSION, 
53            buffer: vec![], 
54            size: 0, 
55            current: 0, 
56            offset: 0, 
57            number_mode: false, 
58            super_mode: false,
59            current_bit: 0,
60            byte_index: 0,
61        }
62    }
63}
64
65/// [`SmolBlob`] is a chunk of smol data
66impl SmolBlob {
67    /// converts the [`SmolBlob`] data into a valid file buffer
68    pub fn buffer(&self) -> Vec<u8> {
69        let mut buffer = vec![];
70        buffer.extend(b"smol");
71
72        let mut header = vec![];
73        leb128::write::unsigned(&mut header, self.version).unwrap();
74        leb128::write::unsigned(&mut header, self.size).unwrap();
75
76        leb128::write::unsigned(&mut buffer, header.len() as u64).unwrap();
77        buffer.extend(header);
78        leb128::write::unsigned(&mut buffer, self.buffer.len() as u64).unwrap();
79        buffer.extend(self.buffer.clone());
80
81        buffer
82    }
83
84    /// returns the inner buffer length **ONLY**, not to be confused by [`SmolBlob::buffer`].len()
85    pub fn len(&self) -> usize {
86        self.buffer().len()
87    }
88
89    /// encodes a [`String`] and returns a [`SmolBlob`]
90    /// # example
91    ///
92    /// ```
93    /// let encoded: SmolBlob = SmolBlob::encode(&input);
94    /// fs::write("smol.bin", &encoded.buffer()).unwrap();
95    /// ```
96    pub fn encode(str: &String) -> SmolBlob {
97        let mut blob = SmolBlob::default();
98        for char in str.chars().into_iter() {
99            if char.is_numeric() {
100                if !blob.number_mode {
101                    blob.push_char('1');
102                    blob.number_mode = true;
103                }
104                let number: u32 = char.to_digit(10).unwrap();
105                blob.push_char(index_to_char(number as usize));
106            } else {
107                if blob.number_mode {
108                    blob.push_char('1');
109                    blob.number_mode = false;
110                }
111
112                if char.is_uppercase() {
113                    blob.push_char('2');
114                }
115
116                blob.push_char(char.to_ascii_lowercase());
117            } 
118        }
119
120        if blob.number_mode {
121            blob.push_char('1');
122            blob.number_mode = false;
123        }
124    
125        if blob.offset > 0 {
126            blob.buffer.push(blob.current as u8);
127        }
128    
129        return blob;
130    }
131
132    fn push_char(&mut self, char: char) {
133        self.size += 1;
134
135        let index = utils::char_to_index(char);
136        self.current |= (index as u16) << self.offset;
137        self.offset += 5;
138        if self.offset >= 8 {
139            self.buffer.push((self.current & 0xff) as u8);
140            self.current = (self.current & 0xff00) >> 8;
141            self.offset -= 8;
142        }
143    }
144
145    /// decodes a [`SmolBlob::buffer`] and returns a [`String`]  
146    /// # example
147    ///
148    /// ```
149    /// let decoded: String = SmolBlob::decode(&encoded.buffer()).unwrap();
150    /// fs::write("unsmol.bin", &decoded).unwrap();
151    /// ```
152    pub fn decode(input: &Vec<u8>) -> Result<String, Error> {
153        if input.len() < 4 {
154            return Err(Error::msg("file is too small, is it truncated?"));
155        }
156        let mut curs = Cursor::new(input);
157        
158        let mut magic = [0u8; 4];
159        curs.read_exact(&mut magic)?;
160    
161        if &magic != b"smol" {
162            return Err(Error::msg("invalid file magic, this is not an smol file"));
163        }
164    
165        let _header_size = leb128::read::unsigned(&mut curs)?;
166        let version = leb128::read::unsigned(&mut curs)?;
167
168        let buffer_size;
169        let size = match version {
170            1 => {
171                buffer_size = leb128::read::unsigned(&mut curs)?;
172                buffer_size
173            }
174            _ => {
175                let size = leb128::read::unsigned(&mut curs)?;
176                buffer_size = leb128::read::unsigned(&mut curs)?;
177                size
178            }
179        };
180
181        let mut blob = SmolBlob::default();
182        blob.buffer = vec![0u8; buffer_size as usize];
183        curs.read_exact(&mut blob.buffer)?;
184    
185        let mut out_string = String::new();
186
187        for _ in 0..size {
188            let char = match blob.read_char() {
189                Some(c) => c,
190                None => break,
191            };
192            
193            if char.is_numeric() {
194                match char {
195                    '1' => {
196                        blob.number_mode = !blob.number_mode;
197                    }
198                    '2' => {
199                        blob.super_mode = true;
200                    }
201                    _ => {
202                        return Err(Error::msg("invalid data when decoding"));
203                    }
204                }
205            } else {
206                if blob.number_mode {
207                    let num: u32 = char_to_index(char) as u32;
208                    out_string.push(char::from_digit(num, 10).unwrap());
209                } else {
210                    if blob.super_mode {
211                        out_string.push(char.to_ascii_uppercase());
212                        blob.super_mode = false;
213                    } else {
214                        out_string.push(char);
215                    }
216                }
217            }
218        }
219    
220        Ok(out_string)
221    }
222
223    fn read_char(&mut self) -> Option<char> {
224        let byte_index = match self.buffer.get(self.byte_index) {
225            Some(i) => i,
226            None => return None,
227        };
228
229        let mut current: u8 = (byte_index >> self.current_bit) & 0x1F;
230    
231        if self.current_bit >= 4 {
232            if self.byte_index == (self.buffer.len() - 1) {
233                return None;
234            } 
235
236            current |= (self.buffer[self.byte_index + 1]  << (8 - self.current_bit)) & 0x1F;
237        }
238
239        self.current_bit += 5;
240        if self.current_bit >= 8 {
241            self.byte_index += 1;
242            self.current_bit -= 8;
243        }
244
245        let char = utils::index_to_char(current as usize);
246
247        return Some(char);
248    }
249
250    /// decodes a [`SmolBlob`] and returns a [`String`]  
251    /// internally this is the same as [`SmolBlob::decode`] but with [`SmolBlob`] instead of [`Vec<u8>`] buffer
252    /// # example
253    ///
254    /// ```
255    /// let decoded: String = SmolBlob::decode_blob(&encoded).unwrap();
256    /// fs::write("unsmol.bin", &decoded).unwrap();
257    /// ```
258    pub fn decode_blob(input: &SmolBlob) -> Result<String, Error> {
259        return SmolBlob::decode(&input.buffer());
260    }
261}