smaz2/
smaz2.rs

1use lazy_static::lazy_static;
2
/// The 128 most common English bigrams, stored back to back (256 bytes).
///
/// Bigram `i` occupies bytes `2 * i` and `2 * i + 1` and is encoded as the
/// single output byte `0x80 | i`.
static BIGRAMS: &str = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty";

/// 256 common English words of four letters or more.
///
/// A word is encoded as an escape byte followed by its index in this table.
/// The fixed array length guarantees at compile time that every index fits
/// in a `u8`.
static WORDS: [&str; 256] = [
    "that", "this", "with", "from", "your", "have", "more", "will", "home",
    "about", "page", "search", "free", "other", "information", "time", "they",
    "what", "which", "their", "news", "there", "only", "when", "contact", "here",
    "business", "also", "help", "view", "online", "first", "been", "would", "were",
    "some", "these", "click", "like", "service", "than", "find", "date", "back",
    "people", "list", "name", "just", "over", "year", "into", "email", "health",
    "world", "next", "used", "work", "last", "most", "music", "data", "make",
    "them", "should", "product", "post", "city", "policy", "number", "such",
    "please", "available", "copyright", "support", "message", "after", "best",
    "software", "then", "good", "video", "well", "where", "info", "right", "public",
    "high", "school", "through", "each", "order", "very", "privacy", "book", "item",
    "company", "read", "group", "need", "many", "user", "said", "does", "under",
    "general", "research", "university", "january", "mail", "full", "review",
    "program", "life", "know", "days", "management", "part", "could", "great",
    "united", "real", "international", "center", "ebay", "must", "store", "travel",
    "comment", "made", "development", "report", "detail", "line", "term", "before",
    "hotel", "send", "type", "because", "local", "those", "using", "result",
    "office", "education", "national", "design", "take", "posted", "internet",
    "address", "community", "within", "state", "area", "want", "phone", "shipping",
    "reserved", "subject", "between", "forum", "family", "long", "based", "code",
    "show", "even", "black", "check", "special", "price", "website", "index",
    "being", "women", "much", "sign", "file", "link", "open", "today", "technology",
    "south", "case", "project", "same", "version", "section", "found", "sport",
    "house", "related", "security", "both", "county", "american", "game", "member",
    "power", "while", "care", "network", "down", "computer", "system", "three",
    "total", "place", "following", "download", "without", "access", "think",
    "north", "resource", "current", "media", "control", "water", "history",
    "picture", "size", "personal", "since", "including", "guide", "shop",
    "directory", "board", "location", "change", "white", "text", "small", "rating",
    "rate", "government", "child", "during", "return", "student", "shopping",
    "account", "site", "level", "digital", "profile", "previous", "form", "event",
    "love", "main", "another", "class", "still",
];

/// Longest verbatim run a single length byte (`0x01..=0x05`) can announce.
const MAX_VERBATIM_RUN: u8 = 5;
/// Escape byte: the next byte is a `WORDS` index.
const ESC_WORD: u8 = 6;
/// Escape byte: the next byte is a `WORDS` index, followed by one space.
const ESC_WORD_SPACE: u8 = 7;
/// Escape byte: one space, then the word at the following `WORDS` index.
const ESC_SPACE_WORD: u8 = 8;

/// Compresses `s` into the smaz2 byte format.
///
/// At every input position the encoder tries, in order:
///
/// 1. a dictionary word: `6, index`, or `7, index` when the word is
///    immediately followed by a space (the space is consumed as well);
/// 2. a bigram: the single byte `0x80 | index`;
/// 3. a literal: the byte itself, when it is `0x00` or in `0x09..=0x7F`;
/// 4. a verbatim run: a length byte `1..=5` followed by that many raw bytes,
///    used for bytes that would otherwise be mistaken for an escape
///    (`0x01..=0x08`) or a bigram code (`0x80..=0xFF`, i.e. every byte of a
///    non-ASCII UTF-8 character).
///
/// Matching is done on bytes, so the input may contain any UTF-8 text.
///
/// # Returns
///
/// Always `Some(bytes)`; the `Option` is kept for API compatibility.
pub fn compress(s: &str) -> Option<Vec<u8>> {
    let src = s.as_bytes();
    let mut dst: Vec<u8> = Vec::with_capacity(src.len());
    // Number of bytes in the verbatim run currently being built (0 = none).
    let mut verblen = 0u8;
    let mut cursor = 0;

    while cursor < src.len() {
        let rest = &src[cursor..];

        // Every dictionary word has at least four letters, so skip the scan
        // when fewer bytes remain.
        if rest.len() >= 4 {
            if let Some(i) = WORDS.iter().position(|w| rest.starts_with(w.as_bytes())) {
                let word_len = WORDS[i].len();
                let followed_by_space = rest.get(word_len) == Some(&b' ');
                dst.push(if followed_by_space { ESC_WORD_SPACE } else { ESC_WORD });
                // Lossless: WORDS has exactly 256 entries.
                dst.push(i as u8);
                // The "word + space" escape stands for the space too, so it
                // must not be encoded a second time.
                cursor += word_len + usize::from(followed_by_space);
                verblen = 0;
                continue;
            }
        }

        // `starts_with` is false when fewer than two bytes remain.
        let bigram = BIGRAMS
            .as_bytes()
            .chunks_exact(2)
            .position(|pair| rest.starts_with(pair));
        if let Some(i) = bigram {
            // Lossless: there are 128 bigrams, so `i` fits in 7 bits.
            dst.push(0x80 | i as u8);
            cursor += 2;
            verblen = 0;
            continue;
        }

        let byte = rest[0];
        let needs_escape = (0x01..=0x08).contains(&byte) || byte >= 0x80;
        if !needs_escape {
            dst.push(byte);
            verblen = 0;
        } else {
            verblen += 1;
            if verblen == 1 {
                // Open a new run: provisional length 1, then the byte.
                dst.extend_from_slice(&[1, byte]);
            } else {
                // The run so far is `[len, b1, .., b(verblen-1)]`, so its
                // length byte sits `verblen` positions from the end.
                let len_idx = dst.len() - usize::from(verblen);
                dst[len_idx] = verblen;
                dst.push(byte);
                if verblen == MAX_VERBATIM_RUN {
                    // Run is full; the next escaped byte opens a new one.
                    verblen = 0;
                }
            }
        }
        cursor += 1;
    }

    Some(dst)
}

/// Decompresses smaz2-encoded bytes back into a `String`.
///
/// Byte meanings:
///
/// * `0x80..=0xFF` - bigram number `byte & 0x7F`;
/// * `0x01..=0x05` - verbatim run: that many raw bytes follow;
/// * `0x06` - word, `0x07` - word + space, `0x08` - space + word; the next
///   byte is the index into the word table;
/// * anything else (`0x00`, `0x09..=0x7F`) - the literal byte itself.
///
/// # Returns
///
/// `None` when the input is truncated (a verbatim run or word escape runs
/// past the end of `c`) or when the decoded bytes are not valid UTF-8.
pub fn decompress(c: &[u8]) -> Option<String> {
    let bigrams = BIGRAMS.as_bytes();
    // Decode into raw bytes first: verbatim runs may carry multi-byte UTF-8
    // sequences, which are only meaningful once fully reassembled.
    let mut out: Vec<u8> = Vec::with_capacity(c.len() * 2);
    let mut i = 0;

    while i < c.len() {
        let code = c[i];
        match code {
            0x80..=0xFF => {
                let start = usize::from(code & 0x7F) * 2;
                out.extend_from_slice(bigrams.get(start..start + 2)?);
                i += 1;
            }
            0x01..=MAX_VERBATIM_RUN => {
                let len = usize::from(code);
                out.extend_from_slice(c.get(i + 1..i + 1 + len)?);
                i += 1 + len;
            }
            ESC_WORD..=ESC_SPACE_WORD => {
                let index = usize::from(*c.get(i + 1)?);
                let word = WORDS.get(index)?;
                if code == ESC_SPACE_WORD {
                    out.push(b' ');
                }
                out.extend_from_slice(word.as_bytes());
                if code == ESC_WORD_SPACE {
                    out.push(b' ');
                }
                i += 2;
            }
            _ => {
                out.push(code);
                i += 1;
            }
        }
    }

    String::from_utf8(out).ok()
}