1use lazy_static::lazy_static;
2
3lazy_static! {
4 static ref BIGRAMS: &'static str = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty";
6 static ref WORDS: Vec<&'static str> = vec!["that", "this", "with", "from", "your", "have", "more", "will", "home",
8"about", "page", "search", "free", "other", "information", "time", "they",
9"what", "which", "their", "news", "there", "only", "when", "contact", "here",
10"business", "also", "help", "view", "online", "first", "been", "would", "were",
11"some", "these", "click", "like", "service", "than", "find", "date", "back",
12"people", "list", "name", "just", "over", "year", "into", "email", "health",
13"world", "next", "used", "work", "last", "most", "music", "data", "make",
14"them", "should", "product", "post", "city", "policy", "number", "such",
15"please", "available", "copyright", "support", "message", "after", "best",
16"software", "then", "good", "video", "well", "where", "info", "right", "public",
17"high", "school", "through", "each", "order", "very", "privacy", "book", "item",
18"company", "read", "group", "need", "many", "user", "said", "does", "under",
19"general", "research", "university", "january", "mail", "full", "review",
20"program", "life", "know", "days", "management", "part", "could", "great",
21"united", "real", "international", "center", "ebay", "must", "store", "travel",
22"comment", "made", "development", "report", "detail", "line", "term", "before",
23"hotel", "send", "type", "because", "local", "those", "using", "result",
24"office", "education", "national", "design", "take", "posted", "internet",
25"address", "community", "within", "state", "area", "want", "phone", "shipping",
26"reserved", "subject", "between", "forum", "family", "long", "based", "code",
27"show", "even", "black", "check", "special", "price", "website", "index",
28"being", "women", "much", "sign", "file", "link", "open", "today", "technology",
29"south", "case", "project", "same", "version", "section", "found", "sport",
30"house", "related", "security", "both", "county", "american", "game", "member",
31"power", "while", "care", "network", "down", "computer", "system", "three",
32"total", "place", "following", "download", "without", "access", "think",
33"north", "resource", "current", "media", "control", "water", "history",
34"picture", "size", "personal", "since", "including", "guide", "shop",
35"directory", "board", "location", "change", "white", "text", "small", "rating",
36"rate", "government", "child", "during", "return", "student", "shopping",
37"account", "site", "level", "digital", "profile", "previous", "form", "event",
38"love", "main", "another", "class", "still"];
39}
40
41
42pub fn compress(s: &str) -> Option<Vec<u8>> {
45 let mut dst: Vec<u8> = vec![];
46 let mut verblen = 0u8;
47 let s_bytes = s.as_bytes();
48 let mut cursor = 0;
49
50 while cursor < s_bytes.len() {
51 let mut matched = false;
52
53 if s_bytes.len() - cursor >= 4 {
54 for (i, w) in WORDS.iter().enumerate() {
55 if s[cursor..].starts_with(w) {
56 let escape_code = match s.as_bytes()[cursor..].get(w.len()) {
57 Some(&b' ') => 7,
58 _ => 6,
59 };
60
61 dst.push(escape_code);
62 dst.push(i as u8);
63 cursor += w.len();
64 verblen = 0;
65 matched = true;
66 break;
67 }
68 }
69 }
70
71 if matched {
72 continue;
73 }
74
75 if s_bytes.len() - cursor >= 2 {
76 let bigram_slice = &BIGRAMS.as_bytes()[..BIGRAMS.len() - 1];
77 for (i, bigram) in bigram_slice.chunks(2).enumerate() {
78 if s_bytes[cursor..cursor + 2] == *bigram {
79 dst.push(0x80 | i as u8);
80 cursor += 2;
81 verblen = 0;
82 matched = true;
83 break;
84 }
85 }
86 }
87
88 if matched {
89 continue;
90 }
91
92 let byte = s_bytes[cursor];
93 if !(0x01..=0x08).contains(&byte) {
94 dst.push(byte);
95 cursor += 1;
96 verblen = 0;
97 } else {
98 verblen += 1;
99 if verblen == 1 {
100 dst.extend(&[verblen, byte]);
101 } else {
102 let len_idx = dst.len() - verblen as usize - 1;
103 dst[len_idx] = verblen;
104 dst.push(byte);
105 if verblen == 5 {
106 verblen = 0;
107 }
108 }
109 cursor += 1;
110 }
111 }
112
113 Some(dst)
114}
115
116
117pub fn decompress(c: &[u8]) -> Option<String> {
120 let mut res = String::new();
121 let bigrams_bytes = BIGRAMS.as_bytes();
122 let mut i = 0;
123
124 while i < c.len() {
125 match c[i] {
126 0x80..=0xFF => {
127 let index = ((c[i] & 0x7F) as usize) * 2;
128 if index + 1 < bigrams_bytes.len() {
129 res.push(bigrams_bytes[index] as char);
130 res.push(bigrams_bytes[index + 1] as char);
131 i += 1;
132 } else {
133 return None; }
135 }
136 0x01..=0x05 => {
137 let length = c[i] as usize;
138 if i + length < c.len() {
139 res.extend(c[i + 1..i + 1 + length].iter().map(|&b| b as char));
140 i += length + 1;
141 } else {
142 return None; }
144 }
145 0x06..=0x08 => {
146 if i + 1 < c.len() {
147 let index = c[i + 1] as usize;
148 if let Some(word) = WORDS.get(index) {
149 res.push_str(word);
150 if c[i] == 0x07 || c[i] == 0x08 {
151 res.push(' ');
152 }
153 i += 2;
154 } else {
155 return None; }
157 } else {
158 return None; }
160 }
161 _ => {
162 res.push(c[i] as char);
163 i += 1;
164 }
165 }
166 }
167
168 Some(res)
169}