1use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
2use bit_vec::BitVec;
3use huffman_compress::{Book, CodeBuilder, Tree};
4use lazy_static::*;
5use speedy::{Endianness, Readable};
6use std::io::{prelude::*, Cursor};
7use std::str::from_utf8_unchecked;
8use utf8_width::get_width;
9
/// A frequency table: each entry pairs a piece of text with a `u64` count.
///
/// Both halves of `WORD_CHAR_COUNT` use this shape; the counts end up as the
/// weights handed to the Huffman `CodeBuilder`.
type VecStrU64 = Vec<(String, u64)>;
11
12#[derive(PartialEq, Debug, Readable)]
13struct WordCharCount(VecStrU64, Vec<(char, u64)>);
14
15lazy_static! {
16 pub static ref WORD_CHAR_COUNT: (VecStrU64, VecStrU64) = {
17 let zst = include_bytes!("d.zst");
18 let mut zstc = Cursor::new(zst);
19 let mut decoder = ruzstd::StreamingDecoder::new(&mut zstc).unwrap();
20 let mut bytes: Vec<u8> = Vec::with_capacity(zst.len() * 5);
21 decoder.read_to_end(&mut bytes).unwrap();
22
23 let wcc = WordCharCount::read_from_buffer_with_ctx(Endianness::LittleEndian, &bytes).unwrap();
24
25 let mut word_li = wcc.0;
26 let char_li = wcc.1;
27 let mut char_vec = Vec::with_capacity(char_li.len());
28 for (k, v) in char_li {
29 if k == '\n' {
30 word_li.push(("\n\n".into(), v / 4));
31 }
32 char_vec.push((k.to_string(), v));
33 }
34
35 (word_li, char_vec)
36 };
37 pub static ref G: (AhoCorasick, Book<&'static [u8]>, Tree<&'static [u8]>) = {
38 let (word_li, char_li) = &*WORD_CHAR_COUNT;
39
40 let mut weights = Vec::with_capacity(word_li.len() + char_li.len());
41 for (k, v) in word_li.iter().chain(char_li.iter()) {
42 weights.push((k.as_bytes(), *v));
43 }
44
45 let (book, tree) = CodeBuilder::from_iter(weights.into_iter()).finish();
46
47 let ac = AhoCorasickBuilder::new()
48 .match_kind(MatchKind::LeftmostFirst)
49 .build(
50 [&b"\r\n"[..]]
51 .iter()
52 .chain(&word_li.iter().map(|x| x.0.as_bytes()).collect::<Vec<_>>()),
53 );
54
55 (ac, book, tree)
56 };
57}
58
59pub fn init() {
60 lazy_static::initialize(&WORD_CHAR_COUNT);
61 lazy_static::initialize(&G);
62}
63
64pub fn encode(input: &[u8]) -> Vec<u8> {
65 let (ac, book, _) = &*G;
66
67 let mut buffer = BitVec::new();
68 let mut pos = 0;
69
70 macro_rules! encode_char {
71 ($max:ident) => {
72 while pos < $max {
73 let n = get_width(input[pos]);
74 if n == 0 {
75 pos += 1;
76 continue;
77 }
78 let pos_next = pos + n;
79 let mut c = unsafe { input.get_unchecked(pos..pos_next) };
80 if n == 1 && c == b"\r" {
81 c = b"\n";
82 }
83 pos = pos_next;
84 let _ = book.encode(&mut buffer, &c);
85 }
86 };
87 }
88
89 for mat in ac.find_iter(input) {
90 let start = mat.start();
91 let end = mat.end();
92 encode_char!(start);
93 pos = end;
94 let c = {
95 if mat.pattern() == 0 {
96 &b"\n"[..]
97 } else {
98 unsafe { input.get_unchecked(start..end) }
99 }
100 };
101 let _ = book.encode(&mut buffer, c);
102 }
103
104 let len = input.len();
105 encode_char!(len);
106
107 buffer.push(true);
108
109 buffer.to_bytes()
110}
111
112pub fn decode(input: &[u8]) -> String {
113 let tree = &(G.2);
115 let mut bits = BitVec::from_bytes(input);
116 let mut len = bits.len();
117
118 while len != 0 {
119 len -= 1;
120 if bits[len] {
121 break;
122 }
123 }
124 unsafe { bits.set_len(len) };
125
126 let mut result = Vec::with_capacity(len);
127
128 for i in tree.unbounded_decoder(&bits) {
129 for j in i {
130 result.push(*j);
131 }
132 }
133
134 unsafe { from_utf8_unchecked(&result) }.to_string()
135}
136
137