rmw_utf8/
lib.rs

1use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
2use bit_vec::BitVec;
3use huffman_compress::{Book, CodeBuilder, Tree};
4use lazy_static::*;
5use speedy::{Endianness, Readable};
6use std::io::{prelude::*, Cursor};
7use std::str::from_utf8_unchecked;
8use utf8_width::get_width;
9
/// A list of `(symbol, weight)` pairs: the symbol text and its `u64` frequency count.
type VecStrU64 = Vec<(String, u64)>;
11
12#[derive(PartialEq, Debug, Readable)]
13struct WordCharCount(VecStrU64, Vec<(char, u64)>);
14
15lazy_static! {
16  pub static ref WORD_CHAR_COUNT: (VecStrU64, VecStrU64) = {
17    let zst = include_bytes!("d.zst");
18    let mut zstc = Cursor::new(zst);
19    let mut decoder = ruzstd::StreamingDecoder::new(&mut zstc).unwrap();
20    let mut bytes: Vec<u8> = Vec::with_capacity(zst.len() * 5);
21    decoder.read_to_end(&mut bytes).unwrap();
22
23    let wcc = WordCharCount::read_from_buffer_with_ctx(Endianness::LittleEndian, &bytes).unwrap();
24
25    let mut word_li = wcc.0;
26    let char_li = wcc.1;
27    let mut char_vec = Vec::with_capacity(char_li.len());
28    for (k, v) in char_li {
29      if k == '\n' {
30        word_li.push(("\n\n".into(), v / 4));
31      }
32      char_vec.push((k.to_string(), v));
33    }
34
35    (word_li, char_vec)
36  };
37  pub static ref G: (AhoCorasick, Book<&'static [u8]>, Tree<&'static [u8]>) = {
38    let (word_li, char_li) = &*WORD_CHAR_COUNT;
39
40    let mut weights = Vec::with_capacity(word_li.len() + char_li.len());
41    for (k, v) in word_li.iter().chain(char_li.iter()) {
42      weights.push((k.as_bytes(), *v));
43    }
44
45    let (book, tree) = CodeBuilder::from_iter(weights.into_iter()).finish();
46
47    let ac = AhoCorasickBuilder::new()
48      .match_kind(MatchKind::LeftmostFirst)
49      .build(
50        [&b"\r\n"[..]]
51          .iter()
52          .chain(&word_li.iter().map(|x| x.0.as_bytes()).collect::<Vec<_>>()),
53      );
54
55    (ac, book, tree)
56  };
57}
58
59pub fn init() {
60  lazy_static::initialize(&WORD_CHAR_COUNT);
61  lazy_static::initialize(&G);
62}
63
64pub fn encode(input: &[u8]) -> Vec<u8> {
65  let (ac, book, _) = &*G;
66
67  let mut buffer = BitVec::new();
68  let mut pos = 0;
69
70  macro_rules! encode_char {
71    ($max:ident) => {
72      while pos < $max {
73        let n = get_width(input[pos]);
74        if n == 0 {
75          pos += 1;
76          continue;
77        }
78        let pos_next = pos + n;
79        let mut c = unsafe { input.get_unchecked(pos..pos_next) };
80        if n == 1 && c == b"\r" {
81          c = b"\n";
82        }
83        pos = pos_next;
84        let _ = book.encode(&mut buffer, &c);
85      }
86    };
87  }
88
89  for mat in ac.find_iter(input) {
90    let start = mat.start();
91    let end = mat.end();
92    encode_char!(start);
93    pos = end;
94    let c = {
95      if mat.pattern() == 0 {
96        &b"\n"[..]
97      } else {
98        unsafe { input.get_unchecked(start..end) }
99      }
100    };
101    let _ = book.encode(&mut buffer, c);
102  }
103
104  let len = input.len();
105  encode_char!(len);
106
107  buffer.push(true);
108
109  buffer.to_bytes()
110}
111
112pub fn decode(input: &[u8]) -> String {
113  // 解压缩
114  let tree = &(G.2);
115  let mut bits = BitVec::from_bytes(input);
116  let mut len = bits.len();
117
118  while len != 0 {
119    len -= 1;
120    if bits[len] {
121      break;
122    }
123  }
124  unsafe { bits.set_len(len) };
125
126  let mut result = Vec::with_capacity(len);
127
128  for i in tree.unbounded_decoder(&bits) {
129    for j in i {
130      result.push(*j);
131    }
132  }
133
134  unsafe { from_utf8_unchecked(&result) }.to_string()
135}
136
137/*
138fn main() -> Result<(), Box<dyn Error>> {
139  for input in [
140    "市场上绝大多数人基本上都仅仅着眼当前盛况的外象,而没有思考背后的根本原因。",
141    "2006美版7.7分爱情《触不到的恋人》BD1080p.中文字幕",
142  ] {
143    let compressed = encode(input.as_bytes());
144    println!(
145      "\n{:?}\nbytes {} -> {} compresse ratio = {:.2}%",
146      &compressed,
147      input.len(),
148      compressed.len(),
149      100.0 * (compressed.len() as f64 / input.len() as f64),
150    );
151
152    println!("{}", input);
153    println!("{}", decode(&compressed));
154  }
155  Ok(())
156}
157*/