zim/
cluster.rs

1use std::fmt;
2use std::io::Cursor;
3use std::io::Read;
4use std::ops::Deref;
5use std::sync::{Arc, RwLock};
6
7use bitreader::BitReader;
8use byteorder::{LittleEndian, ReadBytesExt};
9use memmap::Mmap;
10use ouroboros::self_referencing;
11use xz2::read::XzDecoder;
12
13use crate::errors::{Error, Result};
14
/// Compression scheme used for the payload of a cluster.
///
/// Each variant's discriminant is the numeric code that the variant converts to when cast to
/// `u8`.
#[repr(u8)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Compression {
    /// The payload is stored uncompressed.
    None = 0,
    /// The payload is a zlib stream.
    Zlib = 2,
    /// The payload is a bzip2 stream.
    Bzip2 = 3,
    /// The payload is an LZMA2 (xz) stream.
    Lzma2 = 4,
    /// The payload is a Zstandard stream.
    Zstd = 5,
}
24
25impl From<Compression> for u8 {
26    fn from(mode: Compression) -> u8 {
27        mode as u8
28    }
29}
30
31impl Compression {
32    pub fn from(raw: u8) -> Result<Compression> {
33        match raw {
34            0 => Ok(Compression::None),
35            1 => Ok(Compression::None),
36            2 => Ok(Compression::Zlib),
37            3 => Ok(Compression::Bzip2),
38            4 => Ok(Compression::Lzma2),
39            5 => Ok(Compression::Zstd),
40            _ => Err(Error::UnknownCompression(raw)),
41        }
42    }
43}
44
45/// A cluster of blobs
46///
47/// Within an ZIM archive, clusters contain several blobs of data that are all compressed together.
48/// Each blob is the data for an article.
49#[derive(Clone)]
50pub struct Cluster<'a>(Arc<RwLock<InnerCluster<'a>>>);
51
52pub struct InnerCluster<'a> {
53    extended: bool,
54    compression: Compression,
55    start: u64,
56    end: u64,
57    size: u64,
58    view: &'a [u8],
59    blob_list: Option<Vec<u64>>, // offsets into data
60    decompressed: Option<Vec<u8>>,
61}
62
63impl<'a> fmt::Debug for Cluster<'a> {
64    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
65        let raw = self.0.read().unwrap();
66        f.debug_struct("Cluster")
67            .field("extended", &raw.extended)
68            .field("compression", &raw.compression)
69            .field("start", &raw.start)
70            .field("end", &raw.end)
71            .field("size", &raw.size)
72            .field("view len", &raw.view.len())
73            .field("blob_list", &raw.blob_list)
74            .field(
75                "decompressed len",
76                &raw.decompressed.as_ref().map(|s| s.len()),
77            )
78            .finish()
79    }
80}
81
82impl<'a> Cluster<'a> {
83    pub fn new(
84        master_view: &'a Mmap,
85        cluster_list: &'a Vec<u64>,
86        idx: u32,
87        checksum_pos: u64,
88        version: u16,
89    ) -> Result<Cluster<'a>> {
90        Ok(Cluster(Arc::new(RwLock::new(InnerCluster::new(
91            master_view,
92            cluster_list,
93            idx,
94            checksum_pos,
95            version,
96        )?))))
97    }
98
99    pub fn decompress(&self) -> Result<()> {
100        self.0.write().unwrap().decompress()
101    }
102
103    pub fn compression(&self) -> Compression {
104        self.0.read().unwrap().compression
105    }
106
107    pub fn get_blob<'b: 'a>(&'b self, idx: u32) -> Result<Blob<'a, 'b>> {
108        {
109            let lock = self.0.read().unwrap();
110            if lock.needs_decompression() {
111                drop(lock);
112                self.0.write().unwrap().decompress()?;
113            }
114        }
115
116        let blob = BlobTryBuilder {
117            guard: self.0.read().unwrap(),
118            slice_builder: |guard| guard.get_blob(idx),
119        }
120        .try_build()?;
121
122        Ok(blob)
123    }
124}
125
126#[self_referencing]
127pub struct Blob<'a, 'b: 'a> {
128    guard: std::sync::RwLockReadGuard<'b, InnerCluster<'a>>,
129    #[borrows(guard)]
130    slice: &'this [u8],
131}
132
133impl<'a, 'b: 'a> Deref for Blob<'a, 'b> {
134    type Target = [u8];
135    fn deref(&self) -> &Self::Target {
136        self.borrow_slice()
137    }
138}
139
140impl<'a, 'b: 'a> AsRef<[u8]> for Blob<'a, 'b> {
141    fn as_ref(&self) -> &[u8] {
142        self.borrow_slice()
143    }
144}
145
146impl<'a> InnerCluster<'a> {
147    fn new(
148        master_view: &'a Mmap,
149        cluster_list: &'a Vec<u64>,
150        idx: u32,
151        checksum_pos: u64,
152        version: u16,
153    ) -> Result<Self> {
154        let idx = idx as usize;
155        let start = cluster_list[idx];
156        let end = if idx < cluster_list.len() - 1 {
157            cluster_list[idx + 1]
158        } else {
159            checksum_pos
160        };
161
162        assert!(end > start);
163        let cluster_size = end - start;
164        let cluster_view = master_view
165            .get(start as usize..end as usize)
166            .ok_or(Error::OutOfBounds)?;
167
168        let (extended, compression) =
169            parse_details(cluster_view.first().ok_or(Error::OutOfBounds)?)?;
170
171        // extended clusters are only allowed in version 6
172        if extended && version != 6 {
173            return Err(Error::InvalidClusterExtension);
174        }
175
176        let blob_list = if Compression::None == compression {
177            let cur = Cursor::new(&cluster_view[1..]);
178            Some(parse_blob_list(cur, extended)?)
179        } else {
180            None
181        };
182
183        Ok(Self {
184            extended,
185            compression,
186            start,
187            end,
188            size: cluster_size,
189            view: cluster_view,
190            decompressed: None,
191            blob_list,
192        })
193    }
194
195    fn needs_decompression(&self) -> bool {
196        match self.compression {
197            Compression::Lzma2 | Compression::Bzip2 | Compression::Zlib | Compression::Zstd => {
198                self.decompressed.is_none() || self.blob_list.is_none()
199            }
200            Compression::None => false,
201        }
202    }
203
204    fn decompress(&mut self) -> Result<()> {
205        if self.decompressed.is_none() {
206            match self.compression {
207                Compression::Lzma2 => {
208                    let mut decoder = XzDecoder::new(&self.view[1..]);
209                    let mut d = Vec::with_capacity(self.view.len());
210                    decoder.read_to_end(&mut d)?;
211                    self.decompressed = Some(d);
212                }
213                Compression::Bzip2 => {
214                    todo!("bzip2");
215                }
216                Compression::Zlib => {
217                    todo!("zlib");
218                }
219                Compression::Zstd => {
220                    let out = zstd::stream::decode_all(&self.view[1..])?;
221                    self.decompressed = Some(out);
222                }
223                Compression::None => {}
224            }
225        }
226
227        if self.blob_list.is_none() {
228            match self.compression {
229                Compression::Lzma2 | Compression::Bzip2 | Compression::Zlib | Compression::Zstd => {
230                    let cur = Cursor::new(self.decompressed.as_ref().unwrap());
231                    let blob_list = parse_blob_list(cur, self.extended)?;
232                    self.blob_list = Some(blob_list);
233                }
234                Compression::None => {}
235            }
236        }
237
238        Ok(())
239    }
240
241    fn get_blob(&self, idx: u32) -> Result<&[u8]> {
242        match self.blob_list {
243            Some(ref list) => {
244                let start = list[idx as usize] as usize;
245                let n = idx as usize + 1;
246                let end = if list.len() > n {
247                    list[n] as usize
248                } else {
249                    self.size as usize
250                };
251
252                Ok(match self.compression {
253                    Compression::Lzma2
254                    | Compression::Bzip2
255                    | Compression::Zlib
256                    | Compression::Zstd => {
257                        // decompressed, so we know this exists
258                        &self.decompressed.as_ref().unwrap().as_slice()[start..end]
259                    }
260                    Compression::None => &self.view[1 + start..1 + end],
261                })
262            }
263            None => Err(Error::MissingBlobList),
264        }
265    }
266}
267
268/// Parses the cluster information.
269///
270/// Fourth low bits:
271///   - 0: default (no compression),
272///   - 1: none (inherited from Zeno),
273///   - 4: LZMA2 compressed
274/// Firth bits :
275///   - 0: normal (OFFSET_SIZE=4)
276///   - 1: extended (OFFSET_SIZE=8)
277fn parse_details(details: &u8) -> Result<(bool, Compression)> {
278    let slice = &[*details];
279    let mut reader = BitReader::new(slice);
280    // skip first three bits
281    reader.skip(3)?;
282
283    // extended mode is the 4th bits from the left
284    // compression are the last four bits
285
286    Ok((reader.read_bool()?, Compression::from(reader.read_u8(4)?)?))
287}
288
289fn parse_blob_list<T: ReadBytesExt>(mut cur: T, extended: bool) -> Result<Vec<u64>> {
290    let mut blob_list = Vec::new();
291
292    // determine the count of blobs, by reading the first offset
293    let first = if extended {
294        cur.read_u64::<LittleEndian>()?
295    } else {
296        cur.read_u32::<LittleEndian>()? as u64
297    };
298
299    let count = if extended { first / 8 } else { first / 4 };
300
301    blob_list.push(first);
302
303    for _ in 0..(count as usize - 1) {
304        if extended {
305            blob_list.push(cur.read_u64::<LittleEndian>()?);
306        } else {
307            blob_list.push(cur.read_u32::<LittleEndian>()? as u64);
308        }
309    }
310
311    Ok(blob_list)
312}