Skip to main content

tiny_tree_magic/
lib.rs

1use std::collections::{HashMap, VecDeque, hash_map::Entry};
2use std::error::Error as StdError;
3use std::fmt;
4use std::ops::Range;
5use std::str::from_utf8;
6
7use memchr::{memchr, memmem::find};
8
9pub struct Database<C> {
10    cache: C,
11    types: Box<[FrozenType]>,
12    matches: FrozenMatches,
13    children: FrozenChildren,
14    text_idx: u32,
15    binary_idx: u32,
16}
17
18#[derive(Debug, Default)]
19struct Type {
20    matches: Vec<Match>,
21    children: Vec<u32>,
22}
23
24#[derive(Debug)]
25struct Match {
26    cnt: u32,
27    off: u32,
28}
29
30#[derive(Debug)]
31struct FrozenType {
32    mime_type_off: u32,
33    matches: Range<u32>,
34    children: Range<u32>,
35}
36
37type FrozenMatches = Box<[Match]>;
38
39type FrozenChildren = Box<[u32]>;
40
41impl<C> Database<C>
42where
43    C: AsRef<[u8]>,
44{
45    /// Open the given MIME-info `cache`
46    ///
47    /// This can be an in-memory buffer or a memory map of a [mime.cache file](https://specifications.freedesktop.org/shared-mime-info/latest/ar01s02.html#id-1.3.12).
48    ///
49    /// ```no_run
50    /// # use std::error::Error;
51    /// use std::fs::File;
52    ///
53    /// use memmap2::Mmap;
54    /// use tiny_tree_magic::Database;
55    ///
56    /// let cache = unsafe { Mmap::map(&File::open("/usr/share/mime/mime.cache")?)? };
57    ///
58    /// let database = Database::open(cache)?;
59    /// # Ok::<_, Box<dyn Error>>(())
60    /// ```
61    #[cold]
62    pub fn open(cache: C) -> Result<Self, Error> {
63        let (parents, types) = parse_offsets(cache.as_ref()).ok_or(Error::MalformedOffsets)?;
64
65        let mut parents = parse_parents(cache.as_ref(), parents).ok_or(Error::MalformedParents)?;
66
67        let mut types = parse_types(cache.as_ref(), types).ok_or(Error::MalformedTypes)?;
68
69        let (text_off, binary_off) = parents_to_children(cache.as_ref(), &mut parents, &mut types)?;
70
71        let types = sort(types, parents)?;
72
73        Ok(Self::freeze(cache, types, text_off, binary_off))
74    }
75
76    fn freeze(cache: C, types: Vec<(u32, Type)>, text_off: u32, binary_off: u32) -> Self {
77        let mut matches =
78            Vec::with_capacity(types.iter().map(|(_, r#type)| r#type.matches.len()).sum());
79        let mut children =
80            Vec::with_capacity(types.iter().map(|(_, r#type)| r#type.children.len()).sum());
81
82        let types = types
83            .into_iter()
84            .map(|(mime_type_off, r#type)| {
85                let matches_start = u32::try_from(matches.len()).unwrap();
86                let matches_len = u32::try_from(r#type.matches.len()).unwrap();
87                let matches_end = matches_start.checked_add(matches_len).unwrap();
88                matches.extend(r#type.matches);
89
90                let children_start = u32::try_from(children.len()).unwrap();
91                let children_len = u32::try_from(r#type.children.len()).unwrap();
92                let children_end = children_start.checked_add(children_len).unwrap();
93                children.extend(r#type.children);
94
95                FrozenType {
96                    mime_type_off,
97                    matches: matches_start..matches_end,
98                    children: children_start..children_end,
99                }
100            })
101            .collect::<Box<[_]>>();
102
103        let mime_type_off_to_type_idx = types
104            .iter()
105            .enumerate()
106            .map(|(idx, r#type)| (r#type.mime_type_off, u32::try_from(idx).unwrap()))
107            .collect::<HashMap<_, _>>();
108
109        for type_idx in &mut children {
110            *type_idx = mime_type_off_to_type_idx[type_idx];
111        }
112
113        Self {
114            cache,
115            types,
116            matches: matches.into(),
117            children: children.into(),
118            text_idx: mime_type_off_to_type_idx[&text_off],
119            binary_idx: mime_type_off_to_type_idx[&binary_off],
120        }
121    }
122
123    pub fn r#match<'a>(&'a self, bytes: &[u8]) -> Result<&'a str, Error> {
124        let is_text = memchr(0, bytes).is_none();
125
126        let text_type = &self.types[self.text_idx as usize];
127        let binary_type = &self.types[self.binary_idx as usize];
128
129        let mime_type_off = if is_text
130            && let Some(mime_type_off) = self.match_children(None, &text_type.children, bytes)?
131        {
132            mime_type_off
133        } else if let Some(mime_type_off) =
134            self.match_children(None, &binary_type.children, bytes)?
135        {
136            mime_type_off
137        } else if is_text {
138            text_type.mime_type_off
139        } else {
140            binary_type.mime_type_off
141        };
142
143        resolve_mime_type(self.cache.as_ref(), mime_type_off).ok_or(Error::InvalidMimeType)
144    }
145
146    fn match_children(
147        &self,
148        default: Option<u32>,
149        children: &Range<u32>,
150        bytes: &[u8],
151    ) -> Result<Option<u32>, Error> {
152        let children = &self.children[children.start as usize..children.end as usize];
153
154        for type_idx in children {
155            let r#type = &self.types[*type_idx as usize];
156
157            let matches = &self.matches[r#type.matches.start as usize..r#type.matches.end as usize];
158
159            for r#match in matches {
160                if try_match(self.cache.as_ref(), 0, r#match.cnt, r#match.off, bytes)
161                    .ok_or(Error::MalformedMatch)?
162                {
163                    return self.match_children(
164                        Some(r#type.mime_type_off),
165                        &r#type.children,
166                        bytes,
167                    );
168                }
169            }
170        }
171
172        Ok(default)
173    }
174}
175
176fn try_match(cache: &[u8], depth: u8, cnt: u32, off: u32, bytes: &[u8]) -> Option<bool> {
177    if depth == 128 {
178        return None;
179    }
180
181    let cnt = cnt as usize;
182    let off = off as usize;
183
184    let matchlets = cache.get(off..)?.get(..32 * cnt)?;
185
186    for idx in 0..cnt {
187        let matchlet = &matchlets[32 * idx..][..32];
188
189        if try_matchlet(cache, matchlet, bytes)? {
190            let cnt = parse_u32(matchlet, 24);
191            let off = parse_u32(matchlet, 28);
192
193            if cnt == 0 || try_match(cache, depth + 1, cnt, off, bytes)? {
194                return Some(true);
195            }
196        }
197    }
198
199    Some(false)
200}
201
202#[inline]
203fn try_matchlet(cache: &[u8], matchlet: &[u8], bytes: &[u8]) -> Option<bool> {
204    let range_start = parse_u32(matchlet, 0) as usize;
205    let range_length = parse_u32(matchlet, 4) as usize;
206    let data_length = parse_u32(matchlet, 12) as usize;
207    let data_offset = parse_u32(matchlet, 16) as usize;
208    let mask_offset = parse_u32(matchlet, 20) as usize;
209
210    let data = cache.get(data_offset..)?.get(..data_length)?;
211
212    let mask = if mask_offset != 0 {
213        Some(cache.get(mask_offset..)?.get(..data_length)?)
214    } else {
215        None
216    };
217
218    for pos in range_start..range_start + range_length {
219        if let Some(bytes) = bytes.get(pos..pos + data_length) {
220            if let Some(mask) = mask {
221                if data
222                    .iter()
223                    .zip(mask)
224                    .zip(bytes)
225                    .all(|((data, mask), byte)| data & mask == byte & mask)
226                {
227                    return Some(true);
228                }
229            } else {
230                if data == bytes {
231                    return Some(true);
232                }
233            }
234        } else {
235            break;
236        }
237    }
238
239    Some(false)
240}
241
242fn sort(
243    mut types: HashMap<u32, Type>,
244    mut parents: HashMap<u32, Vec<u32>>,
245) -> Result<Vec<(u32, Type)>, Error> {
246    let mut sorted = Vec::with_capacity(types.len());
247
248    let mut orphans = types
249        .keys()
250        .copied()
251        .filter(|mime_type_off| !parents.contains_key(mime_type_off))
252        .collect::<VecDeque<_>>();
253
254    while let Some(orphan_mime_type_off) = orphans.pop_back() {
255        let orphan_type = types.remove(&orphan_mime_type_off).unwrap();
256
257        for child_mime_type_off in &orphan_type.children {
258            match parents.entry(*child_mime_type_off) {
259                Entry::Occupied(mut entry) => {
260                    let parents = entry.get_mut();
261
262                    parents.retain(|parent_mime_type_off| {
263                        *parent_mime_type_off != orphan_mime_type_off
264                    });
265
266                    if parents.is_empty() {
267                        entry.remove();
268
269                        orphans.push_front(*child_mime_type_off);
270                    }
271                }
272                Entry::Vacant(_) => unreachable!(),
273            }
274        }
275
276        sorted.push((orphan_mime_type_off, orphan_type));
277    }
278
279    if parents.is_empty() {
280        Ok(sorted)
281    } else {
282        Err(Error::MalformedParents)
283    }
284}
285
286fn parents_to_children(
287    cache: &[u8],
288    parents: &mut HashMap<u32, Vec<u32>>,
289    types: &mut HashMap<u32, Type>,
290) -> Result<(u32, u32), Error> {
291    types.retain(|mime_type_off, _type| resolve_mime_type(cache, *mime_type_off).is_some());
292
293    parents.retain(|mime_type_off, parents| {
294        if types.contains_key(mime_type_off) {
295            parents.retain(|parent_mime_type_off| types.contains_key(parent_mime_type_off));
296
297            !parents.is_empty()
298        } else {
299            false
300        }
301    });
302
303    let text_off =
304        find_or_insert_mime_type(cache, types, "text/plain\0").ok_or(Error::MissingTextPlain)?;
305
306    let binary_off = find_or_insert_mime_type(cache, types, "application/octet-stream\0")
307        .ok_or(Error::MissingTextPlain)?;
308
309    for mime_type_off in types.keys() {
310        if *mime_type_off != text_off
311            && *mime_type_off != binary_off
312            && let Entry::Vacant(entry) = parents.entry(*mime_type_off)
313        {
314            let mime_type = resolve_mime_type(cache, *mime_type_off).unwrap();
315
316            let parent_mime_type_off = if mime_type.starts_with("text/") {
317                text_off
318            } else {
319                binary_off
320            };
321
322            entry.insert(vec![parent_mime_type_off]);
323        }
324    }
325
326    for (mime_type_off, parents) in parents {
327        for parent_mime_type_off in parents {
328            types
329                .get_mut(parent_mime_type_off)
330                .unwrap()
331                .children
332                .push(*mime_type_off);
333        }
334    }
335
336    for r#type in types.values_mut() {
337        r#type.children.sort_unstable();
338        r#type.children.dedup();
339    }
340
341    Ok((text_off, binary_off))
342}
343
344fn find_or_insert_mime_type(
345    cache: &[u8],
346    types: &mut HashMap<u32, Type>,
347    mime_type0: &str,
348) -> Option<u32> {
349    let mime_type = &mime_type0[..mime_type0.len() - 1];
350
351    let mime_type_off = types
352        .keys()
353        .copied()
354        .find(|mime_type_off| resolve_mime_type(cache, *mime_type_off) == Some(mime_type));
355
356    mime_type_off.or_else(|| {
357        let mime_type_off = find(cache, mime_type0.as_bytes())?.try_into().ok()?;
358
359        types.insert(mime_type_off, Default::default());
360
361        Some(mime_type_off)
362    })
363}
364
365fn resolve_mime_type(cache: &[u8], mime_type_off: u32) -> Option<&str> {
366    let off = mime_type_off as usize;
367    let bytes = cache.get(off..)?;
368
369    let pos = memchr(0, bytes)?;
370    let mime_type = &bytes[..pos];
371
372    from_utf8(mime_type).ok()
373}
374
375fn parse_offsets(cache: &[u8]) -> Option<(&[u8], &[u8])> {
376    if cache.len() < 2 * 2 + 9 * 4 {
377        return None;
378    }
379
380    let parents_off = parse_u32(cache, 2 * 2 + 4) as usize;
381    let types_off = parse_u32(cache, 2 * 2 + 5 * 4) as usize;
382
383    let parents = cache.get(parents_off..)?;
384    let types = cache.get(types_off..)?;
385
386    Some((parents, types))
387}
388
389fn parse_parents<'a>(cache: &'a [u8], bytes: &'a [u8]) -> Option<HashMap<u32, Vec<u32>>> {
390    if bytes.len() < 4 {
391        return None;
392    }
393
394    let cnt = parse_u32(bytes, 0) as usize;
395
396    let bytes = bytes.get(4..)?.get(..8 * cnt)?;
397
398    let mut parents = HashMap::<_, Vec<_>>::with_capacity(cnt);
399
400    for idx in 0..cnt {
401        let bytes = &bytes[8 * idx..][..8];
402
403        let mime_type_off = parse_u32(bytes, 0);
404        let parents_off = parse_u32(bytes, 4) as usize;
405
406        let bytes = cache.get(parents_off..)?;
407
408        if bytes.len() < 4 {
409            return None;
410        }
411
412        let cnt = parse_u32(bytes, 0) as usize;
413
414        let bytes = bytes.get(4..)?.get(..4 * cnt)?;
415
416        let parents = parents.entry(mime_type_off).or_default();
417        parents.reserve(cnt);
418
419        for idx in 0..cnt {
420            let parent_mime_type_off = parse_u32(bytes, 4 * idx);
421
422            parents.push(parent_mime_type_off);
423        }
424    }
425
426    Some(parents)
427}
428
429fn parse_types(cache: &[u8], bytes: &[u8]) -> Option<HashMap<u32, Type>> {
430    if bytes.len() < 3 * 4 {
431        return None;
432    }
433
434    let cnt = parse_u32(bytes, 0) as usize;
435    let off = parse_u32(bytes, 8) as usize;
436
437    let bytes = cache.get(off..)?.get(..16 * cnt)?;
438
439    let mut types = HashMap::<_, Type>::with_capacity(cnt);
440
441    for idx in 0..cnt {
442        let bytes = &bytes[16 * idx..];
443
444        let mime_type_off = parse_u32(bytes, 4);
445
446        let cnt = parse_u32(bytes, 8);
447        let off = parse_u32(bytes, 12);
448
449        types
450            .entry(mime_type_off)
451            .or_default()
452            .matches
453            .push(Match { cnt, off });
454    }
455
456    Some(types)
457}
458
459fn parse_u32(bytes: &[u8], off: usize) -> u32 {
460    u32::from_be_bytes(bytes[off..][..4].try_into().unwrap())
461}
462
463#[derive(Debug)]
464pub enum Error {
465    MalformedOffsets,
466    MalformedParents,
467    MalformedTypes,
468    MalformedMatch,
469    MissingTextPlain,
470    MissingApplicationOctetStream,
471    InvalidMimeType,
472}
473
474impl fmt::Display for Error {
475    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
476        let msg = match self {
477            Self::MalformedOffsets => "Malformed offsets",
478            Self::MalformedParents => "Malformed parents",
479            Self::MalformedTypes => "Malformed types",
480            Self::MalformedMatch => "Malformed match",
481            Self::MissingTextPlain => "Missing text/plain MIME type",
482            Self::MissingApplicationOctetStream => "Missing application/octet-stream MIME type",
483            Self::InvalidMimeType => "Invalid MIME type",
484        };
485
486        fmt.write_str(msg)
487    }
488}
489
490impl StdError for Error {}