Skip to main content

tiny_tree_magic/
lib.rs

1use std::collections::HashMap;
2use std::error::Error as StdError;
3use std::fmt;
4use std::ops::Range;
5use std::str::from_utf8;
6
7use memchr::{memchr, memmem::find};
8
9pub struct Database<C> {
10    cache: C,
11    types: Box<[FrozenType]>,
12    matches: FrozenMatches,
13    children: FrozenChildren,
14    text_idx: u32,
15    binary_idx: u32,
16}
17
18#[derive(Debug, Default)]
19struct Type {
20    matches: Vec<Match>,
21    children: Vec<u32>,
22}
23
24#[derive(Debug)]
25struct Match {
26    cnt: u32,
27    off: u32,
28}
29
30#[derive(Debug)]
31struct FrozenType {
32    mime_type_off: u32,
33    matches: Range<u32>,
34    children: Range<u32>,
35}
36
37type FrozenMatches = Box<[Match]>;
38
39type FrozenChildren = Box<[u32]>;
40
41impl<C> Database<C>
42where
43    C: AsRef<[u8]>,
44{
45    /// Open the given MIME-info `cache`
46    ///
47    /// This can be an in-memory buffer or a memory map of a [mime.cache file](https://specifications.freedesktop.org/shared-mime-info/latest/ar01s02.html#id-1.3.12).
48    ///
49    /// ```no_run
50    /// # use std::error::Error;
51    /// use std::fs::File;
52    ///
53    /// use memmap2::Mmap;
54    /// use tiny_tree_magic::Database;
55    ///
56    /// let cache = unsafe { Mmap::map(&File::open("/usr/share/mime/mime.cache")?)? };
57    ///
58    /// let database = Database::open(cache)?;
59    /// # Ok::<_, Box<dyn Error>>(())
60    /// ```
61    pub fn open(cache: C) -> Result<Self, Error> {
62        let (parents, types) = parse_offsets(cache.as_ref()).ok_or(Error::MalformedOffsets)?;
63
64        let mut parents = parse_parents(cache.as_ref(), parents).ok_or(Error::MalformedParents)?;
65
66        let mut types = parse_types(cache.as_ref(), types).ok_or(Error::MalformedTypes)?;
67
68        let (text_off, binary_off) = add_parents(cache.as_ref(), &mut parents, &mut types)?;
69
70        add_children(&parents, &mut types);
71
72        Ok(Self::freeze(cache, types, text_off, binary_off))
73    }
74
75    fn freeze(cache: C, types: HashMap<u32, Type>, text_off: u32, binary_off: u32) -> Self {
76        let mut matches = Vec::new();
77        let mut children = Vec::new();
78
79        let types = types
80            .into_iter()
81            .map(|(mime_type_off, r#type)| {
82                let matches_start = u32::try_from(matches.len()).unwrap();
83                let matches_end = matches_start + u32::try_from(r#type.matches.len()).unwrap();
84                matches.extend(r#type.matches);
85
86                let children_start = u32::try_from(children.len()).unwrap();
87                let children_end = children_start + u32::try_from(r#type.children.len()).unwrap();
88                children.extend(r#type.children);
89
90                FrozenType {
91                    mime_type_off,
92                    matches: matches_start..matches_end,
93                    children: children_start..children_end,
94                }
95            })
96            .collect::<Box<[_]>>();
97
98        let mime_type_off_to_type_idx = types
99            .iter()
100            .enumerate()
101            .map(|(idx, r#type)| (r#type.mime_type_off, u32::try_from(idx).unwrap()))
102            .collect::<HashMap<_, _>>();
103
104        for type_idx in &mut children {
105            *type_idx = mime_type_off_to_type_idx[type_idx];
106        }
107
108        Self {
109            cache,
110            types,
111            matches: matches.into(),
112            children: children.into(),
113            text_idx: mime_type_off_to_type_idx[&text_off],
114            binary_idx: mime_type_off_to_type_idx[&binary_off],
115        }
116    }
117
118    pub fn r#match<'a>(&'a self, bytes: &[u8]) -> Result<&'a str, Error> {
119        let is_text = memchr(0, bytes).is_none();
120
121        let text_type = &self.types[self.text_idx as usize];
122        let binary_type = &self.types[self.binary_idx as usize];
123
124        let mime_type_off = if is_text
125            && let Some(mime_type_off) = self.match_children(0, &text_type.children, bytes)?
126        {
127            mime_type_off
128        } else if let Some(mime_type_off) = self.match_children(0, &binary_type.children, bytes)? {
129            mime_type_off
130        } else if is_text {
131            text_type.mime_type_off
132        } else {
133            binary_type.mime_type_off
134        };
135
136        resolve_mime_type(self.cache.as_ref(), mime_type_off).ok_or(Error::InvalidMimeType)
137    }
138
139    fn match_children(
140        &self,
141        depth: u8,
142        children: &Range<u32>,
143        bytes: &[u8],
144    ) -> Result<Option<u32>, Error> {
145        if depth == 128 {
146            return Err(Error::MalformedParents);
147        }
148
149        let children = &self.children[children.start as usize..children.end as usize];
150
151        for type_idx in children {
152            let r#type = &self.types[*type_idx as usize];
153
154            let matches = &self.matches[r#type.matches.start as usize..r#type.matches.end as usize];
155
156            for r#match in matches {
157                if try_match(self.cache.as_ref(), 0, r#match.cnt, r#match.off, bytes)
158                    .ok_or(Error::MalformedMatch)?
159                {
160                    let mime_type_off = self
161                        .match_children(depth + 1, &r#type.children, bytes)?
162                        .unwrap_or(r#type.mime_type_off);
163
164                    return Ok(Some(mime_type_off));
165                }
166            }
167        }
168
169        Ok(None)
170    }
171}
172
173fn try_match(cache: &[u8], depth: u8, cnt: u32, off: u32, bytes: &[u8]) -> Option<bool> {
174    if depth == 128 {
175        return None;
176    }
177
178    let cnt = cnt as usize;
179    let off = off as usize;
180
181    let matchlets = cache.get(off..)?.get(..32 * cnt)?;
182
183    for idx in 0..cnt {
184        let matchlet = &matchlets[32 * idx..][..32];
185
186        if try_matchlet(cache, matchlet, bytes)? {
187            let cnt = parse_u32(matchlet, 24);
188            let off = parse_u32(matchlet, 28);
189
190            if cnt == 0 || try_match(cache, depth + 1, cnt, off, bytes)? {
191                return Some(true);
192            }
193        }
194    }
195
196    Some(false)
197}
198
199#[inline]
200fn try_matchlet(cache: &[u8], matchlet: &[u8], bytes: &[u8]) -> Option<bool> {
201    let range_start = parse_u32(matchlet, 0) as usize;
202    let range_length = parse_u32(matchlet, 4) as usize;
203    let data_length = parse_u32(matchlet, 12) as usize;
204    let data_offset = parse_u32(matchlet, 16) as usize;
205    let mask_offset = parse_u32(matchlet, 20) as usize;
206
207    let data = cache.get(data_offset..)?.get(..data_length)?;
208
209    let mask = if mask_offset != 0 {
210        Some(cache.get(mask_offset..)?.get(..data_length)?)
211    } else {
212        None
213    };
214
215    for pos in range_start..range_start + range_length {
216        if let Some(bytes) = bytes.get(pos..pos + data_length) {
217            if let Some(mask) = mask {
218                if data
219                    .iter()
220                    .zip(mask)
221                    .zip(bytes)
222                    .all(|((data, mask), byte)| data & mask == byte & mask)
223                {
224                    return Some(true);
225                }
226            } else {
227                if data == bytes {
228                    return Some(true);
229                }
230            }
231        } else {
232            break;
233        }
234    }
235
236    Some(false)
237}
238
239fn add_parents(
240    cache: &[u8],
241    parents: &mut HashMap<u32, Vec<u32>>,
242    types: &mut HashMap<u32, Type>,
243) -> Result<(u32, u32), Error> {
244    let text_off =
245        find_or_insert_mime_type(cache, types, "text/plain\0").ok_or(Error::MissingTextPlain)?;
246
247    let binary_off = find_or_insert_mime_type(cache, types, "application/octet-stream\0")
248        .ok_or(Error::MissingTextPlain)?;
249
250    types.retain(|mime_type_off, _type| {
251        if *mime_type_off == text_off || *mime_type_off == binary_off {
252            true
253        } else if let Some(mime_type) = resolve_mime_type(cache, *mime_type_off) {
254            let parent_mime_type_off = if mime_type.starts_with("text/") {
255                text_off
256            } else {
257                binary_off
258            };
259
260            parents
261                .entry(*mime_type_off)
262                .or_default()
263                .push(parent_mime_type_off);
264
265            true
266        } else {
267            false
268        }
269    });
270
271    Ok((text_off, binary_off))
272}
273
274fn find_or_insert_mime_type(
275    cache: &[u8],
276    types: &mut HashMap<u32, Type>,
277    mime_type0: &str,
278) -> Option<u32> {
279    let mime_type = &mime_type0[..mime_type0.len() - 1];
280
281    let mime_type_off = types
282        .keys()
283        .copied()
284        .find(|mime_type_off| resolve_mime_type(cache, *mime_type_off) == Some(mime_type));
285
286    mime_type_off.or_else(|| {
287        let mime_type_off = find(cache, mime_type0.as_bytes())?.try_into().ok()?;
288
289        types.insert(mime_type_off, Default::default());
290
291        Some(mime_type_off)
292    })
293}
294
295fn add_children(parents: &HashMap<u32, Vec<u32>>, types: &mut HashMap<u32, Type>) {
296    for (mime_type_off, parents) in parents {
297        if types.contains_key(mime_type_off) {
298            for parent_mime_type_off in parents {
299                if let Some(parent_type) = types.get_mut(parent_mime_type_off) {
300                    parent_type.children.push(*mime_type_off);
301                }
302            }
303        }
304    }
305
306    for r#type in types.values_mut() {
307        r#type.children.sort_unstable();
308        r#type.children.dedup();
309    }
310}
311
312fn resolve_mime_type(cache: &[u8], mime_type_off: u32) -> Option<&str> {
313    let off = mime_type_off as usize;
314    let bytes = cache.get(off..)?;
315
316    let pos = memchr(0, bytes)?;
317    let mime_type = &bytes[..pos];
318
319    from_utf8(mime_type).ok()
320}
321
322fn parse_offsets(cache: &[u8]) -> Option<(&[u8], &[u8])> {
323    if cache.len() < 2 * 2 + 9 * 4 {
324        return None;
325    }
326
327    let parents_off = parse_u32(cache, 2 * 2 + 4) as usize;
328    let types_off = parse_u32(cache, 2 * 2 + 5 * 4) as usize;
329
330    let parents = cache.get(parents_off..)?;
331    let types = cache.get(types_off..)?;
332
333    Some((parents, types))
334}
335
336fn parse_parents<'a>(cache: &'a [u8], bytes: &'a [u8]) -> Option<HashMap<u32, Vec<u32>>> {
337    if bytes.len() < 4 {
338        return None;
339    }
340
341    let cnt = parse_u32(bytes, 0) as usize;
342
343    let bytes = bytes.get(4..)?.get(..8 * cnt)?;
344
345    let mut parents = HashMap::<_, Vec<_>>::with_capacity(cnt);
346
347    for idx in 0..cnt {
348        let bytes = &bytes[8 * idx..][..8];
349
350        let mime_type_off = parse_u32(bytes, 0);
351        let parents_off = parse_u32(bytes, 4) as usize;
352
353        let bytes = cache.get(parents_off..)?;
354
355        if bytes.len() < 4 {
356            return None;
357        }
358
359        let cnt = parse_u32(bytes, 0) as usize;
360
361        let bytes = bytes.get(4..)?.get(..4 * cnt)?;
362
363        let parents = parents.entry(mime_type_off).or_default();
364        parents.reserve(cnt);
365
366        for idx in 0..cnt {
367            let parent_mime_type_off = parse_u32(bytes, 4 * idx);
368
369            parents.push(parent_mime_type_off);
370        }
371    }
372
373    Some(parents)
374}
375
376fn parse_types(cache: &[u8], bytes: &[u8]) -> Option<HashMap<u32, Type>> {
377    if bytes.len() < 3 * 4 {
378        return None;
379    }
380
381    let cnt = parse_u32(bytes, 0) as usize;
382    let off = parse_u32(bytes, 8) as usize;
383
384    let bytes = cache.get(off..)?.get(..16 * cnt)?;
385
386    let mut types = HashMap::<_, Type>::with_capacity(cnt);
387
388    for idx in 0..cnt {
389        let bytes = &bytes[16 * idx..];
390
391        let mime_type_off = parse_u32(bytes, 4);
392
393        let cnt = parse_u32(bytes, 8);
394        let off = parse_u32(bytes, 12);
395
396        types
397            .entry(mime_type_off)
398            .or_default()
399            .matches
400            .push(Match { cnt, off });
401    }
402
403    Some(types)
404}
405
406fn parse_u32(bytes: &[u8], off: usize) -> u32 {
407    u32::from_be_bytes(bytes[off..][..4].try_into().unwrap())
408}
409
410#[derive(Debug)]
411pub enum Error {
412    MalformedOffsets,
413    MalformedParents,
414    MalformedTypes,
415    MalformedMatch,
416    MissingTextPlain,
417    MissingApplicationOctetStream,
418    InvalidMimeType,
419}
420
421impl fmt::Display for Error {
422    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
423        let msg = match self {
424            Self::MalformedOffsets => "Malformed offsets",
425            Self::MalformedParents => "Malformed parents",
426            Self::MalformedTypes => "Malformed types",
427            Self::MalformedMatch => "Malformed match",
428            Self::MissingTextPlain => "Missing text/plain MIME type",
429            Self::MissingApplicationOctetStream => "Missing application/octet-stream MIME type",
430            Self::InvalidMimeType => "Invalid MIME type",
431        };
432
433        fmt.write_str(msg)
434    }
435}
436
437impl StdError for Error {}