Skip to main content

tiny_tree_magic/
lib.rs

1use std::collections::{HashMap, hash_map::Entry};
2use std::error::Error as StdError;
3use std::fmt;
4use std::ops::Range;
5use std::str::from_utf8;
6
7use memchr::{memchr, memmem::find};
8
9pub struct Database<C> {
10    cache: C,
11    types: Box<[FrozenType]>,
12    matches: FrozenMatches,
13    children: FrozenChildren,
14    text_idx: u32,
15    binary_idx: u32,
16}
17
18#[derive(Debug, Default)]
19struct Type {
20    matches: Vec<Match>,
21    children: Vec<u32>,
22}
23
24#[derive(Debug)]
25struct Match {
26    cnt: u32,
27    off: u32,
28}
29
30#[derive(Debug)]
31struct FrozenType {
32    mime_type_off: u32,
33    matches: Range<u32>,
34    children: Range<u32>,
35}
36
37type FrozenMatches = Box<[Match]>;
38
39type FrozenChildren = Box<[u32]>;
40
41impl<C> Database<C>
42where
43    C: AsRef<[u8]>,
44{
45    /// Open the given MIME-info `cache`
46    ///
47    /// This can be an in-memory buffer or a memory map of a [mime.cache file](https://specifications.freedesktop.org/shared-mime-info/latest/ar01s02.html#id-1.3.12).
48    ///
49    /// ```no_run
50    /// # use std::error::Error;
51    /// use std::fs::File;
52    ///
53    /// use memmap2::Mmap;
54    /// use tiny_tree_magic::Database;
55    ///
56    /// let cache = unsafe { Mmap::map(&File::open("/usr/share/mime/mime.cache")?)? };
57    ///
58    /// let database = Database::open(cache)?;
59    /// # Ok::<_, Box<dyn Error>>(())
60    /// ```
61    #[cold]
62    pub fn open(cache: C) -> Result<Self, Error> {
63        let (parents, types) = parse_offsets(cache.as_ref()).ok_or(Error::MalformedOffsets)?;
64
65        let parents = parse_parents(cache.as_ref(), parents).ok_or(Error::MalformedParents)?;
66
67        let mut types = parse_types(cache.as_ref(), types).ok_or(Error::MalformedTypes)?;
68
69        let (text_off, binary_off) = parents_to_children(cache.as_ref(), parents, &mut types)?;
70
71        Ok(Self::freeze(cache, types, text_off, binary_off))
72    }
73
74    fn freeze(cache: C, types: HashMap<u32, Type>, text_off: u32, binary_off: u32) -> Self {
75        let mut matches = Vec::new();
76        let mut children = Vec::new();
77
78        let types = types
79            .into_iter()
80            .map(|(mime_type_off, r#type)| {
81                let matches_start = u32::try_from(matches.len()).unwrap();
82                let matches_end = matches_start + u32::try_from(r#type.matches.len()).unwrap();
83                matches.extend(r#type.matches);
84
85                let children_start = u32::try_from(children.len()).unwrap();
86                let children_end = children_start + u32::try_from(r#type.children.len()).unwrap();
87                children.extend(r#type.children);
88
89                FrozenType {
90                    mime_type_off,
91                    matches: matches_start..matches_end,
92                    children: children_start..children_end,
93                }
94            })
95            .collect::<Box<[_]>>();
96
97        let mime_type_off_to_type_idx = types
98            .iter()
99            .enumerate()
100            .map(|(idx, r#type)| (r#type.mime_type_off, u32::try_from(idx).unwrap()))
101            .collect::<HashMap<_, _>>();
102
103        for type_idx in &mut children {
104            *type_idx = mime_type_off_to_type_idx[type_idx];
105        }
106
107        Self {
108            cache,
109            types,
110            matches: matches.into(),
111            children: children.into(),
112            text_idx: mime_type_off_to_type_idx[&text_off],
113            binary_idx: mime_type_off_to_type_idx[&binary_off],
114        }
115    }
116
117    pub fn r#match<'a>(&'a self, bytes: &[u8]) -> Result<&'a str, Error> {
118        let is_text = memchr(0, bytes).is_none();
119
120        let text_type = &self.types[self.text_idx as usize];
121        let binary_type = &self.types[self.binary_idx as usize];
122
123        let mime_type_off = if is_text
124            && let Some(mime_type_off) = self.match_children(0, &text_type.children, bytes)?
125        {
126            mime_type_off
127        } else if let Some(mime_type_off) = self.match_children(0, &binary_type.children, bytes)? {
128            mime_type_off
129        } else if is_text {
130            text_type.mime_type_off
131        } else {
132            binary_type.mime_type_off
133        };
134
135        resolve_mime_type(self.cache.as_ref(), mime_type_off).ok_or(Error::InvalidMimeType)
136    }
137
138    fn match_children(
139        &self,
140        depth: u8,
141        children: &Range<u32>,
142        bytes: &[u8],
143    ) -> Result<Option<u32>, Error> {
144        if depth == 128 {
145            return Err(Error::MalformedParents);
146        }
147
148        let children = &self.children[children.start as usize..children.end as usize];
149
150        for type_idx in children {
151            let r#type = &self.types[*type_idx as usize];
152
153            let matches = &self.matches[r#type.matches.start as usize..r#type.matches.end as usize];
154
155            for r#match in matches {
156                if try_match(self.cache.as_ref(), 0, r#match.cnt, r#match.off, bytes)
157                    .ok_or(Error::MalformedMatch)?
158                {
159                    let mime_type_off = self
160                        .match_children(depth + 1, &r#type.children, bytes)?
161                        .unwrap_or(r#type.mime_type_off);
162
163                    return Ok(Some(mime_type_off));
164                }
165            }
166        }
167
168        Ok(None)
169    }
170}
171
172fn try_match(cache: &[u8], depth: u8, cnt: u32, off: u32, bytes: &[u8]) -> Option<bool> {
173    if depth == 128 {
174        return None;
175    }
176
177    let cnt = cnt as usize;
178    let off = off as usize;
179
180    let matchlets = cache.get(off..)?.get(..32 * cnt)?;
181
182    for idx in 0..cnt {
183        let matchlet = &matchlets[32 * idx..][..32];
184
185        if try_matchlet(cache, matchlet, bytes)? {
186            let cnt = parse_u32(matchlet, 24);
187            let off = parse_u32(matchlet, 28);
188
189            if cnt == 0 || try_match(cache, depth + 1, cnt, off, bytes)? {
190                return Some(true);
191            }
192        }
193    }
194
195    Some(false)
196}
197
198#[inline]
199fn try_matchlet(cache: &[u8], matchlet: &[u8], bytes: &[u8]) -> Option<bool> {
200    let range_start = parse_u32(matchlet, 0) as usize;
201    let range_length = parse_u32(matchlet, 4) as usize;
202    let data_length = parse_u32(matchlet, 12) as usize;
203    let data_offset = parse_u32(matchlet, 16) as usize;
204    let mask_offset = parse_u32(matchlet, 20) as usize;
205
206    let data = cache.get(data_offset..)?.get(..data_length)?;
207
208    let mask = if mask_offset != 0 {
209        Some(cache.get(mask_offset..)?.get(..data_length)?)
210    } else {
211        None
212    };
213
214    for pos in range_start..range_start + range_length {
215        if let Some(bytes) = bytes.get(pos..pos + data_length) {
216            if let Some(mask) = mask {
217                if data
218                    .iter()
219                    .zip(mask)
220                    .zip(bytes)
221                    .all(|((data, mask), byte)| data & mask == byte & mask)
222                {
223                    return Some(true);
224                }
225            } else {
226                if data == bytes {
227                    return Some(true);
228                }
229            }
230        } else {
231            break;
232        }
233    }
234
235    Some(false)
236}
237
238fn parents_to_children(
239    cache: &[u8],
240    mut parents: HashMap<u32, Vec<u32>>,
241    types: &mut HashMap<u32, Type>,
242) -> Result<(u32, u32), Error> {
243    types.retain(|mime_type_off, _type| resolve_mime_type(cache, *mime_type_off).is_some());
244
245    parents.retain(|mime_type_off, parents| {
246        if types.contains_key(mime_type_off) {
247            parents.retain(|parent_mime_type_off| types.contains_key(parent_mime_type_off));
248
249            !parents.is_empty()
250        } else {
251            false
252        }
253    });
254
255    let text_off =
256        find_or_insert_mime_type(cache, types, "text/plain\0").ok_or(Error::MissingTextPlain)?;
257
258    let binary_off = find_or_insert_mime_type(cache, types, "application/octet-stream\0")
259        .ok_or(Error::MissingTextPlain)?;
260
261    for mime_type_off in types.keys() {
262        if *mime_type_off != text_off
263            && *mime_type_off != binary_off
264            && let Entry::Vacant(entry) = parents.entry(*mime_type_off)
265        {
266            let mime_type = resolve_mime_type(cache, *mime_type_off).unwrap();
267
268            let parent_mime_type_off = if mime_type.starts_with("text/") {
269                text_off
270            } else {
271                binary_off
272            };
273
274            entry.insert(vec![parent_mime_type_off]);
275        }
276    }
277
278    for (mime_type_off, parents) in &parents {
279        for parent_mime_type_off in parents {
280            types
281                .get_mut(parent_mime_type_off)
282                .unwrap()
283                .children
284                .push(*mime_type_off);
285        }
286    }
287
288    for r#type in types.values_mut() {
289        r#type.children.sort_unstable();
290        r#type.children.dedup();
291    }
292
293    Ok((text_off, binary_off))
294}
295
296fn find_or_insert_mime_type(
297    cache: &[u8],
298    types: &mut HashMap<u32, Type>,
299    mime_type0: &str,
300) -> Option<u32> {
301    let mime_type = &mime_type0[..mime_type0.len() - 1];
302
303    let mime_type_off = types
304        .keys()
305        .copied()
306        .find(|mime_type_off| resolve_mime_type(cache, *mime_type_off) == Some(mime_type));
307
308    mime_type_off.or_else(|| {
309        let mime_type_off = find(cache, mime_type0.as_bytes())?.try_into().ok()?;
310
311        types.insert(mime_type_off, Default::default());
312
313        Some(mime_type_off)
314    })
315}
316
317fn resolve_mime_type(cache: &[u8], mime_type_off: u32) -> Option<&str> {
318    let off = mime_type_off as usize;
319    let bytes = cache.get(off..)?;
320
321    let pos = memchr(0, bytes)?;
322    let mime_type = &bytes[..pos];
323
324    from_utf8(mime_type).ok()
325}
326
327fn parse_offsets(cache: &[u8]) -> Option<(&[u8], &[u8])> {
328    if cache.len() < 2 * 2 + 9 * 4 {
329        return None;
330    }
331
332    let parents_off = parse_u32(cache, 2 * 2 + 4) as usize;
333    let types_off = parse_u32(cache, 2 * 2 + 5 * 4) as usize;
334
335    let parents = cache.get(parents_off..)?;
336    let types = cache.get(types_off..)?;
337
338    Some((parents, types))
339}
340
341fn parse_parents<'a>(cache: &'a [u8], bytes: &'a [u8]) -> Option<HashMap<u32, Vec<u32>>> {
342    if bytes.len() < 4 {
343        return None;
344    }
345
346    let cnt = parse_u32(bytes, 0) as usize;
347
348    let bytes = bytes.get(4..)?.get(..8 * cnt)?;
349
350    let mut parents = HashMap::<_, Vec<_>>::with_capacity(cnt);
351
352    for idx in 0..cnt {
353        let bytes = &bytes[8 * idx..][..8];
354
355        let mime_type_off = parse_u32(bytes, 0);
356        let parents_off = parse_u32(bytes, 4) as usize;
357
358        let bytes = cache.get(parents_off..)?;
359
360        if bytes.len() < 4 {
361            return None;
362        }
363
364        let cnt = parse_u32(bytes, 0) as usize;
365
366        let bytes = bytes.get(4..)?.get(..4 * cnt)?;
367
368        let parents = parents.entry(mime_type_off).or_default();
369        parents.reserve(cnt);
370
371        for idx in 0..cnt {
372            let parent_mime_type_off = parse_u32(bytes, 4 * idx);
373
374            parents.push(parent_mime_type_off);
375        }
376    }
377
378    Some(parents)
379}
380
381fn parse_types(cache: &[u8], bytes: &[u8]) -> Option<HashMap<u32, Type>> {
382    if bytes.len() < 3 * 4 {
383        return None;
384    }
385
386    let cnt = parse_u32(bytes, 0) as usize;
387    let off = parse_u32(bytes, 8) as usize;
388
389    let bytes = cache.get(off..)?.get(..16 * cnt)?;
390
391    let mut types = HashMap::<_, Type>::with_capacity(cnt);
392
393    for idx in 0..cnt {
394        let bytes = &bytes[16 * idx..];
395
396        let mime_type_off = parse_u32(bytes, 4);
397
398        let cnt = parse_u32(bytes, 8);
399        let off = parse_u32(bytes, 12);
400
401        types
402            .entry(mime_type_off)
403            .or_default()
404            .matches
405            .push(Match { cnt, off });
406    }
407
408    Some(types)
409}
410
411fn parse_u32(bytes: &[u8], off: usize) -> u32 {
412    u32::from_be_bytes(bytes[off..][..4].try_into().unwrap())
413}
414
415#[derive(Debug)]
416pub enum Error {
417    MalformedOffsets,
418    MalformedParents,
419    MalformedTypes,
420    MalformedMatch,
421    MissingTextPlain,
422    MissingApplicationOctetStream,
423    InvalidMimeType,
424}
425
426impl fmt::Display for Error {
427    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
428        let msg = match self {
429            Self::MalformedOffsets => "Malformed offsets",
430            Self::MalformedParents => "Malformed parents",
431            Self::MalformedTypes => "Malformed types",
432            Self::MalformedMatch => "Malformed match",
433            Self::MissingTextPlain => "Missing text/plain MIME type",
434            Self::MissingApplicationOctetStream => "Missing application/octet-stream MIME type",
435            Self::InvalidMimeType => "Invalid MIME type",
436        };
437
438        fmt.write_str(msg)
439    }
440}
441
442impl StdError for Error {}