1use std::collections::{HashMap, hash_map::Entry};
2use std::error::Error as StdError;
3use std::fmt;
4use std::ops::Range;
5use std::str::from_utf8;
6
7use memchr::{memchr, memmem::find};
8
9pub struct Database<C> {
10 cache: C,
11 types: Box<[FrozenType]>,
12 matches: FrozenMatches,
13 children: FrozenChildren,
14 text_idx: u32,
15 binary_idx: u32,
16}
17
18#[derive(Debug, Default)]
19struct Type {
20 matches: Vec<Match>,
21 children: Vec<u32>,
22}
23
24#[derive(Debug)]
25struct Match {
26 cnt: u32,
27 off: u32,
28}
29
30#[derive(Debug)]
31struct FrozenType {
32 mime_type_off: u32,
33 matches: Range<u32>,
34 children: Range<u32>,
35}
36
37type FrozenMatches = Box<[Match]>;
38
39type FrozenChildren = Box<[u32]>;
40
41impl<C> Database<C>
42where
43 C: AsRef<[u8]>,
44{
45 #[cold]
62 pub fn open(cache: C) -> Result<Self, Error> {
63 let (parents, types) = parse_offsets(cache.as_ref()).ok_or(Error::MalformedOffsets)?;
64
65 let parents = parse_parents(cache.as_ref(), parents).ok_or(Error::MalformedParents)?;
66
67 let mut types = parse_types(cache.as_ref(), types).ok_or(Error::MalformedTypes)?;
68
69 let (text_off, binary_off) = parents_to_children(cache.as_ref(), parents, &mut types)?;
70
71 Ok(Self::freeze(cache, types, text_off, binary_off))
72 }
73
74 fn freeze(cache: C, types: HashMap<u32, Type>, text_off: u32, binary_off: u32) -> Self {
75 let mut matches = Vec::new();
76 let mut children = Vec::new();
77
78 let types = types
79 .into_iter()
80 .map(|(mime_type_off, r#type)| {
81 let matches_start = u32::try_from(matches.len()).unwrap();
82 let matches_end = matches_start + u32::try_from(r#type.matches.len()).unwrap();
83 matches.extend(r#type.matches);
84
85 let children_start = u32::try_from(children.len()).unwrap();
86 let children_end = children_start + u32::try_from(r#type.children.len()).unwrap();
87 children.extend(r#type.children);
88
89 FrozenType {
90 mime_type_off,
91 matches: matches_start..matches_end,
92 children: children_start..children_end,
93 }
94 })
95 .collect::<Box<[_]>>();
96
97 let mime_type_off_to_type_idx = types
98 .iter()
99 .enumerate()
100 .map(|(idx, r#type)| (r#type.mime_type_off, u32::try_from(idx).unwrap()))
101 .collect::<HashMap<_, _>>();
102
103 for type_idx in &mut children {
104 *type_idx = mime_type_off_to_type_idx[type_idx];
105 }
106
107 Self {
108 cache,
109 types,
110 matches: matches.into(),
111 children: children.into(),
112 text_idx: mime_type_off_to_type_idx[&text_off],
113 binary_idx: mime_type_off_to_type_idx[&binary_off],
114 }
115 }
116
117 pub fn r#match<'a>(&'a self, bytes: &[u8]) -> Result<&'a str, Error> {
118 let is_text = memchr(0, bytes).is_none();
119
120 let text_type = &self.types[self.text_idx as usize];
121 let binary_type = &self.types[self.binary_idx as usize];
122
123 let mime_type_off = if is_text
124 && let Some(mime_type_off) = self.match_children(0, None, &text_type.children, bytes)?
125 {
126 mime_type_off
127 } else if let Some(mime_type_off) =
128 self.match_children(0, None, &binary_type.children, bytes)?
129 {
130 mime_type_off
131 } else if is_text {
132 text_type.mime_type_off
133 } else {
134 binary_type.mime_type_off
135 };
136
137 resolve_mime_type(self.cache.as_ref(), mime_type_off).ok_or(Error::InvalidMimeType)
138 }
139
140 fn match_children(
141 &self,
142 depth: u8,
143 default: Option<u32>,
144 children: &Range<u32>,
145 bytes: &[u8],
146 ) -> Result<Option<u32>, Error> {
147 if depth == 128 {
148 return Err(Error::MalformedParents);
149 }
150
151 let children = &self.children[children.start as usize..children.end as usize];
152
153 for type_idx in children {
154 let r#type = &self.types[*type_idx as usize];
155
156 let matches = &self.matches[r#type.matches.start as usize..r#type.matches.end as usize];
157
158 for r#match in matches {
159 if try_match(self.cache.as_ref(), 0, r#match.cnt, r#match.off, bytes)
160 .ok_or(Error::MalformedMatch)?
161 {
162 return self.match_children(
163 depth + 1,
164 Some(r#type.mime_type_off),
165 &r#type.children,
166 bytes,
167 );
168 }
169 }
170 }
171
172 Ok(default)
173 }
174}
175
176fn try_match(cache: &[u8], depth: u8, cnt: u32, off: u32, bytes: &[u8]) -> Option<bool> {
177 if depth == 128 {
178 return None;
179 }
180
181 let cnt = cnt as usize;
182 let off = off as usize;
183
184 let matchlets = cache.get(off..)?.get(..32 * cnt)?;
185
186 for idx in 0..cnt {
187 let matchlet = &matchlets[32 * idx..][..32];
188
189 if try_matchlet(cache, matchlet, bytes)? {
190 let cnt = parse_u32(matchlet, 24);
191 let off = parse_u32(matchlet, 28);
192
193 if cnt == 0 || try_match(cache, depth + 1, cnt, off, bytes)? {
194 return Some(true);
195 }
196 }
197 }
198
199 Some(false)
200}
201
202#[inline]
203fn try_matchlet(cache: &[u8], matchlet: &[u8], bytes: &[u8]) -> Option<bool> {
204 let range_start = parse_u32(matchlet, 0) as usize;
205 let range_length = parse_u32(matchlet, 4) as usize;
206 let data_length = parse_u32(matchlet, 12) as usize;
207 let data_offset = parse_u32(matchlet, 16) as usize;
208 let mask_offset = parse_u32(matchlet, 20) as usize;
209
210 let data = cache.get(data_offset..)?.get(..data_length)?;
211
212 let mask = if mask_offset != 0 {
213 Some(cache.get(mask_offset..)?.get(..data_length)?)
214 } else {
215 None
216 };
217
218 for pos in range_start..range_start + range_length {
219 if let Some(bytes) = bytes.get(pos..pos + data_length) {
220 if let Some(mask) = mask {
221 if data
222 .iter()
223 .zip(mask)
224 .zip(bytes)
225 .all(|((data, mask), byte)| data & mask == byte & mask)
226 {
227 return Some(true);
228 }
229 } else {
230 if data == bytes {
231 return Some(true);
232 }
233 }
234 } else {
235 break;
236 }
237 }
238
239 Some(false)
240}
241
242fn parents_to_children(
243 cache: &[u8],
244 mut parents: HashMap<u32, Vec<u32>>,
245 types: &mut HashMap<u32, Type>,
246) -> Result<(u32, u32), Error> {
247 types.retain(|mime_type_off, _type| resolve_mime_type(cache, *mime_type_off).is_some());
248
249 parents.retain(|mime_type_off, parents| {
250 if types.contains_key(mime_type_off) {
251 parents.retain(|parent_mime_type_off| types.contains_key(parent_mime_type_off));
252
253 !parents.is_empty()
254 } else {
255 false
256 }
257 });
258
259 let text_off =
260 find_or_insert_mime_type(cache, types, "text/plain\0").ok_or(Error::MissingTextPlain)?;
261
262 let binary_off = find_or_insert_mime_type(cache, types, "application/octet-stream\0")
263 .ok_or(Error::MissingTextPlain)?;
264
265 for mime_type_off in types.keys() {
266 if *mime_type_off != text_off
267 && *mime_type_off != binary_off
268 && let Entry::Vacant(entry) = parents.entry(*mime_type_off)
269 {
270 let mime_type = resolve_mime_type(cache, *mime_type_off).unwrap();
271
272 let parent_mime_type_off = if mime_type.starts_with("text/") {
273 text_off
274 } else {
275 binary_off
276 };
277
278 entry.insert(vec![parent_mime_type_off]);
279 }
280 }
281
282 for (mime_type_off, parents) in &parents {
283 for parent_mime_type_off in parents {
284 types
285 .get_mut(parent_mime_type_off)
286 .unwrap()
287 .children
288 .push(*mime_type_off);
289 }
290 }
291
292 for r#type in types.values_mut() {
293 r#type.children.sort_unstable();
294 r#type.children.dedup();
295 }
296
297 Ok((text_off, binary_off))
298}
299
300fn find_or_insert_mime_type(
301 cache: &[u8],
302 types: &mut HashMap<u32, Type>,
303 mime_type0: &str,
304) -> Option<u32> {
305 let mime_type = &mime_type0[..mime_type0.len() - 1];
306
307 let mime_type_off = types
308 .keys()
309 .copied()
310 .find(|mime_type_off| resolve_mime_type(cache, *mime_type_off) == Some(mime_type));
311
312 mime_type_off.or_else(|| {
313 let mime_type_off = find(cache, mime_type0.as_bytes())?.try_into().ok()?;
314
315 types.insert(mime_type_off, Default::default());
316
317 Some(mime_type_off)
318 })
319}
320
321fn resolve_mime_type(cache: &[u8], mime_type_off: u32) -> Option<&str> {
322 let off = mime_type_off as usize;
323 let bytes = cache.get(off..)?;
324
325 let pos = memchr(0, bytes)?;
326 let mime_type = &bytes[..pos];
327
328 from_utf8(mime_type).ok()
329}
330
331fn parse_offsets(cache: &[u8]) -> Option<(&[u8], &[u8])> {
332 if cache.len() < 2 * 2 + 9 * 4 {
333 return None;
334 }
335
336 let parents_off = parse_u32(cache, 2 * 2 + 4) as usize;
337 let types_off = parse_u32(cache, 2 * 2 + 5 * 4) as usize;
338
339 let parents = cache.get(parents_off..)?;
340 let types = cache.get(types_off..)?;
341
342 Some((parents, types))
343}
344
345fn parse_parents<'a>(cache: &'a [u8], bytes: &'a [u8]) -> Option<HashMap<u32, Vec<u32>>> {
346 if bytes.len() < 4 {
347 return None;
348 }
349
350 let cnt = parse_u32(bytes, 0) as usize;
351
352 let bytes = bytes.get(4..)?.get(..8 * cnt)?;
353
354 let mut parents = HashMap::<_, Vec<_>>::with_capacity(cnt);
355
356 for idx in 0..cnt {
357 let bytes = &bytes[8 * idx..][..8];
358
359 let mime_type_off = parse_u32(bytes, 0);
360 let parents_off = parse_u32(bytes, 4) as usize;
361
362 let bytes = cache.get(parents_off..)?;
363
364 if bytes.len() < 4 {
365 return None;
366 }
367
368 let cnt = parse_u32(bytes, 0) as usize;
369
370 let bytes = bytes.get(4..)?.get(..4 * cnt)?;
371
372 let parents = parents.entry(mime_type_off).or_default();
373 parents.reserve(cnt);
374
375 for idx in 0..cnt {
376 let parent_mime_type_off = parse_u32(bytes, 4 * idx);
377
378 parents.push(parent_mime_type_off);
379 }
380 }
381
382 Some(parents)
383}
384
385fn parse_types(cache: &[u8], bytes: &[u8]) -> Option<HashMap<u32, Type>> {
386 if bytes.len() < 3 * 4 {
387 return None;
388 }
389
390 let cnt = parse_u32(bytes, 0) as usize;
391 let off = parse_u32(bytes, 8) as usize;
392
393 let bytes = cache.get(off..)?.get(..16 * cnt)?;
394
395 let mut types = HashMap::<_, Type>::with_capacity(cnt);
396
397 for idx in 0..cnt {
398 let bytes = &bytes[16 * idx..];
399
400 let mime_type_off = parse_u32(bytes, 4);
401
402 let cnt = parse_u32(bytes, 8);
403 let off = parse_u32(bytes, 12);
404
405 types
406 .entry(mime_type_off)
407 .or_default()
408 .matches
409 .push(Match { cnt, off });
410 }
411
412 Some(types)
413}
414
415fn parse_u32(bytes: &[u8], off: usize) -> u32 {
416 u32::from_be_bytes(bytes[off..][..4].try_into().unwrap())
417}
418
419#[derive(Debug)]
420pub enum Error {
421 MalformedOffsets,
422 MalformedParents,
423 MalformedTypes,
424 MalformedMatch,
425 MissingTextPlain,
426 MissingApplicationOctetStream,
427 InvalidMimeType,
428}
429
430impl fmt::Display for Error {
431 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
432 let msg = match self {
433 Self::MalformedOffsets => "Malformed offsets",
434 Self::MalformedParents => "Malformed parents",
435 Self::MalformedTypes => "Malformed types",
436 Self::MalformedMatch => "Malformed match",
437 Self::MissingTextPlain => "Missing text/plain MIME type",
438 Self::MissingApplicationOctetStream => "Missing application/octet-stream MIME type",
439 Self::InvalidMimeType => "Invalid MIME type",
440 };
441
442 fmt.write_str(msg)
443 }
444}
445
446impl StdError for Error {}