1use std::collections::{HashMap, VecDeque, hash_map::Entry};
2use std::error::Error as StdError;
3use std::fmt;
4use std::ops::Range;
5use std::str::from_utf8;
6
7use memchr::{memchr, memmem::find};
8
9pub struct Database<C> {
10 cache: C,
11 types: Box<[FrozenType]>,
12 matches: FrozenMatches,
13 children: FrozenChildren,
14 text_idx: u32,
15 binary_idx: u32,
16}
17
18#[derive(Debug, Default)]
19struct Type {
20 matches: Vec<Match>,
21 children: Vec<u32>,
22}
23
24#[derive(Debug)]
25struct Match {
26 cnt: u32,
27 off: u32,
28}
29
30#[derive(Debug)]
31struct FrozenType {
32 mime_type_off: u32,
33 matches: Range<u32>,
34 children: Range<u32>,
35}
36
37type FrozenMatches = Box<[Match]>;
38
39type FrozenChildren = Box<[u32]>;
40
41impl<C> Database<C>
42where
43 C: AsRef<[u8]>,
44{
45 #[cold]
62 pub fn open(cache: C) -> Result<Self, Error> {
63 let (parents, types) = parse_offsets(cache.as_ref()).ok_or(Error::MalformedOffsets)?;
64
65 let mut parents = parse_parents(cache.as_ref(), parents).ok_or(Error::MalformedParents)?;
66
67 let mut types = parse_types(cache.as_ref(), types).ok_or(Error::MalformedTypes)?;
68
69 let (text_off, binary_off) = parents_to_children(cache.as_ref(), &mut parents, &mut types)?;
70
71 let types = sort(types, parents)?;
72
73 Ok(Self::freeze(cache, types, text_off, binary_off))
74 }
75
76 fn freeze(cache: C, types: Vec<(u32, Type)>, text_off: u32, binary_off: u32) -> Self {
77 let mut matches =
78 Vec::with_capacity(types.iter().map(|(_, r#type)| r#type.matches.len()).sum());
79 let mut children =
80 Vec::with_capacity(types.iter().map(|(_, r#type)| r#type.children.len()).sum());
81
82 let types = types
83 .into_iter()
84 .map(|(mime_type_off, r#type)| {
85 let matches_start = u32::try_from(matches.len()).unwrap();
86 let matches_len = u32::try_from(r#type.matches.len()).unwrap();
87 let matches_end = matches_start.checked_add(matches_len).unwrap();
88 matches.extend(r#type.matches);
89
90 let children_start = u32::try_from(children.len()).unwrap();
91 let children_len = u32::try_from(r#type.children.len()).unwrap();
92 let children_end = children_start.checked_add(children_len).unwrap();
93 children.extend(r#type.children);
94
95 FrozenType {
96 mime_type_off,
97 matches: matches_start..matches_end,
98 children: children_start..children_end,
99 }
100 })
101 .collect::<Box<[_]>>();
102
103 let mime_type_off_to_type_idx = types
104 .iter()
105 .enumerate()
106 .map(|(idx, r#type)| (r#type.mime_type_off, u32::try_from(idx).unwrap()))
107 .collect::<HashMap<_, _>>();
108
109 for type_idx in &mut children {
110 *type_idx = mime_type_off_to_type_idx[type_idx];
111 }
112
113 Self {
114 cache,
115 types,
116 matches: matches.into(),
117 children: children.into(),
118 text_idx: mime_type_off_to_type_idx[&text_off],
119 binary_idx: mime_type_off_to_type_idx[&binary_off],
120 }
121 }
122
123 pub fn r#match<'a>(&'a self, bytes: &[u8]) -> Result<&'a str, Error> {
124 let is_text = memchr(0, bytes).is_none();
125
126 let text_type = &self.types[self.text_idx as usize];
127 let binary_type = &self.types[self.binary_idx as usize];
128
129 let mime_type_off = if is_text
130 && let Some(mime_type_off) = self.match_children(None, &text_type.children, bytes)?
131 {
132 mime_type_off
133 } else if let Some(mime_type_off) =
134 self.match_children(None, &binary_type.children, bytes)?
135 {
136 mime_type_off
137 } else if is_text {
138 text_type.mime_type_off
139 } else {
140 binary_type.mime_type_off
141 };
142
143 resolve_mime_type(self.cache.as_ref(), mime_type_off).ok_or(Error::InvalidMimeType)
144 }
145
146 fn match_children(
147 &self,
148 default: Option<u32>,
149 children: &Range<u32>,
150 bytes: &[u8],
151 ) -> Result<Option<u32>, Error> {
152 let children = &self.children[children.start as usize..children.end as usize];
153
154 for type_idx in children {
155 let r#type = &self.types[*type_idx as usize];
156
157 let matches = &self.matches[r#type.matches.start as usize..r#type.matches.end as usize];
158
159 for r#match in matches {
160 if try_match(self.cache.as_ref(), 0, r#match.cnt, r#match.off, bytes)
161 .ok_or(Error::MalformedMatch)?
162 {
163 return self.match_children(
164 Some(r#type.mime_type_off),
165 &r#type.children,
166 bytes,
167 );
168 }
169 }
170 }
171
172 Ok(default)
173 }
174}
175
176fn try_match(cache: &[u8], depth: u8, cnt: u32, off: u32, bytes: &[u8]) -> Option<bool> {
177 if depth == 128 {
178 return None;
179 }
180
181 let cnt = cnt as usize;
182 let off = off as usize;
183
184 let matchlets = cache.get(off..)?.get(..32 * cnt)?;
185
186 for idx in 0..cnt {
187 let matchlet = &matchlets[32 * idx..][..32];
188
189 if try_matchlet(cache, matchlet, bytes)? {
190 let cnt = parse_u32(matchlet, 24);
191 let off = parse_u32(matchlet, 28);
192
193 if cnt == 0 || try_match(cache, depth + 1, cnt, off, bytes)? {
194 return Some(true);
195 }
196 }
197 }
198
199 Some(false)
200}
201
202#[inline]
203fn try_matchlet(cache: &[u8], matchlet: &[u8], bytes: &[u8]) -> Option<bool> {
204 let range_start = parse_u32(matchlet, 0) as usize;
205 let range_length = parse_u32(matchlet, 4) as usize;
206 let data_length = parse_u32(matchlet, 12) as usize;
207 let data_offset = parse_u32(matchlet, 16) as usize;
208 let mask_offset = parse_u32(matchlet, 20) as usize;
209
210 let data = cache.get(data_offset..)?.get(..data_length)?;
211
212 let mask = if mask_offset != 0 {
213 Some(cache.get(mask_offset..)?.get(..data_length)?)
214 } else {
215 None
216 };
217
218 for pos in range_start..range_start + range_length {
219 if let Some(bytes) = bytes.get(pos..pos + data_length) {
220 if let Some(mask) = mask {
221 if data
222 .iter()
223 .zip(mask)
224 .zip(bytes)
225 .all(|((data, mask), byte)| data & mask == byte & mask)
226 {
227 return Some(true);
228 }
229 } else {
230 if compare(data, bytes) {
231 return Some(true);
232 }
233 }
234 } else {
235 break;
236 }
237 }
238
239 Some(false)
240}
241
242#[inline]
243fn compare(lhs: &[u8], rhs: &[u8]) -> bool {
244 let len = lhs.len();
245 debug_assert_eq!(len, rhs.len());
246
247 if len >= 8 {
248 let lhs_lo = u64::from_le_bytes(lhs[..8].try_into().unwrap());
249 let rhs_lo = u64::from_le_bytes(rhs[..8].try_into().unwrap());
250 if lhs_lo != rhs_lo {
251 return false;
252 } else if len == 8 {
253 return true;
254 }
255
256 let lhs_hi = u64::from_le_bytes(lhs[len - 8..].try_into().unwrap());
257 let rhs_hi = u64::from_le_bytes(rhs[len - 8..].try_into().unwrap());
258 if lhs_hi != rhs_hi {
259 return false;
260 } else if len <= 16 {
261 return true;
262 }
263 } else if len >= 4 {
264 let lhs_lo = u32::from_le_bytes(lhs[..4].try_into().unwrap());
265 let rhs_lo = u32::from_le_bytes(rhs[..4].try_into().unwrap());
266 if lhs_lo != rhs_lo {
267 return false;
268 }
269
270 let lhs_hi = u32::from_le_bytes(lhs[len - 4..].try_into().unwrap());
271 let rhs_hi = u32::from_le_bytes(rhs[len - 4..].try_into().unwrap());
272 return lhs_hi == rhs_hi;
273 } else if len > 0 {
274 let lhs_lo = lhs[0];
275 let rhs_lo = rhs[0];
276 if lhs_lo != rhs_lo {
277 return false;
278 }
279
280 let lhs_mid = lhs[len / 2];
281 let rhs_mid = rhs[len / 2];
282 if lhs_mid != rhs_mid {
283 return false;
284 }
285
286 let lhs_hi = lhs[len - 1];
287 let rhs_hi = rhs[len - 1];
288 return lhs_hi == rhs_hi;
289 } else {
290 return true;
291 }
292
293 lhs == rhs
294}
295
296fn sort(
297 mut types: HashMap<u32, Type>,
298 mut parents: HashMap<u32, Vec<u32>>,
299) -> Result<Vec<(u32, Type)>, Error> {
300 let mut sorted = Vec::with_capacity(types.len());
301
302 let mut orphans = types
303 .keys()
304 .copied()
305 .filter(|mime_type_off| !parents.contains_key(mime_type_off))
306 .collect::<VecDeque<_>>();
307
308 while let Some(orphan_mime_type_off) = orphans.pop_back() {
309 let orphan_type = types.remove(&orphan_mime_type_off).unwrap();
310
311 for child_mime_type_off in &orphan_type.children {
312 match parents.entry(*child_mime_type_off) {
313 Entry::Occupied(mut entry) => {
314 let parents = entry.get_mut();
315
316 parents.retain(|parent_mime_type_off| {
317 *parent_mime_type_off != orphan_mime_type_off
318 });
319
320 if parents.is_empty() {
321 entry.remove();
322
323 orphans.push_front(*child_mime_type_off);
324 }
325 }
326 Entry::Vacant(_) => unreachable!(),
327 }
328 }
329
330 sorted.push((orphan_mime_type_off, orphan_type));
331 }
332
333 if parents.is_empty() {
334 Ok(sorted)
335 } else {
336 Err(Error::MalformedParents)
337 }
338}
339
340fn parents_to_children(
341 cache: &[u8],
342 parents: &mut HashMap<u32, Vec<u32>>,
343 types: &mut HashMap<u32, Type>,
344) -> Result<(u32, u32), Error> {
345 types.retain(|mime_type_off, _type| resolve_mime_type(cache, *mime_type_off).is_some());
346
347 parents.retain(|mime_type_off, parents| {
348 if types.contains_key(mime_type_off) {
349 parents.retain(|parent_mime_type_off| types.contains_key(parent_mime_type_off));
350
351 !parents.is_empty()
352 } else {
353 false
354 }
355 });
356
357 let text_off =
358 find_or_insert_mime_type(cache, types, "text/plain\0").ok_or(Error::MissingTextPlain)?;
359
360 let binary_off = find_or_insert_mime_type(cache, types, "application/octet-stream\0")
361 .ok_or(Error::MissingApplicationOctetStream)?;
362
363 for mime_type_off in types.keys() {
364 if *mime_type_off != text_off
365 && *mime_type_off != binary_off
366 && let Entry::Vacant(entry) = parents.entry(*mime_type_off)
367 {
368 let mime_type = resolve_mime_type(cache, *mime_type_off).unwrap();
369
370 let parent_mime_type_off = if mime_type.starts_with("text/") {
371 text_off
372 } else {
373 binary_off
374 };
375
376 entry.insert(vec![parent_mime_type_off]);
377 }
378 }
379
380 for (mime_type_off, parents) in parents {
381 for parent_mime_type_off in parents {
382 types
383 .get_mut(parent_mime_type_off)
384 .unwrap()
385 .children
386 .push(*mime_type_off);
387 }
388 }
389
390 for r#type in types.values_mut() {
391 r#type.children.sort_unstable();
392 r#type.children.dedup();
393 }
394
395 Ok((text_off, binary_off))
396}
397
398fn find_or_insert_mime_type(
399 cache: &[u8],
400 types: &mut HashMap<u32, Type>,
401 mime_type0: &str,
402) -> Option<u32> {
403 let mime_type = &mime_type0[..mime_type0.len() - 1];
404
405 let mime_type_off = types
406 .keys()
407 .copied()
408 .find(|mime_type_off| resolve_mime_type(cache, *mime_type_off) == Some(mime_type));
409
410 mime_type_off.or_else(|| {
411 let mime_type_off = find(cache, mime_type0.as_bytes())?.try_into().ok()?;
412
413 types.insert(mime_type_off, Default::default());
414
415 Some(mime_type_off)
416 })
417}
418
419fn resolve_mime_type(cache: &[u8], mime_type_off: u32) -> Option<&str> {
420 let off = mime_type_off as usize;
421 let bytes = cache.get(off..)?;
422
423 let pos = memchr(0, bytes)?;
424 let mime_type = &bytes[..pos];
425
426 from_utf8(mime_type).ok()
427}
428
429fn parse_offsets(cache: &[u8]) -> Option<(&[u8], &[u8])> {
430 if cache.len() < 2 * 2 + 9 * 4 {
431 return None;
432 }
433
434 let parents_off = parse_u32(cache, 2 * 2 + 4) as usize;
435 let types_off = parse_u32(cache, 2 * 2 + 5 * 4) as usize;
436
437 let parents = cache.get(parents_off..)?;
438 let types = cache.get(types_off..)?;
439
440 Some((parents, types))
441}
442
443fn parse_parents<'a>(cache: &'a [u8], bytes: &'a [u8]) -> Option<HashMap<u32, Vec<u32>>> {
444 if bytes.len() < 4 {
445 return None;
446 }
447
448 let cnt = parse_u32(bytes, 0) as usize;
449
450 let bytes = bytes.get(4..)?.get(..8 * cnt)?;
451
452 let mut parents = HashMap::<_, Vec<_>>::with_capacity(cnt);
453
454 for idx in 0..cnt {
455 let bytes = &bytes[8 * idx..][..8];
456
457 let mime_type_off = parse_u32(bytes, 0);
458 let parents_off = parse_u32(bytes, 4) as usize;
459
460 let bytes = cache.get(parents_off..)?;
461
462 if bytes.len() < 4 {
463 return None;
464 }
465
466 let cnt = parse_u32(bytes, 0) as usize;
467
468 let bytes = bytes.get(4..)?.get(..4 * cnt)?;
469
470 let parents = parents.entry(mime_type_off).or_default();
471 parents.reserve(cnt);
472
473 for idx in 0..cnt {
474 let parent_mime_type_off = parse_u32(bytes, 4 * idx);
475
476 parents.push(parent_mime_type_off);
477 }
478 }
479
480 Some(parents)
481}
482
483fn parse_types(cache: &[u8], bytes: &[u8]) -> Option<HashMap<u32, Type>> {
484 if bytes.len() < 3 * 4 {
485 return None;
486 }
487
488 let cnt = parse_u32(bytes, 0) as usize;
489 let off = parse_u32(bytes, 8) as usize;
490
491 let bytes = cache.get(off..)?.get(..16 * cnt)?;
492
493 let mut types = HashMap::<_, Type>::with_capacity(cnt);
494
495 for idx in 0..cnt {
496 let bytes = &bytes[16 * idx..];
497
498 let mime_type_off = parse_u32(bytes, 4);
499
500 let cnt = parse_u32(bytes, 8);
501 let off = parse_u32(bytes, 12);
502
503 types
504 .entry(mime_type_off)
505 .or_default()
506 .matches
507 .push(Match { cnt, off });
508 }
509
510 Some(types)
511}
512
513fn parse_u32(bytes: &[u8], off: usize) -> u32 {
514 u32::from_be_bytes(bytes[off..][..4].try_into().unwrap())
515}
516
517#[derive(Debug)]
518pub enum Error {
519 MalformedOffsets,
520 MalformedParents,
521 MalformedTypes,
522 MalformedMatch,
523 MissingTextPlain,
524 MissingApplicationOctetStream,
525 InvalidMimeType,
526}
527
528impl fmt::Display for Error {
529 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
530 let msg = match self {
531 Self::MalformedOffsets => "Malformed offsets",
532 Self::MalformedParents => "Malformed parents",
533 Self::MalformedTypes => "Malformed types",
534 Self::MalformedMatch => "Malformed match",
535 Self::MissingTextPlain => "Missing text/plain MIME type",
536 Self::MissingApplicationOctetStream => "Missing application/octet-stream MIME type",
537 Self::InvalidMimeType => "Invalid MIME type",
538 };
539
540 fmt.write_str(msg)
541 }
542}
543
544impl StdError for Error {}