1use std::collections::HashMap;
2use std::error::Error as StdError;
3use std::fmt;
4use std::ops::Range;
5use std::str::from_utf8;
6
7use memchr::{memchr, memmem::find};
8
9pub struct Database<C> {
10 cache: C,
11 types: Box<[FrozenType]>,
12 matches: FrozenMatches,
13 children: FrozenChildren,
14 text_idx: u32,
15 binary_idx: u32,
16}
17
18#[derive(Debug, Default)]
19struct Type {
20 matches: Vec<Match>,
21 children: Vec<u32>,
22}
23
24#[derive(Debug)]
25struct Match {
26 cnt: u32,
27 off: u32,
28}
29
30#[derive(Debug)]
31struct FrozenType {
32 mime_type_off: u32,
33 matches: Range<u32>,
34 children: Range<u32>,
35}
36
37type FrozenMatches = Box<[Match]>;
38
39type FrozenChildren = Box<[u32]>;
40
41impl<C> Database<C>
42where
43 C: AsRef<[u8]>,
44{
45 pub fn open(cache: C) -> Result<Self, Error> {
62 let (parents, types) = parse_offsets(cache.as_ref()).ok_or(Error::MalformedOffsets)?;
63
64 let mut parents = parse_parents(cache.as_ref(), parents).ok_or(Error::MalformedParents)?;
65
66 let mut types = parse_types(cache.as_ref(), types).ok_or(Error::MalformedTypes)?;
67
68 let (text_off, binary_off) = add_parents(cache.as_ref(), &mut parents, &mut types)?;
69
70 add_children(&parents, &mut types);
71
72 Ok(Self::freeze(cache, types, text_off, binary_off))
73 }
74
75 fn freeze(cache: C, types: HashMap<u32, Type>, text_off: u32, binary_off: u32) -> Self {
76 let mut matches = Vec::new();
77 let mut children = Vec::new();
78
79 let types = types
80 .into_iter()
81 .map(|(mime_type_off, r#type)| {
82 let matches_start = u32::try_from(matches.len()).unwrap();
83 let matches_end = matches_start + u32::try_from(r#type.matches.len()).unwrap();
84 matches.extend(r#type.matches);
85
86 let children_start = u32::try_from(children.len()).unwrap();
87 let children_end = children_start + u32::try_from(r#type.children.len()).unwrap();
88 children.extend(r#type.children);
89
90 FrozenType {
91 mime_type_off,
92 matches: matches_start..matches_end,
93 children: children_start..children_end,
94 }
95 })
96 .collect::<Box<[_]>>();
97
98 let mime_type_off_to_type_idx = types
99 .iter()
100 .enumerate()
101 .map(|(idx, r#type)| (r#type.mime_type_off, u32::try_from(idx).unwrap()))
102 .collect::<HashMap<_, _>>();
103
104 for type_idx in &mut children {
105 *type_idx = mime_type_off_to_type_idx[type_idx];
106 }
107
108 Self {
109 cache,
110 types,
111 matches: matches.into(),
112 children: children.into(),
113 text_idx: mime_type_off_to_type_idx[&text_off],
114 binary_idx: mime_type_off_to_type_idx[&binary_off],
115 }
116 }
117
118 pub fn r#match<'a>(&'a self, bytes: &[u8]) -> Result<&'a str, Error> {
119 let is_text = memchr(0, bytes).is_none();
120
121 let text_type = &self.types[self.text_idx as usize];
122 let binary_type = &self.types[self.binary_idx as usize];
123
124 let mime_type_off = if is_text
125 && let Some(mime_type_off) = self.match_children(&text_type.children, bytes)?
126 {
127 mime_type_off
128 } else if let Some(mime_type_off) = self.match_children(&binary_type.children, bytes)? {
129 mime_type_off
130 } else if is_text {
131 text_type.mime_type_off
132 } else {
133 binary_type.mime_type_off
134 };
135
136 resolve_mime_type(self.cache.as_ref(), mime_type_off).ok_or(Error::InvalidMimeType)
137 }
138
139 fn match_children(&self, children: &Range<u32>, bytes: &[u8]) -> Result<Option<u32>, Error> {
140 let children = &self.children[children.start as usize..children.end as usize];
141
142 for type_idx in children {
143 let r#type = &self.types[*type_idx as usize];
144
145 let matches = &self.matches[r#type.matches.start as usize..r#type.matches.end as usize];
146 debug_assert!(!matches.is_empty());
147
148 for r#match in matches {
149 if self
150 .try_match(r#match.cnt, r#match.off, bytes)
151 .ok_or(Error::MalformedMatch)?
152 {
153 let mime_type_off = self
154 .match_children(&r#type.children, bytes)?
155 .unwrap_or(r#type.mime_type_off);
156
157 return Ok(Some(mime_type_off));
158 }
159 }
160 }
161
162 Ok(None)
163 }
164
165 fn try_match(&self, cnt: u32, off: u32, bytes: &[u8]) -> Option<bool> {
166 let cnt = cnt as usize;
167 let off = off as usize;
168
169 let matchlets = self.cache.as_ref().get(off..)?.get(..32 * cnt)?;
170
171 for idx in 0..cnt {
172 let matchlet = &matchlets[32 * idx..][..32];
173
174 if self.try_matchlet(matchlet, bytes)? {
175 let cnt = parse_u32(matchlet, 24);
176 let off = parse_u32(matchlet, 28);
177
178 if cnt == 0 || self.try_match(cnt, off, bytes)? {
179 return Some(true);
180 }
181 }
182 }
183
184 Some(false)
185 }
186
187 #[inline]
188 fn try_matchlet(&self, matchlet: &[u8], bytes: &[u8]) -> Option<bool> {
189 let range_start = parse_u32(matchlet, 0) as usize;
190 let range_length = parse_u32(matchlet, 4) as usize;
191 let data_length = parse_u32(matchlet, 12) as usize;
192 let data_offset = parse_u32(matchlet, 16) as usize;
193 let mask_offset = parse_u32(matchlet, 20) as usize;
194
195 let data = self.cache.as_ref().get(data_offset..)?.get(..data_length)?;
196
197 let mask = if mask_offset != 0 {
198 Some(self.cache.as_ref().get(mask_offset..)?.get(..data_length)?)
199 } else {
200 None
201 };
202
203 for pos in range_start..range_start + range_length {
204 if let Some(bytes) = bytes.get(pos..pos + data_length) {
205 if let Some(mask) = mask {
206 if data
207 .iter()
208 .zip(mask)
209 .zip(bytes)
210 .all(|((data, mask), byte)| data & mask == byte & mask)
211 {
212 return Some(true);
213 }
214 } else {
215 if data == bytes {
216 return Some(true);
217 }
218 }
219 } else {
220 break;
221 }
222 }
223
224 Some(false)
225 }
226}
227
228fn add_parents(
229 cache: &[u8],
230 parents: &mut HashMap<u32, Vec<u32>>,
231 types: &mut HashMap<u32, Type>,
232) -> Result<(u32, u32), Error> {
233 let text_off =
234 find_or_insert_mime_type(cache, types, "text/plain\0").ok_or(Error::MissingTextPlain)?;
235
236 let binary_off = find_or_insert_mime_type(cache, types, "application/octet-stream\0")
237 .ok_or(Error::MissingTextPlain)?;
238
239 types.retain(|mime_type_off, _type| {
240 if *mime_type_off == text_off || *mime_type_off == binary_off {
241 true
242 } else if let Some(mime_type) = resolve_mime_type(cache, *mime_type_off) {
243 let parent_mime_type_off = if mime_type.starts_with("text/") {
244 text_off
245 } else {
246 binary_off
247 };
248
249 parents
250 .entry(*mime_type_off)
251 .or_default()
252 .push(parent_mime_type_off);
253
254 true
255 } else {
256 false
257 }
258 });
259
260 Ok((text_off, binary_off))
261}
262
263fn find_or_insert_mime_type(
264 cache: &[u8],
265 types: &mut HashMap<u32, Type>,
266 mime_type0: &str,
267) -> Option<u32> {
268 let mime_type = &mime_type0[..mime_type0.len() - 1];
269
270 let mime_type_off = types
271 .keys()
272 .copied()
273 .find(|mime_type_off| resolve_mime_type(cache, *mime_type_off) == Some(mime_type));
274
275 mime_type_off.or_else(|| {
276 let mime_type_off = find(cache, mime_type0.as_bytes())?.try_into().ok()?;
277
278 types.insert(mime_type_off, Default::default());
279
280 Some(mime_type_off)
281 })
282}
283
284fn add_children(parents: &HashMap<u32, Vec<u32>>, types: &mut HashMap<u32, Type>) {
285 for (mime_type_off, parents) in parents {
286 if types.contains_key(mime_type_off) {
287 for parent_mime_type_off in parents {
288 if let Some(parent_type) = types.get_mut(parent_mime_type_off) {
289 parent_type.children.push(*mime_type_off);
290 }
291 }
292 }
293 }
294
295 for r#type in types.values_mut() {
296 r#type.children.sort_unstable();
297 r#type.children.dedup();
298 }
299}
300
301fn resolve_mime_type(cache: &[u8], mime_type_off: u32) -> Option<&str> {
302 let off = mime_type_off as usize;
303 let bytes = cache.get(off..)?;
304
305 let pos = memchr(0, bytes)?;
306 let mime_type = &bytes[..pos];
307
308 from_utf8(mime_type).ok()
309}
310
311fn parse_offsets(cache: &[u8]) -> Option<(&[u8], &[u8])> {
312 if cache.len() < 2 * 2 + 9 * 4 {
313 return None;
314 }
315
316 let parents_off = parse_u32(cache, 2 * 2 + 4) as usize;
317 let types_off = parse_u32(cache, 2 * 2 + 5 * 4) as usize;
318
319 let parents = cache.get(parents_off..)?;
320 let types = cache.get(types_off..)?;
321
322 Some((parents, types))
323}
324
325fn parse_parents<'a>(cache: &'a [u8], bytes: &'a [u8]) -> Option<HashMap<u32, Vec<u32>>> {
326 if bytes.len() < 4 {
327 return None;
328 }
329
330 let cnt = parse_u32(bytes, 0) as usize;
331
332 let bytes = bytes.get(4..)?.get(..8 * cnt)?;
333
334 let mut parents = HashMap::<_, Vec<_>>::with_capacity(cnt);
335
336 for idx in 0..cnt {
337 let bytes = &bytes[8 * idx..][..8];
338
339 let mime_type_off = parse_u32(bytes, 0);
340 let parents_off = parse_u32(bytes, 4) as usize;
341
342 let bytes = cache.get(parents_off..)?;
343
344 if bytes.len() < 4 {
345 return None;
346 }
347
348 let cnt = parse_u32(bytes, 0) as usize;
349
350 let bytes = bytes.get(4..)?.get(..4 * cnt)?;
351
352 let parents = parents.entry(mime_type_off).or_default();
353 parents.reserve(cnt);
354
355 for idx in 0..cnt {
356 let parent_mime_type_off = parse_u32(bytes, 4 * idx);
357
358 parents.push(parent_mime_type_off);
359 }
360 }
361
362 Some(parents)
363}
364
365fn parse_types(cache: &[u8], bytes: &[u8]) -> Option<HashMap<u32, Type>> {
366 if bytes.len() < 3 * 4 {
367 return None;
368 }
369
370 let cnt = parse_u32(bytes, 0) as usize;
371 let off = parse_u32(bytes, 8) as usize;
372
373 let bytes = cache.get(off..)?.get(..16 * cnt)?;
374
375 let mut types = HashMap::<_, Type>::with_capacity(cnt);
376
377 for idx in 0..cnt {
378 let bytes = &bytes[16 * idx..];
379
380 let mime_type_off = parse_u32(bytes, 4);
381
382 let cnt = parse_u32(bytes, 8);
383 let off = parse_u32(bytes, 12);
384
385 types
386 .entry(mime_type_off)
387 .or_default()
388 .matches
389 .push(Match { cnt, off });
390 }
391
392 Some(types)
393}
394
395fn parse_u32(bytes: &[u8], off: usize) -> u32 {
396 u32::from_be_bytes(bytes[off..][..4].try_into().unwrap())
397}
398
399#[derive(Debug)]
400pub enum Error {
401 MalformedOffsets,
402 MalformedParents,
403 MalformedTypes,
404 MalformedMatch,
405 MissingTextPlain,
406 MissingApplicationOctetStream,
407 InvalidMimeType,
408}
409
410impl fmt::Display for Error {
411 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
412 let msg = match self {
413 Self::MalformedOffsets => "Malformed offsets",
414 Self::MalformedParents => "Malformed parents",
415 Self::MalformedTypes => "Malformed types",
416 Self::MalformedMatch => "Malformed match",
417 Self::MissingTextPlain => "Missing text/plain MIME type",
418 Self::MissingApplicationOctetStream => "Missing application/octet-stream MIME type",
419 Self::InvalidMimeType => "Invalid MIME type",
420 };
421
422 fmt.write_str(msg)
423 }
424}
425
426impl StdError for Error {}