nom_exif/
parser.rs

1use std::{
2    cmp::{max, min},
3    fmt::{Debug, Display},
4    fs::File,
5    io::{self, Read, Seek},
6    marker::PhantomData,
7    net::TcpStream,
8    ops::Range,
9    path::Path,
10};
11
12use crate::{
13    buffer::Buffers,
14    error::{ParsedError, ParsingError, ParsingErrorState},
15    exif::{parse_exif_iter, TiffHeader},
16    file::Mime,
17    partial_vec::PartialVec,
18    skip::Skip,
19    video::parse_track_info,
20    ExifIter, Seekable, TrackInfo, Unseekable,
21};
22
/// `MediaSource` represents a media data source that can be parsed by
/// [`MediaParser`].
///
/// - Use `MediaSource::file_path(path)` or `MediaSource::file(file)` to create
///   a MediaSource from a file
///
/// - Use `MediaSource::tcp_stream(stream)` to create a MediaSource from a `TcpStream`
/// - In other cases:
///
///   - Use `MediaSource::seekable(reader)` to create a MediaSource from a `Read + Seek`
///
///   - Use `MediaSource::unseekable(reader)` to create a MediaSource from a
///     reader that only implements `Read`
///
/// `seekable` is preferred to `unseekable`, since the former is more efficient
/// when the parser needs to skip a large number of bytes.
///
/// Passing in a `BufRead` should be avoided because [`MediaParser`] comes with
/// its own buffer management and the buffers can be shared between multiple
/// parsing tasks, thus avoiding frequent memory allocations.
pub struct MediaSource<R, S = Seekable> {
    // The underlying reader; the header bytes held in `buf` have already been
    // consumed from it.
    pub(crate) reader: R,
    // Header bytes read while constructing the source; `MediaParser::parse`
    // moves them into its own buffer before reading further.
    pub(crate) buf: Vec<u8>,
    // Media type detected from the header bytes in `buf`.
    pub(crate) mime: Mime,
    // Zero-sized marker carrying the skip strategy `S` (`Seekable` by default).
    phantom: PhantomData<S>,
}
49
50impl<R, S: Skip<R>> Debug for MediaSource<R, S> {
51    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
52        f.debug_struct("MediaSource")
53            // .field("reader", &self.reader)
54            .field("mime", &self.mime)
55            .field("seekable", &S::debug())
56            .finish_non_exhaustive()
57    }
58}
59
/// Number of leading bytes (128) read from a source to detect its media type.
/// Should be enough for parsing the header.
const HEADER_PARSE_BUF_SIZE: usize = 1 << 7;
62
63impl<R: Read, S: Skip<R>> MediaSource<R, S> {
64    #[tracing::instrument(skip(reader))]
65    fn build(mut reader: R) -> crate::Result<Self> {
66        // TODO: reuse MediaParser to parse header
67        let mut buf = Vec::with_capacity(HEADER_PARSE_BUF_SIZE);
68        reader
69            .by_ref()
70            .take(HEADER_PARSE_BUF_SIZE as u64)
71            .read_to_end(&mut buf)?;
72        let mime: Mime = buf.as_slice().try_into()?;
73        tracing::debug!(?mime);
74        Ok(Self {
75            reader,
76            buf,
77            mime,
78            phantom: PhantomData,
79        })
80    }
81
82    pub fn has_track(&self) -> bool {
83        match self.mime {
84            Mime::Image(_) => false,
85            Mime::Video(_) => true,
86        }
87    }
88
89    pub fn has_exif(&self) -> bool {
90        match self.mime {
91            Mime::Image(_) => true,
92            Mime::Video(_) => false,
93        }
94    }
95}
96
97impl<R: Read + Seek> MediaSource<R, Seekable> {
98    pub fn seekable(reader: R) -> crate::Result<Self> {
99        Self::build(reader)
100    }
101}
102
103impl<R: Read> MediaSource<R, Unseekable> {
104    pub fn unseekable(reader: R) -> crate::Result<Self> {
105        Self::build(reader)
106    }
107}
108
109impl MediaSource<File, Seekable> {
110    pub fn file_path<P: AsRef<Path>>(path: P) -> crate::Result<Self> {
111        Self::seekable(File::open(path)?)
112    }
113
114    pub fn file(file: File) -> crate::Result<Self> {
115        Self::seekable(file)
116    }
117}
118
119impl MediaSource<TcpStream, Unseekable> {
120    pub fn tcp_stream(stream: TcpStream) -> crate::Result<Self> {
121        Self::unseekable(stream)
122    }
123}
124
// Buffer sizing knobs. The initial fill and the minimum growth step are both
// kept aligned with 4K.
pub(crate) const INIT_BUF_SIZE: usize = 4 * 1024;
pub(crate) const MIN_GROW_SIZE: usize = 4 * 1024;
// A per-read growth cap is intentionally not enabled (max size of APP1 is 0xFFFF):
// pub(crate) const MAX_GROW_SIZE: usize = 63 * 1024;
// Reasonable upper limit (1 GiB) for a single buffer allocation.
pub(crate) const MAX_ALLOC_SIZE: usize = 1 << 30;
132
/// Minimal view over a parse buffer with a movable read position.
pub(crate) trait Buf {
    /// Returns the bytes currently exposed to the parser.
    fn buffer(&self) -> &[u8];

    /// Returns the current read position.
    #[allow(unused)]
    fn position(&self) -> usize;

    /// Moves the read position to `pos`.
    fn set_position(&mut self, pos: usize);

    /// Discards the buffered bytes.
    fn clear(&mut self);
}
141
142#[derive(Debug, Clone)]
143pub(crate) enum ParsingState {
144    TiffHeader(TiffHeader),
145    HeifExifSize(usize),
146    Cr3ExifSize(usize),
147}
148
149impl Display for ParsingState {
150    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
151        match self {
152            ParsingState::TiffHeader(h) => Display::fmt(&format!("ParsingState: {h:?})"), f),
153            ParsingState::HeifExifSize(n) => Display::fmt(&format!("ParsingState: {n}"), f),
154            ParsingState::Cr3ExifSize(n) => Display::fmt(&format!("ParsingState: {n}"), f),
155        }
156    }
157}
158
159pub(crate) trait BufParser: Buf + Debug {
160    fn fill_buf<R: Read>(&mut self, reader: &mut R, size: usize) -> io::Result<usize>;
161    fn load_and_parse<R: Read, S: Skip<R>, P, O>(
162        &mut self,
163        reader: &mut R,
164        mut parse: P,
165    ) -> Result<O, ParsedError>
166    where
167        P: FnMut(&[u8], Option<ParsingState>) -> Result<O, ParsingErrorState>,
168    {
169        self.load_and_parse_with_offset::<R, S, _, _>(
170            reader,
171            |data, _, state| parse(data, state),
172            0,
173        )
174    }
175
176    #[tracing::instrument(skip_all)]
177    fn load_and_parse_with_offset<R: Read, S: Skip<R>, P, O>(
178        &mut self,
179        reader: &mut R,
180        mut parse: P,
181        offset: usize,
182    ) -> Result<O, ParsedError>
183    where
184        P: FnMut(&[u8], usize, Option<ParsingState>) -> Result<O, ParsingErrorState>,
185    {
186        if offset >= self.buffer().len() {
187            self.fill_buf(reader, MIN_GROW_SIZE)?;
188        }
189
190        let mut parsing_state: Option<ParsingState> = None;
191        loop {
192            let res = parse(self.buffer(), offset, parsing_state.take());
193            match res {
194                Ok(o) => return Ok(o),
195                Err(es) => {
196                    tracing::debug!(?es);
197                    parsing_state = es.state;
198
199                    match es.err {
200                        ParsingError::ClearAndSkip(n) => {
201                            self.clear_and_skip::<R, S>(reader, n)?;
202                        }
203                        ParsingError::Need(i) => {
204                            tracing::debug!(need = i, "need more bytes");
205                            let to_read = max(i, MIN_GROW_SIZE);
206                            // let to_read = min(to_read, MAX_GROW_SIZE);
207
208                            let n = self.fill_buf(reader, to_read)?;
209                            if n == 0 {
210                                return Err(ParsedError::NoEnoughBytes);
211                            }
212                            tracing::debug!(n, "actual read");
213                        }
214                        ParsingError::Failed(s) => return Err(ParsedError::Failed(s)),
215                    }
216                }
217            }
218        }
219    }
220
221    #[tracing::instrument(skip(reader))]
222    fn clear_and_skip<R: Read, S: Skip<R>>(
223        &mut self,
224        reader: &mut R,
225        n: usize,
226    ) -> Result<(), ParsedError> {
227        tracing::debug!("ClearAndSkip");
228        if n <= self.buffer().len() {
229            tracing::debug!(n, "skip by set_position");
230            self.set_position(n);
231            return Ok(());
232        }
233
234        let skip_n = n - self.buffer().len();
235        tracing::debug!(skip_n, "clear and skip bytes");
236        self.clear();
237
238        let done = S::skip_by_seek(
239            reader,
240            skip_n
241                .try_into()
242                .map_err(|_| ParsedError::Failed("skip too many bytes".into()))?,
243        )?;
244        if !done {
245            tracing::debug!(skip_n, "skip by using our buffer");
246            let mut skipped = 0;
247            while skipped < skip_n {
248                let mut to_skip = skip_n - skipped;
249                to_skip = min(to_skip, MAX_ALLOC_SIZE);
250                let n = self.fill_buf(reader, to_skip)?;
251                skipped += n;
252                if skipped <= skip_n {
253                    self.clear();
254                } else {
255                    let remain = skipped - skip_n;
256                    self.set_position(self.buffer().len() - remain);
257                    break;
258                }
259            }
260        } else {
261            tracing::debug!(skip_n, "skip with seek");
262        }
263
264        if self.buffer().is_empty() {
265            self.fill_buf(reader, MIN_GROW_SIZE)?;
266        }
267        Ok(())
268    }
269}
270
271impl BufParser for MediaParser {
272    #[tracing::instrument(skip(self, reader), fields(buf_len=self.buf().len()))]
273    fn fill_buf<R: Read>(&mut self, reader: &mut R, size: usize) -> io::Result<usize> {
274        if size.saturating_add(self.buf().len()) > MAX_ALLOC_SIZE {
275            tracing::error!(?size, "the requested buffer size is too big");
276            return Err(io::ErrorKind::Unsupported.into());
277        }
278        self.buf_mut().reserve_exact(size);
279
280        let n = reader.take(size as u64).read_to_end(self.buf_mut())?;
281        if n == 0 {
282            tracing::error!(buf_len = self.buf().len(), "fill_buf: EOF");
283            return Err(std::io::ErrorKind::UnexpectedEof.into());
284        }
285
286        tracing::debug!(
287            ?size,
288            ?n,
289            buf_len = self.buf().len(),
290            "fill_buf: read bytes"
291        );
292
293        Ok(n)
294    }
295}
296
297impl Buf for MediaParser {
298    fn buffer(&self) -> &[u8] {
299        &self.buf()[self.position..]
300    }
301
302    fn clear(&mut self) {
303        self.buf_mut().clear();
304    }
305
306    fn set_position(&mut self, pos: usize) {
307        self.position = pos;
308    }
309
310    fn position(&self) -> usize {
311        self.position
312    }
313}
314
315pub trait ParseOutput<R, S>: Sized {
316    fn parse(parser: &mut MediaParser, ms: MediaSource<R, S>) -> crate::Result<Self>;
317}
318
319impl<R: Read, S: Skip<R>> ParseOutput<R, S> for ExifIter {
320    fn parse(parser: &mut MediaParser, mut ms: MediaSource<R, S>) -> crate::Result<Self> {
321        if !ms.has_exif() {
322            return Err(crate::Error::ParseFailed("no Exif data here".into()));
323        }
324        parse_exif_iter::<R, S>(parser, ms.mime.unwrap_image(), &mut ms.reader)
325    }
326}
327
328impl<R: Read, S: Skip<R>> ParseOutput<R, S> for TrackInfo {
329    fn parse(parser: &mut MediaParser, mut ms: MediaSource<R, S>) -> crate::Result<Self> {
330        if !ms.has_track() {
331            return Err(crate::Error::ParseFailed("no track info here".into()));
332        }
333        let out = parser.load_and_parse::<R, S, _, _>(ms.reader.by_ref(), |data, _| {
334            parse_track_info(data, ms.mime.unwrap_video())
335                .map_err(|e| ParsingErrorState::new(e, None))
336        })?;
337        Ok(out)
338    }
339}
340
/// A `MediaParser`/`AsyncMediaParser` can parse media info from a
/// [`MediaSource`].
///
/// `MediaParser`/`AsyncMediaParser` manages inner parse buffers that can be
/// shared between multiple parsing tasks, thus avoiding frequent memory
/// allocations.
///
/// Therefore:
///
/// - Try to reuse a `MediaParser`/`AsyncMediaParser` instead of creating a new
///   one every time you need it.
///
/// - `MediaSource` should be created directly from `Read`, not from `BufRead`.
///
/// ## Example
///
/// ```rust
/// use nom_exif::*;
/// use chrono::DateTime;
///
/// let mut parser = MediaParser::new();
///
/// // ------------------- Parse Exif Info
/// let ms = MediaSource::file_path("./testdata/exif.heic").unwrap();
/// assert!(ms.has_exif());
/// let mut iter: ExifIter = parser.parse(ms).unwrap();
///
/// let entry = iter.next().unwrap();
/// assert_eq!(entry.tag().unwrap(), ExifTag::Make);
/// assert_eq!(entry.get_value().unwrap().as_str().unwrap(), "Apple");
///
/// // Convert `ExifIter` into an `Exif`. Clone it before converting, so that
/// // we can start the iteration from the beginning.
/// let exif: Exif = iter.clone().into();
/// assert_eq!(exif.get(ExifTag::Make).unwrap().as_str().unwrap(), "Apple");
///
/// // ------------------- Parse Track Info
/// let ms = MediaSource::file_path("./testdata/meta.mov").unwrap();
/// assert!(ms.has_track());
/// let info: TrackInfo = parser.parse(ms).unwrap();
///
/// assert_eq!(info.get(TrackInfoTag::Make), Some(&"Apple".into()));
/// assert_eq!(info.get(TrackInfoTag::Model), Some(&"iPhone X".into()));
/// assert_eq!(info.get(TrackInfoTag::GpsIso6709), Some(&"+27.1281+100.2508+000.000/".into()));
/// assert_eq!(info.get_gps_info().unwrap().latitude_ref, 'N');
/// assert_eq!(
///     info.get_gps_info().unwrap().latitude,
///     [(27, 1), (7, 1), (68, 100)].into(),
/// );
/// ```
pub struct MediaParser {
    // Buffer pool: parse buffers are acquired from it and released back to it.
    bb: Buffers,
    // Buffer used by the parse in progress; `None` when no parse is running
    // or after the buffer has been handed out through `share_buf`.
    buf: Option<Vec<u8>>,
    // Offset into `buf` where the unread data starts.
    position: usize,
}
396
397impl Debug for MediaParser {
398    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
399        f.debug_struct("MediaParser")
400            .field("buffers", &self.bb)
401            .field("buf len", &self.buf.as_ref().map(|x| x.len()))
402            .field("position", &self.position)
403            .finish_non_exhaustive()
404    }
405}
406
407impl Default for MediaParser {
408    fn default() -> Self {
409        Self {
410            bb: Buffers::new(),
411            buf: None,
412            position: 0,
413        }
414    }
415}
416
417pub(crate) trait ShareBuf {
418    fn share_buf(&mut self, range: Range<usize>) -> PartialVec;
419}
420
421impl ShareBuf for MediaParser {
422    fn share_buf(&mut self, mut range: Range<usize>) -> PartialVec {
423        let buf = self.buf.take().unwrap();
424        let vec = self.bb.release_to_share(buf);
425        range.start += self.position;
426        range.end += self.position;
427        PartialVec::new(vec, range)
428    }
429}
430
431impl MediaParser {
432    pub fn new() -> Self {
433        Self::default()
434    }
435
436    /// `MediaParser`/`AsyncMediaParser` comes with its own buffer management,
437    /// so that buffers can be reused during multiple parsing processes to
438    /// avoid frequent memory allocations. Therefore, try to reuse a
439    /// `MediaParser` instead of creating a new one every time you need it.
440    ///     
441    /// **Note**:
442    ///
443    /// - For [`ExifIter`] as parse output, Please avoid holding the `ExifIter`
444    ///   object all the time and drop it immediately after use. Otherwise, the
445    ///   parsing buffer referenced by the `ExifIter` object will not be reused
446    ///   by [`MediaParser`], resulting in repeated memory allocation in the
447    ///   subsequent parsing process.
448    ///
449    ///   If you really need to retain some data, please take out the required
450    ///   Entry values ​​and save them, or convert the `ExifIter` into an
451    ///   [`crate::Exif`] object to retain all Entry values.
452    ///
453    /// - For [`TrackInfo`] as parse output, you don't need to worry about
454    ///   this, because `TrackInfo` dosn't reference the parsing buffer.
455    pub fn parse<R: Read, S, O: ParseOutput<R, S>>(
456        &mut self,
457        mut ms: MediaSource<R, S>,
458    ) -> crate::Result<O> {
459        self.reset();
460        self.acquire_buf();
461
462        self.buf_mut().append(&mut ms.buf);
463        let res = self.do_parse(ms);
464
465        self.reset();
466        res
467    }
468
469    fn do_parse<R: Read, S, O: ParseOutput<R, S>>(
470        &mut self,
471        mut ms: MediaSource<R, S>,
472    ) -> Result<O, crate::Error> {
473        self.fill_buf(&mut ms.reader, INIT_BUF_SIZE)?;
474        let res = ParseOutput::parse(self, ms)?;
475        Ok(res)
476    }
477
478    fn reset(&mut self) {
479        // Ensure buf has been released
480        if let Some(buf) = self.buf.take() {
481            self.bb.release(buf);
482        }
483
484        // Reset position
485        self.set_position(0);
486    }
487
488    pub(crate) fn buf(&self) -> &Vec<u8> {
489        match self.buf.as_ref() {
490            Some(b) => b,
491            None => panic!("no buf here"),
492        }
493    }
494
495    fn buf_mut(&mut self) -> &mut Vec<u8> {
496        match self.buf.as_mut() {
497            Some(b) => b,
498            None => panic!("no buf here"),
499        }
500    }
501
502    fn acquire_buf(&mut self) {
503        assert!(self.buf.is_none());
504        self.buf = Some(self.bb.acquire());
505    }
506}
507
#[cfg(test)]
mod tests {
    use std::sync::{LazyLock, Mutex, MutexGuard};

    use super::*;
    use test_case::case;

    // Expected outcome for a sample file in `parse_media`.
    enum TrackExif {
        // The file is a video and track info parses successfully.
        Track,
        // The file is an image and Exif parses successfully.
        Exif,
        // The source is recognized but parsing is expected to fail.
        NoData,
        // Creating the `MediaSource` itself is expected to fail.
        Invalid,
    }
    use TrackExif::*;

    // A single parser shared by all tests, so buffer reuse across parses is
    // exercised as well. The mutex serializes access between test threads.
    static PARSER: LazyLock<Mutex<MediaParser>> = LazyLock::new(|| Mutex::new(MediaParser::new()));
    fn parser() -> MutexGuard<'static, MediaParser> {
        PARSER.lock().unwrap()
    }

    // Smoke test over the sample files: each one must behave as its
    // `TrackExif` expectation says.
    #[case("3gp_640x360.3gp", Track)]
    #[case("broken.jpg", Exif)]
    #[case("compatible-brands-fail.heic", Invalid)]
    #[case("compatible-brands-fail.mov", Invalid)]
    #[case("compatible-brands.heic", NoData)]
    #[case("compatible-brands.mov", NoData)]
    #[case("embedded-in-heic.mov", Track)]
    #[case("exif.heic", Exif)]
    #[case("exif.jpg", Exif)]
    #[case("exif-no-tz.jpg", Exif)]
    #[case("fujifilm_x_t1_01.raf.meta", Exif)]
    #[case("meta.mov", Track)]
    #[case("meta.mp4", Track)]
    #[case("mka.mka", Track)]
    #[case("mkv_640x360.mkv", Track)]
    #[case("exif-one-entry.heic", Exif)]
    #[case("no-exif.jpg", NoData)]
    #[case("tif.tif", Exif)]
    #[case("ramdisk.img", Invalid)]
    #[case("webm_480.webm", Track)]
    fn parse_media(path: &str, te: TrackExif) {
        let mut parser = parser();
        let ms = MediaSource::file_path(Path::new("testdata").join(path));
        match te {
            Track => {
                let ms = ms.unwrap();
                // println!("path: {path} mime: {:?}", ms.mime);
                assert!(ms.has_track());
                let _: TrackInfo = parser.parse(ms).unwrap();
            }
            Exif => {
                let ms = ms.unwrap();
                // println!("path: {path} mime: {:?}", ms.mime);
                assert!(ms.has_exif());
                let mut it: ExifIter = parser.parse(ms).unwrap();
                // GPS info is optional; only make sure the call does not panic.
                let _ = it.parse_gps_info();

                if path.contains("one-entry") {
                    // Exactly one entry must be yielded.
                    assert!(it.next().is_some());
                    assert!(it.next().is_none());

                    let exif: crate::Exif = it.clone_and_rewind().into();
                    assert!(exif.get(ExifTag::Orientation).is_some());
                } else {
                    let _: crate::Exif = it.clone_and_rewind().into();
                }
            }
            NoData => {
                let ms = ms.unwrap();
                // println!("path: {path} mime: {:?}", ms.mime);
                if ms.has_exif() {
                    let res: Result<ExifIter, _> = parser.parse(ms);
                    res.unwrap_err();
                } else if ms.has_track() {
                    let res: Result<TrackInfo, _> = parser.parse(ms);
                    res.unwrap_err();
                }
            }
            Invalid => {
                ms.unwrap_err();
            }
        }
    }

    use crate::testkit::open_sample;
    use crate::{EntryValue, ExifTag, TrackInfoTag};
    use chrono::DateTime;
    use test_case::test_case;

    use crate::video::TrackInfoTag::*;

    // Checks individual track info values, once through a seekable source and
    // once through an unseekable one; both must yield the same value.
    #[test_case("mkv_640x360.mkv", ImageWidth, 640_u32.into())]
    #[test_case("mkv_640x360.mkv", ImageHeight, 360_u32.into())]
    #[test_case("mkv_640x360.mkv", DurationMs, 13346_u64.into())]
    #[test_case("mkv_640x360.mkv", CreateDate, DateTime::parse_from_str("2008-08-08T08:08:08Z", "%+").unwrap().into())]
    #[test_case("meta.mov", Make, "Apple".into())]
    #[test_case("meta.mov", Model, "iPhone X".into())]
    #[test_case("meta.mov", GpsIso6709, "+27.1281+100.2508+000.000/".into())]
    #[test_case("meta.mp4", ImageWidth, 1920_u32.into())]
    #[test_case("meta.mp4", ImageHeight, 1080_u32.into())]
    #[test_case("meta.mp4", DurationMs, 1063_u64.into())]
    #[test_case("meta.mp4", GpsIso6709, "+27.2939+112.6932/".into())]
    #[test_case("meta.mp4", CreateDate, DateTime::parse_from_str("2024-02-03T07:05:38Z", "%+").unwrap().into())]
    #[test_case("udta.auth.mp4", Author, "ReplayKitRecording".into(); "udta author")]
    #[test_case("auth.mov", Author, "ReplayKitRecording".into(); "mov author")]
    fn parse_track_info(path: &str, tag: TrackInfoTag, v: EntryValue) {
        let mut parser = parser();

        let mf = MediaSource::file(open_sample(path).unwrap()).unwrap();
        let info: TrackInfo = parser.parse(mf).unwrap();
        assert_eq!(info.get(tag).unwrap(), &v);

        let mf = MediaSource::unseekable(open_sample(path).unwrap()).unwrap();
        let info: TrackInfo = parser.parse(mf).unwrap();
        assert_eq!(info.get(tag).unwrap(), &v);
    }

    // Regression samples: parsing may fail, but it must not panic, neither on
    // the seekable nor on the unseekable path.
    #[test_case("crash_moov-trak")]
    #[test_case("crash_skip_large")]
    #[test_case("crash_add_large")]
    fn parse_track_crash(path: &str) {
        let mut parser = parser();

        let mf = MediaSource::file(open_sample(path).unwrap()).unwrap();
        let _: TrackInfo = parser.parse(mf).unwrap_or_default();

        let mf = MediaSource::unseekable(open_sample(path).unwrap()).unwrap();
        let _: TrackInfo = parser.parse(mf).unwrap_or_default();
    }
}
637}