nom_exif/
parser.rs

1use std::{
2    cmp::{max, min},
3    fmt::{Debug, Display},
4    fs::File,
5    io::{self, Read, Seek},
6    marker::PhantomData,
7    net::TcpStream,
8    ops::Range,
9    path::Path,
10};
11
12use crate::{
13    buffer::Buffers,
14    error::{ParsedError, ParsingError, ParsingErrorState},
15    exif::{parse_exif_iter, TiffHeader},
16    file::Mime,
17    partial_vec::PartialVec,
18    skip::Skip,
19    video::parse_track_info,
20    ExifIter, Seekable, TrackInfo, Unseekable,
21};
22
23/// `MediaSource` represents a media data source that can be parsed by
24/// [`MediaParser`].
25///
26/// - Use `MediaSource::file_path(path)` or `MediaSource::file(file)` to create
27///   a MediaSource from a file
28///
29/// - Use `MediaSource::tcp_stream(stream)` to create a MediaSource from a `TcpStream`
30/// - In other cases:
31///
32///   - Use `MediaSource::seekable(reader)` to create a MediaSource from a `Read + Seek`
33///   
///   - Use `MediaSource::unseekable(reader)` to create a MediaSource from a
///     reader that only implements `Read`
36///   
37/// `seekable` is preferred to `unseekable`, since the former is more efficient
38/// when the parser needs to skip a large number of bytes.
39///
40/// Passing in a `BufRead` should be avoided because [`MediaParser`] comes with
41/// its own buffer management and the buffers can be shared between multiple
42/// parsing tasks, thus avoiding frequent memory allocations.
pub struct MediaSource<R, S = Seekable> {
    /// The underlying reader. The bytes stored in `buf` have already been
    /// consumed from it.
    pub(crate) reader: R,
    /// Leading bytes read from `reader` for media type detection (at most
    /// `HEADER_PARSE_BUF_SIZE` bytes). `MediaParser::parse` moves them into
    /// its own parsing buffer.
    pub(crate) buf: Vec<u8>,
    /// Media type detected from the bytes in `buf`.
    pub(crate) mime: Mime,
    /// Zero-sized marker carrying the seek capability `S` (`Seekable` by
    /// default).
    phantom: PhantomData<S>,
}
49
50impl<R, S: Skip<R>> Debug for MediaSource<R, S> {
51    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
52        f.debug_struct("MediaSource")
53            // .field("reader", &self.reader)
54            .field("mime", &self.mime)
55            .field("seekable", &S::debug())
56            .finish_non_exhaustive()
57    }
58}
59
// Maximum number of leading bytes read from a source in order to detect its
// media type. Should be enough for parsing the header.
const HEADER_PARSE_BUF_SIZE: usize = 128;
62
63impl<R: Read, S: Skip<R>> MediaSource<R, S> {
64    #[tracing::instrument(skip(reader))]
65    fn build(mut reader: R) -> crate::Result<Self> {
66        // TODO: reuse MediaParser to parse header
67        let mut buf = Vec::with_capacity(HEADER_PARSE_BUF_SIZE);
68        reader
69            .by_ref()
70            .take(HEADER_PARSE_BUF_SIZE as u64)
71            .read_to_end(&mut buf)?;
72        let mime: Mime = buf.as_slice().try_into()?;
73        tracing::debug!(?mime);
74        Ok(Self {
75            reader,
76            buf,
77            mime,
78            phantom: PhantomData,
79        })
80    }
81
82    pub fn has_track(&self) -> bool {
83        match self.mime {
84            Mime::Image(_) => false,
85            Mime::Video(_) => true,
86        }
87    }
88
89    pub fn has_exif(&self) -> bool {
90        match self.mime {
91            Mime::Image(_) => true,
92            Mime::Video(_) => false,
93        }
94    }
95}
96
impl<R: Read + Seek> MediaSource<R, Seekable> {
    /// Creates a `MediaSource` from a reader that implements both `Read` and
    /// `Seek`.
    ///
    /// The leading bytes of `reader` are read immediately to detect the media
    /// type; an error is returned if reading or detection fails.
    pub fn seekable(reader: R) -> crate::Result<Self> {
        Self::build(reader)
    }
}
102
impl<R: Read> MediaSource<R, Unseekable> {
    /// Creates a `MediaSource` from a reader that only implements `Read`.
    ///
    /// The leading bytes of `reader` are read immediately to detect the media
    /// type; an error is returned if reading or detection fails.
    pub fn unseekable(reader: R) -> crate::Result<Self> {
        Self::build(reader)
    }
}
108
109impl MediaSource<File, Seekable> {
110    pub fn file_path<P: AsRef<Path>>(path: P) -> crate::Result<Self> {
111        Self::seekable(File::open(path)?)
112    }
113
114    pub fn file(file: File) -> crate::Result<Self> {
115        Self::seekable(file)
116    }
117}
118
impl MediaSource<TcpStream, Unseekable> {
    /// Creates an unseekable `MediaSource` that reads from `stream`.
    ///
    /// The leading bytes are read from the stream immediately to detect the
    /// media type; an error is returned if reading or detection fails.
    pub fn tcp_stream(stream: TcpStream) -> crate::Result<Self> {
        Self::unseekable(stream)
    }
}
124
// Keep aligned with 4K.
// INIT_BUF_SIZE: number of bytes requested from the reader before parsing starts.
pub(crate) const INIT_BUF_SIZE: usize = 4096;
// MIN_GROW_SIZE: minimum number of bytes requested whenever the buffer needs more data.
pub(crate) const MIN_GROW_SIZE: usize = 4096;
// Max size of APP1 is 0xFFFF
// (currently disabled upper bound for a single grow step)
// pub(crate) const MAX_GROW_SIZE: usize = 63 * 1024;
// Set a reasonable upper limit for single buffer allocation (1 GiB).
pub(crate) const MAX_ALLOC_SIZE: usize = 1024 * 1024 * 1024;
132
/// Minimal view of a parsing buffer with a movable read position.
pub(crate) trait Buf {
    /// Returns the bytes currently available for parsing.
    fn buffer(&self) -> &[u8];
    /// Discards the buffered bytes.
    fn clear(&mut self);

    /// Sets the read position inside the underlying buffer.
    fn set_position(&mut self, pos: usize);
    /// Returns the current read position inside the underlying buffer.
    #[allow(unused)]
    fn position(&self) -> usize;
}
141
/// State a parse callback can hand back together with an error; the parsing
/// loop passes it to the next invocation of the callback.
#[derive(Debug, Clone)]
pub(crate) enum ParsingState {
    /// Carries a `TiffHeader` — presumably one that was already parsed, so
    /// the next attempt does not need to parse it again (TODO confirm).
    TiffHeader(TiffHeader),
    /// Carries a size — presumably the size of the Exif data found in a HEIF
    /// file (TODO confirm against the HEIF parser).
    HeifExifSize(usize),
}
147
148impl Display for ParsingState {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        match self {
151            ParsingState::TiffHeader(h) => Display::fmt(&format!("ParsingState: {h:?})"), f),
152            ParsingState::HeifExifSize(n) => Display::fmt(&format!("ParsingState: {n}"), f),
153        }
154    }
155}
156
157pub(crate) trait BufParser: Buf + Debug {
158    fn fill_buf<R: Read>(&mut self, reader: &mut R, size: usize) -> io::Result<usize>;
159    fn load_and_parse<R: Read, S: Skip<R>, P, O>(
160        &mut self,
161        reader: &mut R,
162        mut parse: P,
163    ) -> Result<O, ParsedError>
164    where
165        P: FnMut(&[u8], Option<ParsingState>) -> Result<O, ParsingErrorState>,
166    {
167        self.load_and_parse_with_offset::<R, S, _, _>(
168            reader,
169            |data, _, state| parse(data, state),
170            0,
171        )
172    }
173
174    #[tracing::instrument(skip_all)]
175    fn load_and_parse_with_offset<R: Read, S: Skip<R>, P, O>(
176        &mut self,
177        reader: &mut R,
178        mut parse: P,
179        offset: usize,
180    ) -> Result<O, ParsedError>
181    where
182        P: FnMut(&[u8], usize, Option<ParsingState>) -> Result<O, ParsingErrorState>,
183    {
184        if offset >= self.buffer().len() {
185            self.fill_buf(reader, MIN_GROW_SIZE)?;
186        }
187
188        let mut parsing_state: Option<ParsingState> = None;
189        loop {
190            let res = parse(self.buffer(), offset, parsing_state.take());
191            match res {
192                Ok(o) => return Ok(o),
193                Err(es) => {
194                    tracing::debug!(?es);
195                    parsing_state = es.state;
196
197                    match es.err {
198                        ParsingError::ClearAndSkip(n) => {
199                            self.clear_and_skip::<R, S>(reader, n)?;
200                        }
201                        ParsingError::Need(i) => {
202                            tracing::debug!(need = i, "need more bytes");
203                            let to_read = max(i, MIN_GROW_SIZE);
204                            // let to_read = min(to_read, MAX_GROW_SIZE);
205
206                            let n = self.fill_buf(reader, to_read)?;
207                            if n == 0 {
208                                return Err(ParsedError::NoEnoughBytes);
209                            }
210                            tracing::debug!(n, "actual read");
211                        }
212                        ParsingError::Failed(s) => return Err(ParsedError::Failed(s)),
213                    }
214                }
215            }
216        }
217    }
218
219    #[tracing::instrument(skip(reader))]
220    fn clear_and_skip<R: Read, S: Skip<R>>(
221        &mut self,
222        reader: &mut R,
223        n: usize,
224    ) -> Result<(), ParsedError> {
225        tracing::debug!("ClearAndSkip");
226        if n <= self.buffer().len() {
227            tracing::debug!(n, "skip by set_position");
228            self.set_position(n);
229            return Ok(());
230        }
231
232        let skip_n = n - self.buffer().len();
233        tracing::debug!(skip_n, "clear and skip bytes");
234        self.clear();
235
236        let done = S::skip_by_seek(
237            reader,
238            skip_n
239                .try_into()
240                .map_err(|_| ParsedError::Failed("skip too many bytes".into()))?,
241        )?;
242        if !done {
243            tracing::debug!(skip_n, "skip by using our buffer");
244            let mut skipped = 0;
245            while skipped < skip_n {
246                let mut to_skip = skip_n - skipped;
247                to_skip = min(to_skip, MAX_ALLOC_SIZE);
248                let n = self.fill_buf(reader, to_skip)?;
249                skipped += n;
250                if skipped <= skip_n {
251                    self.clear();
252                } else {
253                    let remain = skipped - skip_n;
254                    self.set_position(self.buffer().len() - remain);
255                    break;
256                }
257            }
258        } else {
259            tracing::debug!(skip_n, "skip with seek");
260        }
261
262        if self.buffer().is_empty() {
263            self.fill_buf(reader, MIN_GROW_SIZE)?;
264        }
265        Ok(())
266    }
267}
268
269impl BufParser for MediaParser {
270    #[tracing::instrument(skip(self, reader), fields(buf_len=self.buf().len()))]
271    fn fill_buf<R: Read>(&mut self, reader: &mut R, size: usize) -> io::Result<usize> {
272        if size.saturating_add(self.buf().len()) > MAX_ALLOC_SIZE {
273            tracing::error!(?size, "the requested buffer size is too big");
274            return Err(io::ErrorKind::Unsupported.into());
275        }
276        self.buf_mut().reserve_exact(size);
277
278        let n = reader.take(size as u64).read_to_end(self.buf_mut())?;
279        if n == 0 {
280            tracing::error!(buf_len = self.buf().len(), "fill_buf: EOF");
281            return Err(std::io::ErrorKind::UnexpectedEof.into());
282        }
283
284        tracing::debug!(
285            ?size,
286            ?n,
287            buf_len = self.buf().len(),
288            "fill_buf: read bytes"
289        );
290
291        Ok(n)
292    }
293}
294
295impl Buf for MediaParser {
296    fn buffer(&self) -> &[u8] {
297        &self.buf()[self.position..]
298    }
299
300    fn clear(&mut self) {
301        self.buf_mut().clear();
302    }
303
304    fn set_position(&mut self, pos: usize) {
305        self.position = pos;
306    }
307
308    fn position(&self) -> usize {
309        self.position
310    }
311}
312
/// A type that can be produced by parsing a [`MediaSource`] with a
/// [`MediaParser`]. It is the bound on the output type of `MediaParser::parse`.
pub trait ParseOutput<R, S>: Sized {
    /// Parses `ms` using the buffers of `parser` and returns the output.
    fn parse(parser: &mut MediaParser, ms: MediaSource<R, S>) -> crate::Result<Self>;
}
316
317impl<R: Read, S: Skip<R>> ParseOutput<R, S> for ExifIter {
318    fn parse(parser: &mut MediaParser, mut ms: MediaSource<R, S>) -> crate::Result<Self> {
319        if !ms.has_exif() {
320            return Err(crate::Error::ParseFailed("no Exif data here".into()));
321        }
322        parse_exif_iter::<R, S>(parser, ms.mime.unwrap_image(), &mut ms.reader)
323    }
324}
325
326impl<R: Read, S: Skip<R>> ParseOutput<R, S> for TrackInfo {
327    fn parse(parser: &mut MediaParser, mut ms: MediaSource<R, S>) -> crate::Result<Self> {
328        if !ms.has_track() {
329            return Err(crate::Error::ParseFailed("no track info here".into()));
330        }
331        let out = parser.load_and_parse::<R, S, _, _>(ms.reader.by_ref(), |data, _| {
332            parse_track_info(data, ms.mime.unwrap_video())
333                .map_err(|e| ParsingErrorState::new(e, None))
334        })?;
335        Ok(out)
336    }
337}
338
339/// A `MediaParser`/`AsyncMediaParser` can parse media info from a
340/// [`MediaSource`].
341///
342/// `MediaParser`/`AsyncMediaParser` manages inner parse buffers that can be
343/// shared between multiple parsing tasks, thus avoiding frequent memory
344/// allocations.
345///
346/// Therefore:
347///
348/// - Try to reuse a `MediaParser`/`AsyncMediaParser` instead of creating a new
349///   one every time you need it.
350///   
351/// - `MediaSource` should be created directly from `Read`, not from `BufRead`.
352///
353/// ## Example
354///
355/// ```rust
356/// use nom_exif::*;
357/// use chrono::DateTime;
358///
359/// let mut parser = MediaParser::new();
360///
361/// // ------------------- Parse Exif Info
362/// let ms = MediaSource::file_path("./testdata/exif.heic").unwrap();
363/// assert!(ms.has_exif());
364/// let mut iter: ExifIter = parser.parse(ms).unwrap();
365///
366/// let entry = iter.next().unwrap();
367/// assert_eq!(entry.tag().unwrap(), ExifTag::Make);
368/// assert_eq!(entry.get_value().unwrap().as_str().unwrap(), "Apple");
369///
370/// // Convert `ExifIter` into an `Exif`. Clone it before converting, so that
371/// // we can start the iteration from the beginning.
372/// let exif: Exif = iter.clone().into();
373/// assert_eq!(exif.get(ExifTag::Make).unwrap().as_str().unwrap(), "Apple");
374///
375/// // ------------------- Parse Track Info
376/// let ms = MediaSource::file_path("./testdata/meta.mov").unwrap();
377/// assert!(ms.has_track());
378/// let info: TrackInfo = parser.parse(ms).unwrap();
379///
380/// assert_eq!(info.get(TrackInfoTag::Make), Some(&"Apple".into()));
381/// assert_eq!(info.get(TrackInfoTag::Model), Some(&"iPhone X".into()));
382/// assert_eq!(info.get(TrackInfoTag::GpsIso6709), Some(&"+27.1281+100.2508+000.000/".into()));
383/// assert_eq!(info.get_gps_info().unwrap().latitude_ref, 'N');
384/// assert_eq!(
385///     info.get_gps_info().unwrap().latitude,
386///     [(27, 1), (7, 1), (68, 100)].into(),
387/// );
388/// ```
pub struct MediaParser {
    /// Buffer pool from which parsing buffers are acquired and to which they
    /// are released again.
    bb: Buffers,
    /// The buffer of the parse currently in progress. `None` when no parse is
    /// running or after the buffer has been handed out via `share_buf`.
    buf: Option<Vec<u8>>,
    /// Offset into `buf` at which the not yet consumed data starts.
    position: usize,
}
394
395impl Debug for MediaParser {
396    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
397        f.debug_struct("MediaParser")
398            .field("buffers", &self.bb)
399            .field("buf len", &self.buf.as_ref().map(|x| x.len()))
400            .field("position", &self.position)
401            .finish_non_exhaustive()
402    }
403}
404
405impl Default for MediaParser {
406    fn default() -> Self {
407        Self {
408            bb: Buffers::new(),
409            buf: None,
410            position: 0,
411        }
412    }
413}
414
/// Implemented by parsers that can hand out part of their parsing buffer.
pub(crate) trait ShareBuf {
    /// Returns the given `range` of the parsing buffer as a [`PartialVec`].
    fn share_buf(&mut self, range: Range<usize>) -> PartialVec;
}
418
419impl ShareBuf for MediaParser {
420    fn share_buf(&mut self, mut range: Range<usize>) -> PartialVec {
421        let buf = self.buf.take().unwrap();
422        let vec = self.bb.release_to_share(buf);
423        range.start += self.position;
424        range.end += self.position;
425        PartialVec::new(vec, range)
426    }
427}
428
429impl MediaParser {
430    pub fn new() -> Self {
431        Self::default()
432    }
433
434    /// `MediaParser`/`AsyncMediaParser` comes with its own buffer management,
435    /// so that buffers can be reused during multiple parsing processes to
436    /// avoid frequent memory allocations. Therefore, try to reuse a
437    /// `MediaParser` instead of creating a new one every time you need it.
438    ///     
439    /// **Note**:
440    ///
441    /// - For [`ExifIter`] as parse output, Please avoid holding the `ExifIter`
442    ///   object all the time and drop it immediately after use. Otherwise, the
443    ///   parsing buffer referenced by the `ExifIter` object will not be reused
444    ///   by [`MediaParser`], resulting in repeated memory allocation in the
445    ///   subsequent parsing process.
446    ///
447    ///   If you really need to retain some data, please take out the required
448    ///   Entry values ​​and save them, or convert the `ExifIter` into an
449    ///   [`crate::Exif`] object to retain all Entry values.
450    ///
451    /// - For [`TrackInfo`] as parse output, you don't need to worry about
452    ///   this, because `TrackInfo` dosn't reference the parsing buffer.
453    pub fn parse<R: Read, S, O: ParseOutput<R, S>>(
454        &mut self,
455        mut ms: MediaSource<R, S>,
456    ) -> crate::Result<O> {
457        self.reset();
458        self.acquire_buf();
459
460        self.buf_mut().append(&mut ms.buf);
461        let res = self.do_parse(ms);
462
463        self.reset();
464        res
465    }
466
467    fn do_parse<R: Read, S, O: ParseOutput<R, S>>(
468        &mut self,
469        mut ms: MediaSource<R, S>,
470    ) -> Result<O, crate::Error> {
471        self.fill_buf(&mut ms.reader, INIT_BUF_SIZE)?;
472        let res = ParseOutput::parse(self, ms)?;
473        Ok(res)
474    }
475
476    fn reset(&mut self) {
477        // Ensure buf has been released
478        if let Some(buf) = self.buf.take() {
479            self.bb.release(buf);
480        }
481
482        // Reset position
483        self.set_position(0);
484    }
485
486    pub(crate) fn buf(&self) -> &Vec<u8> {
487        match self.buf.as_ref() {
488            Some(b) => b,
489            None => panic!("no buf here"),
490        }
491    }
492
493    fn buf_mut(&mut self) -> &mut Vec<u8> {
494        match self.buf.as_mut() {
495            Some(b) => b,
496            None => panic!("no buf here"),
497        }
498    }
499
500    fn acquire_buf(&mut self) {
501        assert!(self.buf.is_none());
502        self.buf = Some(self.bb.acquire());
503    }
504}
505
// Tests parsing the sample files (looked up under `testdata` / via
// `open_sample`) with one shared `MediaParser`.
#[cfg(test)]
mod tests {
    use std::sync::{LazyLock, Mutex, MutexGuard};

    use super::*;
    use test_case::case;

    // Expected outcome for a sample file in `parse_media`.
    enum TrackExif {
        // The file parses into a `TrackInfo`.
        Track,
        // The file parses into an `ExifIter`.
        Exif,
        // The media type is detected, but parsing returns an error.
        NoData,
        // Creating the `MediaSource` already fails.
        Invalid,
    }
    use TrackExif::*;

    // A single parser shared by all tests, so that buffer reuse across
    // parsing runs is exercised as well.
    static PARSER: LazyLock<Mutex<MediaParser>> = LazyLock::new(|| Mutex::new(MediaParser::new()));
    // Locks and returns the shared parser.
    fn parser() -> MutexGuard<'static, MediaParser> {
        PARSER.lock().unwrap()
    }

    // Parses each sample file and checks the outcome described by `te`.
    #[case("3gp_640x360.3gp", Track)]
    #[case("broken.jpg", Exif)]
    #[case("compatible-brands-fail.heic", Invalid)]
    #[case("compatible-brands-fail.mov", Invalid)]
    #[case("compatible-brands.heic", NoData)]
    #[case("compatible-brands.mov", NoData)]
    #[case("embedded-in-heic.mov", Track)]
    #[case("exif.heic", Exif)]
    #[case("exif.jpg", Exif)]
    #[case("exif-no-tz.jpg", Exif)]
    #[case("fujifilm_x_t1_01.raf.meta", Exif)]
    #[case("meta.mov", Track)]
    #[case("meta.mp4", Track)]
    #[case("mka.mka", Track)]
    #[case("mkv_640x360.mkv", Track)]
    #[case("exif-one-entry.heic", Exif)]
    #[case("no-exif.jpg", NoData)]
    #[case("tif.tif", Exif)]
    #[case("ramdisk.img", Invalid)]
    #[case("webm_480.webm", Track)]
    fn parse_media(path: &str, te: TrackExif) {
        let mut parser = parser();
        let ms = MediaSource::file_path(Path::new("testdata").join(path));
        match te {
            Track => {
                let ms = ms.unwrap();
                // println!("path: {path} mime: {:?}", ms.mime);
                assert!(ms.has_track());
                let _: TrackInfo = parser.parse(ms).unwrap();
            }
            Exif => {
                let ms = ms.unwrap();
                // println!("path: {path} mime: {:?}", ms.mime);
                assert!(ms.has_exif());
                let mut it: ExifIter = parser.parse(ms).unwrap();
                let _ = it.parse_gps_info();

                if path.contains("one-entry") {
                    // Exactly one entry is expected from the iterator.
                    assert!(it.next().is_some());
                    assert!(it.next().is_none());

                    // Converting a rewound clone must still see that entry.
                    let exif: crate::Exif = it.clone_and_rewind().into();
                    assert!(exif.get(ExifTag::Orientation).is_some());
                } else {
                    let _: crate::Exif = it.clone_and_rewind().into();
                }
            }
            NoData => {
                let ms = ms.unwrap();
                // println!("path: {path} mime: {:?}", ms.mime);
                if ms.has_exif() {
                    let res: Result<ExifIter, _> = parser.parse(ms);
                    res.unwrap_err();
                } else if ms.has_track() {
                    let res: Result<TrackInfo, _> = parser.parse(ms);
                    res.unwrap_err();
                }
            }
            Invalid => {
                ms.unwrap_err();
            }
        }
    }

    use crate::testkit::open_sample;
    use crate::{EntryValue, ExifTag, TrackInfoTag};
    use chrono::DateTime;
    use test_case::test_case;

    use crate::video::TrackInfoTag::*;

    // Checks a single track info value, once through a seekable and once
    // through an unseekable source.
    #[test_case("mkv_640x360.mkv", ImageWidth, 640_u32.into())]
    #[test_case("mkv_640x360.mkv", ImageHeight, 360_u32.into())]
    #[test_case("mkv_640x360.mkv", DurationMs, 13346_u64.into())]
    #[test_case("mkv_640x360.mkv", CreateDate, DateTime::parse_from_str("2008-08-08T08:08:08Z", "%+").unwrap().into())]
    #[test_case("meta.mov", Make, "Apple".into())]
    #[test_case("meta.mov", Model, "iPhone X".into())]
    #[test_case("meta.mov", GpsIso6709, "+27.1281+100.2508+000.000/".into())]
    #[test_case("meta.mp4", ImageWidth, 1920_u32.into())]
    #[test_case("meta.mp4", ImageHeight, 1080_u32.into())]
    #[test_case("meta.mp4", DurationMs, 1063_u64.into())]
    #[test_case("meta.mp4", GpsIso6709, "+27.2939+112.6932/".into())]
    #[test_case("meta.mp4", CreateDate, DateTime::parse_from_str("2024-02-03T07:05:38Z", "%+").unwrap().into())]
    #[test_case("udta.auth.mp4", Author, "ReplayKitRecording".into(); "udta author")]
    #[test_case("auth.mov", Author, "ReplayKitRecording".into(); "mov author")]
    fn parse_track_info(path: &str, tag: TrackInfoTag, v: EntryValue) {
        let mut parser = parser();

        let mf = MediaSource::file(open_sample(path).unwrap()).unwrap();
        let info: TrackInfo = parser.parse(mf).unwrap();
        assert_eq!(info.get(tag).unwrap(), &v);

        let mf = MediaSource::unseekable(open_sample(path).unwrap()).unwrap();
        let info: TrackInfo = parser.parse(mf).unwrap();
        assert_eq!(info.get(tag).unwrap(), &v);
    }

    // Samples named `crash_*`: parsing only has to return without panicking,
    // a parse error is acceptable (hence `unwrap_or_default`).
    #[test_case("crash_moov-trak")]
    #[test_case("crash_skip_large")]
    #[test_case("crash_add_large")]
    fn parse_track_crash(path: &str) {
        let mut parser = parser();

        let mf = MediaSource::file(open_sample(path).unwrap()).unwrap();
        let _: TrackInfo = parser.parse(mf).unwrap_or_default();

        let mf = MediaSource::unseekable(open_sample(path).unwrap()).unwrap();
        let _: TrackInfo = parser.parse(mf).unwrap_or_default();
    }
}
635}