1use std::io::Read;
2use std::collections::{HashSet, VecDeque};
3
4use crate::spec_util::validate_tag_path;
5use crate::tag_iterator_util::EBMLSize::{Known, Unknown};
6use crate::tag_iterator_util::{DEFAULT_BUFFER_LEN, EBMLSize, ProcessingTag, AllowableErrors};
7
8use super::tools;
9use super::specs::{EbmlSpecification, EbmlTag, Master, TagDataType, PathPart};
10use super::errors::tag_iterator::{CorruptedFileError, TagIteratorError};
11use super::errors::tool::ToolError;
12
/// Bit of `allowed_errors` that suppresses the "tag id not in specification" error.
const INVALID_TAG_ID_ERROR: u8 = 1 << 0;
/// Bit of `allowed_errors` that suppresses tag hierarchy validation.
const INVALID_HIERARCHY_ERROR: u8 = 1 << 1;
/// Bit of `allowed_errors` that suppresses the "child larger than its parent" error.
const OVERSIZED_CHILD_ERROR: u8 = 1 << 2;
16
17pub struct TagIterator<R: Read, TSpec>
52 where
53 TSpec: EbmlSpecification<TSpec> + EbmlTag<TSpec> + Clone
54{
55 source: R,
56 tag_ids_to_buffer: HashSet<u64>,
57 allowed_errors: u8,
58 max_allowed_tag_size: Option<usize>,
59
60 buffer: Box<[u8]>,
61 buffer_offset: Option<usize>,
62 buffered_byte_length: usize,
63 internal_buffer_position: usize,
64 tag_stack: Vec<ProcessingTag<TSpec>>,
65 emission_queue: VecDeque<Result<(TSpec, usize), TagIteratorError>>,
66 last_emitted_tag_offset: usize,
67 has_determined_doc_path: bool,
68
69 emit_master_end_when_eof: bool,
70}
71
72impl<R: Read, TSpec> TagIterator<R, TSpec>
73 where
74 TSpec: EbmlSpecification<TSpec> + EbmlTag<TSpec> + Clone
75{
76
77 pub fn new(source: R, tags_to_buffer: &[TSpec]) -> Self {
83 TagIterator::with_capacity(source, tags_to_buffer, DEFAULT_BUFFER_LEN)
84 }
85
86 pub fn with_capacity(source: R, tags_to_buffer: &[TSpec], capacity: usize) -> Self {
92 let buffer = vec![0;capacity];
93
94 TagIterator {
95 source,
96 tag_ids_to_buffer: tags_to_buffer.iter().map(|tag| tag.get_id()).collect(),
97 allowed_errors: 0,
98 max_allowed_tag_size: Some(4 * usize::pow(1000, 3)), buffer: buffer.into_boxed_slice(),
100 buffered_byte_length: 0,
101 buffer_offset: None,
102 internal_buffer_position: 0,
103 tag_stack: Vec::new(),
104 emission_queue: VecDeque::new(),
105 last_emitted_tag_offset: 0,
106 has_determined_doc_path: false,
107 emit_master_end_when_eof: true,
108 }
109 }
110
111 pub fn allow_errors(&mut self, errors: &[AllowableErrors]) {
125 self.allowed_errors = errors.iter().fold(0u8, |a, c| match c {
126 AllowableErrors::InvalidTagIds => a | INVALID_TAG_ID_ERROR ,
127 AllowableErrors::HierarchyProblems => a | INVALID_HIERARCHY_ERROR,
128 AllowableErrors::OversizedTags => a | OVERSIZED_CHILD_ERROR,
129 });
130 }
131
132 pub fn set_max_allowable_tag_size(&mut self, size: Option<usize>) {
138 self.max_allowed_tag_size = size;
139 }
140
141 pub fn try_recover(&mut self) -> Result<(), TagIteratorError> {
147 let original_position = self.current_offset();
148 loop {
149 if !self.ensure_data_read(1)? {
150 return Err(TagIteratorError::UnexpectedEOF { tag_start: self.current_offset(), tag_id: None, tag_size: None, partial_data: None });
151 }
152
153 self.internal_buffer_position += 1;
154 if self.peek_valid_tag_header().is_ok() {
155 break;
156 }
157 }
158
159 let diff = self.current_offset() - original_position;
161 for tag in self.tag_stack.iter_mut() {
162 if let EBMLSize::Known(size) = &tag.size {
163 tag.size = EBMLSize::Known(size + diff);
164 }
165 }
166
167 Ok(())
168 }
169
170 pub fn into_inner(self) -> R {
176 self.source
177 }
178
179 pub fn get_mut(&mut self) -> &mut R {
185 &mut self.source
186 }
187
188 pub fn get_ref(&self) -> &R {
194 &self.source
195 }
196
197 pub fn last_emitted_tag_offset(&self) -> usize {
203 self.last_emitted_tag_offset
204 }
205
206 pub fn emit_master_end_when_eof(&mut self, emit: bool) {
214 self.emit_master_end_when_eof = emit;
215 }
216
217 #[inline(always)]
218 fn current_offset(&self) -> usize {
219 self.buffer_offset.unwrap_or(0) + self.internal_buffer_position
220 }
221
222 fn private_read(&mut self, internal_buffer_start: usize) -> Result<bool, TagIteratorError> {
223 let bytes_read = self.source.read(&mut self.buffer[internal_buffer_start..]).map_err(|source| TagIteratorError::ReadError { source })?;
224 if bytes_read == 0 {
225 Ok(false)
226 } else {
227 self.buffered_byte_length += bytes_read;
228 Ok(true)
229 }
230 }
231
232 fn ensure_capacity(&mut self, required_capacity: usize) {
233 if required_capacity > self.buffer.len() {
234 let mut new_buffer = Vec::from(&self.buffer[..]);
235 new_buffer.resize(required_capacity, 0);
236 self.buffer = new_buffer.into_boxed_slice();
237 }
238 }
239
240 fn ensure_data_read(&mut self, length: usize) -> Result<bool, TagIteratorError> {
241 if self.internal_buffer_position + length <= self.buffered_byte_length {
242 return Ok(true)
243 }
244
245 if self.buffer_offset.is_none() {
246 if !self.private_read(0)? {
247 return Ok(false);
248 }
249 self.buffer_offset = Some(0);
250 self.internal_buffer_position = 0;
251 } else {
252 while self.internal_buffer_position + length > self.buffered_byte_length {
253 self.buffer.copy_within(self.internal_buffer_position..self.buffered_byte_length, 0);
254 self.buffered_byte_length -= self.internal_buffer_position;
255 self.buffer_offset = Some(self.current_offset());
256 self.internal_buffer_position = 0;
257 if !self.private_read(self.buffered_byte_length)? {
258 return Ok(false);
259 }
260 }
261 }
262 Ok(true)
263 }
264
265 #[inline(always)]
266 fn peek_tag_id(&mut self) -> Result<(u64, usize), TagIteratorError> {
267 self.ensure_data_read(8)?;
268 if self.buffer[self.internal_buffer_position] == 0 {
269 return Ok((0, 1));
270 }
271 let length = 8 - self.buffer[self.internal_buffer_position].ilog2() as usize;
272 let mut val = self.buffer[self.internal_buffer_position] as u64;
273 for i in 1..length {
274 val <<= 8;
275 val += self.buffer[self.internal_buffer_position+i] as u64;
276 }
277 Ok((val, length))
278 }
279
280 #[inline]
281 fn peek_valid_tag_header(&mut self) -> Result<(u64, Option<TagDataType>, EBMLSize, usize), TagIteratorError> {
282 self.ensure_data_read(16)?;
283 let (tag_id, id_len) = self.peek_tag_id()?;
284 let spec_tag_type = <TSpec>::get_tag_data_type(tag_id);
285
286 let (size, size_len) = tools::read_vint(&self.buffer[(self.internal_buffer_position + id_len)..])
287 .or(Err(TagIteratorError::CorruptedFileData(CorruptedFileError::InvalidTagData{tag_id, position: self.current_offset() })))?
288 .ok_or(TagIteratorError::UnexpectedEOF { tag_start: self.current_offset(), tag_id: Some(tag_id), tag_size: None, partial_data: None })?;
289
290 if self.buffered_byte_length <= id_len + size_len {
291 return Err(TagIteratorError::UnexpectedEOF { tag_start: self.current_offset(), tag_id: Some(tag_id), tag_size: None, partial_data: None });
292 }
293
294 if matches!(spec_tag_type, Some(TagDataType::UnsignedInt) | Some(TagDataType::Integer) | Some(TagDataType::Float)) && size > 8 {
295 return Err(TagIteratorError::CorruptedFileData(CorruptedFileError::InvalidTagData{tag_id, position: self.current_offset() }));
296 }
297
298 let size = EBMLSize::new(size, size_len);
299
300 let header_len = id_len + size_len;
301
302 if (self.allowed_errors & INVALID_TAG_ID_ERROR == 0) && spec_tag_type.is_none() {
303 return Err(TagIteratorError::CorruptedFileData(CorruptedFileError::InvalidTagId{tag_id, position: self.current_offset() }));
304 }
305
306 if (self.allowed_errors & INVALID_HIERARCHY_ERROR == 0) && spec_tag_type.is_some() {
307 if !self.has_determined_doc_path {
309 let path = <TSpec>::get_path_by_id(tag_id);
311 if path.iter().all(|p| matches!(p, PathPart::Id(_))) {
312 self.tag_stack = path.iter().map(|id| {
314 match id {
315 PathPart::Id(id) => {
316 ProcessingTag {
317 tag: <TSpec>::get_master_tag(*id, Master::Start).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was in path, but could not get master tag!", id)),
318 size: EBMLSize::Unknown,
319 tag_start: 0,
320 data_start: 0,
321 }
322 },
323 PathPart::Global(_) => unreachable!()
324 }
325 }).collect();
326 self.has_determined_doc_path = true;
327 }
328 }
329 if self.has_determined_doc_path && !self.validate_tag_path(tag_id) {
330 return Err(TagIteratorError::CorruptedFileData(CorruptedFileError::HierarchyError { found_tag_id: tag_id, current_parent_id: self.tag_stack.last().map(|tag| tag.tag.get_id()) }));
331 }
332 }
333
334 if (self.allowed_errors & OVERSIZED_CHILD_ERROR == 0) && size.is_known() && self.is_invalid_tag_size(header_len + size.value()) {
335 return Err(TagIteratorError::CorruptedFileData(CorruptedFileError::OversizedChildElement{ position: self.current_offset(), tag_id, size: size.value()}));
336 }
337
338 if let Some(max_size) = self.max_allowed_tag_size {
339 if size.is_known() && size.value() > max_size {
340 return Err(TagIteratorError::CorruptedFileData(CorruptedFileError::InvalidTagSize { position: self.current_offset(), tag_id, size: size.value() }));
341 }
342 }
343
344 Ok((tag_id, spec_tag_type, size, header_len))
345 }
346
347 #[inline(always)]
348 fn read_valid_tag_header(&mut self) -> Result<(u64, Option<TagDataType>, EBMLSize), TagIteratorError> {
349 let (tag_id, spec_tag_type, size, header_len) = self.peek_valid_tag_header()?;
350
351 self.internal_buffer_position += header_len;
352 Ok((tag_id, spec_tag_type, size))
353 }
354
355 fn read_tag_data(&mut self, size: usize) -> Result<Option<&[u8]>, TagIteratorError> {
356 self.ensure_capacity(size);
357 if !self.ensure_data_read(size)? {
358 return Ok(None);
359 }
360
361 self.internal_buffer_position += size;
362 Ok(Some(&self.buffer[(self.internal_buffer_position-size)..self.internal_buffer_position]))
363 }
364
365 fn read_tag(&mut self) -> Result<ProcessingTag<TSpec>, TagIteratorError> {
366 let tag_start = self.current_offset();
367
368 let (tag_id, spec_tag_type, size) = self.read_valid_tag_header()?;
369
370 let data_start = self.current_offset();
371 let raw_data = if matches!(spec_tag_type, Some(TagDataType::Master)) {
372 &[]
373 } else if let Known(size) = size {
374 if let Some(data) = self.read_tag_data(size)? {
375 data
376 } else {
377 return Err(TagIteratorError::UnexpectedEOF { tag_start, tag_id: Some(tag_id), tag_size: Some(size), partial_data: Some(self.buffer[self.internal_buffer_position..].to_vec()) });
378 }
379 } else {
380 return Err(TagIteratorError::CorruptedFileData(CorruptedFileError::InvalidTagData{ tag_id, position: tag_start }));
381 };
382
383 let tag = match spec_tag_type {
384 Some(TagDataType::Master) => {
385 TSpec::get_master_tag(tag_id, Master::Start).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was master, but could not get tag!", tag_id))
386 },
387 Some(TagDataType::UnsignedInt) => {
388 let val = tools::arr_to_u64(raw_data).map_err(|e| TagIteratorError::CorruptedTagData{ tag_id, problem: e })?;
389 TSpec::get_unsigned_int_tag(tag_id, val).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was unsigned int, but could not get tag!", tag_id))
390 },
391 Some(TagDataType::Integer) => {
392 let val = tools::arr_to_i64(raw_data).map_err(|e| TagIteratorError::CorruptedTagData{ tag_id, problem: e })?;
393 TSpec::get_signed_int_tag(tag_id, val).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was integer, but could not get tag!", tag_id))
394 },
395 Some(TagDataType::Utf8) => {
396 let val = String::from_utf8(raw_data.to_vec()).map_err(|e| TagIteratorError::CorruptedTagData{ tag_id, problem: ToolError::FromUtf8Error(raw_data.to_vec(), e) })?;
397 TSpec::get_utf8_tag(tag_id, val).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was utf8, but could not get tag!", tag_id))
398 },
399 Some(TagDataType::Binary) => {
400 TSpec::get_binary_tag(tag_id, raw_data).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was binary, but could not get tag!", tag_id))
401 },
402 Some(TagDataType::Float) => {
403 let val = tools::arr_to_f64(raw_data).map_err(|e| TagIteratorError::CorruptedTagData{ tag_id, problem: e })?;
404 TSpec::get_float_tag(tag_id, val).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was float, but could not get tag!", tag_id))
405 },
406 None => {
407 TSpec::get_raw_tag(tag_id, raw_data)
408 }
409 };
410
411 Ok(ProcessingTag { tag, size, tag_start, data_start })
412 }
413
414 fn read_tag_checked(&mut self) -> Option<Result<ProcessingTag<TSpec>, TagIteratorError>> {
415 if self.internal_buffer_position == self.buffered_byte_length {
416 let read_result = self.ensure_data_read(1);
419 match read_result {
420 Err(err) => return Some(Err(err)),
421 Ok(data_remaining) => {
422 if !data_remaining {
423 return None;
424 }
425 }
426 }
427 }
428
429 if self.internal_buffer_position > self.buffered_byte_length {
430 panic!("read position exceeded buffer length");
431 }
432
433 Some(self.read_tag())
434 }
435
436 fn read_next(&mut self) {
437 let ended_tag_index = self.tag_stack.iter().position(|tag| matches!(tag.size, Known(size) if self.current_offset() >= tag.data_start + size));
439 if let Some(index) = ended_tag_index {
440 self.emission_queue.extend(self.tag_stack.drain(index..).map(|t| Ok((t.tag, t.tag_start))).rev());
441 }
442
443 if let Some(next_read) = self.read_tag_checked() {
444 if let Ok(next_tag) = &next_read {
445 while matches!(self.tag_stack.last(), Some(open_tag) if open_tag.size == Unknown) {
446 let open_tag = self.tag_stack.last().unwrap();
447 let previous_tag_ended = open_tag.is_ended_by(next_tag.tag.get_id());
448
449 if previous_tag_ended {
450 let t = self.tag_stack.pop().unwrap();
451 self.emission_queue.push_back(Ok((t.tag, t.tag_start)));
452 } else {
453 break;
454 }
455 }
456
457 if let Some(Master::Start) = next_tag.tag.as_master() {
458 let tag_id = next_tag.tag.get_id();
459
460 self.tag_stack.push(ProcessingTag {
461 tag: TSpec::get_master_tag(tag_id, Master::End).unwrap(),
462 size: next_tag.size,
463 tag_start: next_tag.tag_start,
464 data_start: next_tag.data_start,
465 });
466
467 if self.tag_ids_to_buffer.contains(&tag_id) {
468 self.buffer_master(tag_id);
469 return;
470 }
471 }
472 }
473
474 self.emission_queue.push_back(next_read.map(|r| (r.tag, r.tag_start)));
475 } else if self.emit_master_end_when_eof {
476 while let Some(tag) = self.tag_stack.pop() {
477 self.emission_queue.push_back(Ok((tag.tag, tag.tag_start)));
478 }
479 }
480 }
481
482 fn buffer_master(&mut self, tag_id: u64) {
483 let tag_start = self.current_offset();
484 let pre_queue_len = self.emission_queue.len();
485
486 let mut position = pre_queue_len;
487 'endTagSearch: loop {
488 if position >= self.emission_queue.len() {
489 self.read_next();
490
491 if position >= self.emission_queue.len() {
492 self.emission_queue.push_back(Err(TagIteratorError::UnexpectedEOF{ tag_start, tag_id: Some(tag_id), tag_size: None, partial_data: None }));
493 return;
494 }
495 }
496
497 while position < self.emission_queue.len() {
498 if let Some(r) = self.emission_queue.get(position) {
499 match r {
500 Err(_) => break 'endTagSearch,
501 Ok(t) => {
502 if t.0.get_id() == tag_id && matches!(t.0.as_master(), Some(Master::End)) {
503 break 'endTagSearch;
504 }
505 }
506 }
507 }
508 position += 1;
509 }
510 }
511
512 let mut children = self.emission_queue.split_off(pre_queue_len);
513 let split_to = position - pre_queue_len;
514 if children.get(split_to).unwrap().is_ok() {
515 let remaining = children.split_off(split_to).into_iter().skip(1);
516 let full_tag = Self::roll_up_children(tag_id, children.into_iter().map(|c| c.unwrap().0).collect());
517 self.emission_queue.push_back(Ok((full_tag, tag_start)));
518 self.emission_queue.extend(remaining);
519 } else {
520 self.emission_queue.extend(children.drain(split_to..).take(1));
521 }
522 }
523
524 fn roll_up_children(tag_id: u64, children: Vec<TSpec>) -> TSpec {
525 let mut rolled_children = Vec::new();
526
527 let mut iter = children.into_iter();
528 while let Some(child) = iter.next() {
529 if let Some(Master::Start) = child.as_master() {
530 let child_id = child.get_id();
531 let subchildren = iter.by_ref().take_while(|c| !matches!(c.as_master(), Some(Master::End)) || c.get_id() != child_id).collect();
532 rolled_children.push(Self::roll_up_children(child_id, subchildren));
533 } else {
534 rolled_children.push(child);
535 }
536 }
537
538 TSpec::get_master_tag(tag_id, Master::Full(rolled_children)).unwrap_or_else(|| panic!("Bad specification implementation: Tag id 0x{:x?} type was master, but could not get tag!", tag_id))
539 }
540
541 #[inline(always)]
542 fn validate_tag_path(&self, tag_id: u64) -> bool {
543 validate_tag_path::<TSpec>(tag_id, self.tag_stack.iter().map(|p| (p.tag.get_id(), p.size, 0)))
544 }
545
546 #[inline(always)]
547 fn is_invalid_tag_size(&self, size: usize) -> bool {
548 self.tag_stack.iter().filter(|p| p.size.is_known()).any(|t| {
549 (t.data_start + t.size.value()) < (self.current_offset() + size)
550 })
551 }
552}
553
554impl<R: Read, TSpec> Iterator for TagIterator<R, TSpec>
555 where TSpec: EbmlSpecification<TSpec> + EbmlTag<TSpec> + Clone
556{
557 type Item = Result<TSpec, TagIteratorError>;
558
559 fn next(&mut self) -> Option<Self::Item> {
560 if self.emission_queue.is_empty() {
561 self.read_next();
562 }
563 let next_item = self.emission_queue.pop_front();
564 if let Some(Ok(ref tuple)) = next_item {
565 self.last_emitted_tag_offset = tuple.1;
566 }
567 next_item.map(|r| r.map(|t| t.0))
568 }
569}