#![allow(clippy::assertions_on_constants)]
extern crate encoding_rs;
use crate::as_mut_str_unchecked_no_borrow_check;
use crate::as_str_unchecked_no_borrow_check;
use crate::finding::Finding;
use crate::finding::Precision;
use crate::finding::OUTPUT_BUF_LEN;
use crate::helper::starts_with_multibyte_char;
use crate::helper::SplitStr;
use crate::input::ByteCounter;
use crate::input::INPUT_BUF_LEN;
use crate::scanner::ScannerState;
use encoding_rs::DecoderResult;
use std::io::Write;
use std::marker::PhantomPinned;
use std::ops::Deref;
use std::pin::Pin;
use std::slice;
use std::str;
/// `FindingCollection` is a set of ordered `Finding` s.
/// The box `output_buffer_bytes` and the struct `Finding` are self-referential,
/// because `Finding.s` points into `output_buffer_bytes`. Therefore, special
/// care is taken that, `output_buffer_bytes` is protected from being moved in
/// memory:
/// 1. `output_buffer_bytes` is private.
/// 2. The returned `FindingCollection` is wrapped in a
/// `Pin<Box<FindingCollection>>>`.
#[derive(Debug)]
pub struct FindingCollection<'a> {
/// `Finding` s in this vector are in chronological order.
pub v: Vec<Finding<'a>>,
/// All concurrent `ScannerState::scan()` start at the same byte. All
/// `Finding.position` refer to `first_byte_position` as zero.
pub first_byte_position: ByteCounter,
/// A buffer containing the UTF-8 representation of all findings during one
/// `Self::from()` run. First, the `Decoder` fills in some UTF-8
/// string. This string is then filtered. The result of this filtering is
/// a collection of `Finding`-objects stored in a `FindingCollection`. The
/// `Finding`-objects have a `&str`-member called `Finding.s` that is
/// a substring (slice) of `output_buffer_bytes`.
output_buffer_bytes: Box<[u8]>,
/// If `output_buffer` is too small to receive all findings, this is set
/// `true` indicating that only the last `Finding` s could be stored. At
/// least one `Finding` got lost. This incident is reported to the user. If
/// ever this happens, the `OUTPUT_BUF_LEN` was not chosen big enough.
pub str_buf_overflow: bool,
_marker: PhantomPinned,
}
impl FindingCollection<'_> {
pub fn new(byte_offset: ByteCounter) -> Self {
// This buffer lives on the heap. let mut output_buffer_bytes =
// Box::new([0u8; OUTPUT_BUF_LEN]);
let output_buffer_bytes = Box::new([0u8; OUTPUT_BUF_LEN]);
FindingCollection {
v: Vec::new(),
first_byte_position: byte_offset,
output_buffer_bytes,
str_buf_overflow: false,
_marker: PhantomPinned,
}
}
/// First, scans for valid encoded strings in `input_buffer, then decodes them `
/// using `ss.decoder` to UTF-8 and writes the results as UTF-8 in
/// `fc.output_buffer_bytes`. Finally some filter is applied to the found strings
/// retaining only those who satisfy the filter criteria.\
///
/// * The input of this function is `input_buffer`.
/// * The output of this function is the returned `FindingCollection`.
///
/// The input parameter `input_file_id` is forwarded and stored in each `Finding`
/// of the returned `FindingCollection`.\
/// The function keeps its inner state in
/// `ss.decoder`, `ss.last_scan_run_leftover`,
/// `ss.last_run_str_was_printed_and_is_maybe_cut_str` and `ss.consumed_bytes`.\
/// `ss.mission` is not directly used in this function, but some part of it, the
/// `ss.mission.filter`, is forwarded to the helper function:
/// `helper::SplitStr::next()`.\
/// In case this is the last `input_buffer` of the stream, `last` must be set
/// to correctly flush the `ss.decoder`.
pub fn from<'a>(
ss: &mut ScannerState,
input_file_id: Option<u8>,
input_buffer: &[u8],
is_last_input_buffer: bool,
) -> Pin<Box<FindingCollection<'a>>> {
let mut fc = FindingCollection::new(ss.consumed_bytes);
// We do not clear `output_buffer_bytes`, we just overwrite.
// Initialisation
let mut extra_round = false;
let mut decoder_input_start = 0usize;
let mut decoder_input_end;
let mut decoder_output_start = 0usize;
// Copy `ScannerState` in `last_window...`
// Copy last run leftover bytes at the beginning of `output_buffer`.
let mut last_window_leftover_len = 0usize;
if !ss.last_scan_run_leftover.is_empty() {
// We don't need to copy here, we just rewind temporarily
// `decoder_output_start` to `ss.last_scan_run_leftover`.
fc.output_buffer_bytes
// Make the same space.
[decoder_output_start..decoder_output_start + ss.last_scan_run_leftover.len()]
.copy_from_slice(ss.last_scan_run_leftover.as_bytes());
// Remember for later use.
last_window_leftover_len = ss.last_scan_run_leftover.len();
ss.last_scan_run_leftover.clear();
// Make the decoder write behind the insertion.
decoder_output_start += last_window_leftover_len;
}
let mut last_window_str_was_printed_and_is_maybe_cut_str =
ss.last_run_str_was_printed_and_is_maybe_cut_str;
// In many encodings (e.g. UTF16), to fill one `output_line` we need more bytes of input.
// If ever the string gets longer than `output_line_char_nb_max`, `SplitStr` will wrap the line.
let decoder_input_window = 2 * ss.mission.output_line_char_nb_max;
let mut is_last_window = false;
// iterate over `input_buffer with ``decoder_input_window`-sized slices.
'_input_window_loop: while decoder_input_start < input_buffer.len() {
decoder_input_end = match decoder_input_start + decoder_input_window {
n if n < input_buffer.len() => n, // There are at least one byte more left in `input_buffer`.
_ => {
is_last_window = true;
input_buffer.len()
}
};
// Decode one `input_window`, go as far as you can, then loop again.
'decoder: loop {
let output_buffer_slice: &mut str = as_mut_str_unchecked_no_borrow_check!(
&mut fc.output_buffer_bytes[decoder_output_start..]
);
let (decoder_result, decoder_read, decoder_written) =
ss.decoder.decode_to_str_without_replacement(
&input_buffer[decoder_input_start..decoder_input_end],
output_buffer_slice,
extra_round,
);
// If the assumption is wrong we change later.
let mut position_precision = Precision::Exact;
// Regardless of whether the intermediate buffer got full
// or the input buffer was exhausted, let's process what's
// in the intermediate buffer.
// The target encoding is always UTF-8.
if decoder_written > 0 {
// With the following `if`, we check if the previous scan has
// potentially left some remaining bytes in the Decoder's inner
// state. This is a complicated corner case, because the inner
// state of the `encoding_rs` decoder is private and there is
// yet not method to query if the decoder is in a neutral state.
// Read the related Issue [Enhancement: get read access to the
// decoder's inner state · Issue #48 ·
// hsivonen/encoding_rs](https://github.com/hsivonen/encoding_rs/issues/48)
//
// As a workaround, we first check if this is the first round
// (`decoder_input_start == 0`). Seeing, that we only know the
// `ByteCounter` precisely at that point and that all other
// round's findings will be tagged `Precision::After` anyway,
// there is no need to investigate further in these cases.
//
// We can reduce the cases of double decoding by checking if the
// first decoded character is a multi-byte UTF-8. If yes, this
// means (in most cases), that no bytes had been stored in the
// decoder's inner state and therefore we can assume that the
// first character was found exactly at `decoder_input_start`.
// If so, we can then tag this string-finding with
// `Precision::exact`.
if decoder_input_start == 0 && starts_with_multibyte_char(output_buffer_slice) {
// The only way to find out from which scan() run the first
// bytes came, is to scan again with a new Decoder and compare
// the results.
let mut empty_decoder =
ss.decoder.encoding().new_decoder_without_bom_handling();
// A short buffer on the stack will do.
let mut buffer_bytes = [0u8; 8];
// This is save, because there are only valid 0 in
// `buffer_bytes`.
let buffer: &mut str =
as_mut_str_unchecked_no_borrow_check!(buffer_bytes[..]);
// Alternative code, but slower. let tmp_buffer: &mut str =
// std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
let (_, _, written) = empty_decoder.decode_to_str_without_replacement(
input_buffer,
&mut *buffer,
true,
);
// When the result of the two decoders is not the same, as the
// bytes originating from the previous run, we know the extra
// bytes come from the previous run. Unfortunately there is no
// way to determine how many the decoder had internally stored.
// I can be one, two, or three. We only know that the multibyte
// sequence started some byte before 0.
if (written == 0)
|| (fc.output_buffer_bytes[0..written] != buffer_bytes[0..written])
{
position_precision = Precision::Before;
}
}
}
// Prepare input for `SplitStr`
let mut split_str_start = decoder_output_start;
let split_str_end = decoder_output_start + decoder_written;
// Enlarge window to the left, to cover not treated bytes again.
if last_window_leftover_len > 0 {
// Go some bytes to the left.
split_str_start -= last_window_leftover_len;
// We use it only once.
last_window_leftover_len = 0;
// We lose precision.
position_precision = Precision::Before;
};
// This is safe because the decoder guarantees us to return only valid UTF-8.
// We need unsafe code here because the buffer is still borrowed mutably by decoder.
let split_str_buffer = as_str_unchecked_no_borrow_check!(
fc.output_buffer_bytes[split_str_start..split_str_end]
);
// Another way of saying (decoder_result == DecoderResult::Malformed) ||
// (is_last_window ...):
// This can only be `false`, when `split_str_buffer` touches the right boundary (end)
// of an `input_window`. Normally it `true` because we usually stop at
// `DecoderResult::Malformed`.
let invalid_bytes_after_split_str_buffer = (decoder_result
!= DecoderResult::InputEmpty
&& decoder_result != DecoderResult::OutputFull)
|| (is_last_window && is_last_input_buffer);
// Use it only once.
let continue_str_if_possible = last_window_str_was_printed_and_is_maybe_cut_str;
last_window_str_was_printed_and_is_maybe_cut_str = false;
// Now we split `split_str_buffer` into substrings and store them in
// vector `fc.v`.
'_chunk_loop: for chunk in SplitStr::new(
split_str_buffer,
ss.mission.chars_min_nb,
ss.mission.require_same_unicode_block,
continue_str_if_possible,
invalid_bytes_after_split_str_buffer,
ss.mission.filter,
ss.mission.output_line_char_nb_max,
) {
if !chunk.s_is_to_be_filtered_again {
// We keep it for printing.
fc.v.push(Finding {
input_file_id,
mission: ss.mission,
position: ss.consumed_bytes + decoder_input_start as ByteCounter,
position_precision,
s: chunk.s,
s_completes_previous_s: chunk.s_completes_previous_s,
});
last_window_leftover_len = 0;
last_window_str_was_printed_and_is_maybe_cut_str = chunk.s_is_maybe_cut;
} else {
// `chunk.s_is_to_be_filtered_again`
// This chunk will be inserted at the beginning
// of the `output_buffer_bytes` and we do not print it
// now. As we will see it (completed to its full
// length) again, we can decide later what to do with
// it.
// As we exactly know where `chunk.s` is located in
// `ss.output_buffer_bytes`, it is enough to remember
// its length.
last_window_leftover_len = chunk.s.len();
// As the chunk is not printed now, so we set this
// to `false`:
last_window_str_was_printed_and_is_maybe_cut_str = false;
}
// For all other following `SplitStr` we set this,
// since we do not know their exact position.
position_precision = Precision::After;
}
decoder_output_start += decoder_written;
decoder_input_start += decoder_read;
// Now let's see if we should read again or process the
// rest of the current input buffer.
match decoder_result {
DecoderResult::InputEmpty => {
if is_last_window && is_last_input_buffer && !extra_round {
extra_round = true;
} else {
break 'decoder;
}
}
DecoderResult::OutputFull => {
// This should never happen. If ever it does we clear
// the FindingCollection to make more space and
// forget all previous findings.
fc.clear_and_mark_incomplete();
eprintln!("Buffer overflow. Output buffer is too small to receive all decoder data.\
Some findings got lost in input {:x}..{:x} from file {:?} for scanner ({})!",
ss.consumed_bytes,
ss.consumed_bytes + decoder_input_start as ByteCounter,
input_file_id,
char::from((ss.mission.mission_id + 97) as u8),
);
decoder_output_start = 0;
debug_assert!(
true,
"Buffer overflow. Output buffer is too small to receive all decoder data."
);
}
DecoderResult::Malformed(_, _) => {}
};
}
}
// Store possible leftovers in `ScannerState` for next `scanner::scan()`.
let last_window_leftover = as_str_unchecked_no_borrow_check!(
fc.output_buffer_bytes
[decoder_output_start - last_window_leftover_len..decoder_output_start]
);
// Update inner state for next `scan()` run.
ss.last_scan_run_leftover = String::from(last_window_leftover);
ss.last_run_str_was_printed_and_is_maybe_cut_str =
last_window_str_was_printed_and_is_maybe_cut_str;
ss.consumed_bytes += decoder_input_start as ByteCounter;
// Now we pin the `FindingCollection`.
Box::pin(fc)
}
/// Clears the buffer to make more space after buffer overflow. Tag the
/// collection as overflowed.
pub fn clear_and_mark_incomplete(&mut self) {
self.v.clear();
self.str_buf_overflow = true;
}
/// This method formats and dumps a `FindingCollection` to the output
/// channel, usually `stdout`.
#[allow(dead_code)]
pub fn print(&self, out: &mut dyn Write) -> Result<(), Box<std::io::Error>> {
if self.str_buf_overflow {
eprint!("Warning: output buffer overflow! Some findings might got lost.");
eprintln!(
"in input chunk 0x{:x}-0x{:x}.",
self.first_byte_position,
self.first_byte_position + INPUT_BUF_LEN as ByteCounter
);
}
for finding in &self.v {
finding.print(out)?;
}
Ok(())
}
}
/// This allows us to create an iterator from a `FindingCollection`.
impl<'a> IntoIterator for &'a Pin<Box<FindingCollection<'a>>> {
type Item = &'a Finding<'a>;
type IntoIter = FindingCollectionIterator<'a>;
fn into_iter(self) -> Self::IntoIter {
FindingCollectionIterator { fc: self, index: 0 }
}
}
/// This allows iterating over `Finding`-objects in a `FindingCollection::v`.
/// The state of this iterator must hold the whole `FindingCollection` and not
/// only `FindingCollection::v`! This is required because `next()` produces a
/// link to `Finding`, whose member `Finding::s` is a `&str`. The content of this
/// `&str` is part of `FindingCollection::output_buffer_bytes`, thus the need for
/// the whole object `FindingCollection`.
pub struct FindingCollectionIterator<'a> {
fc: &'a FindingCollection<'a>,
index: usize,
}
/// This allows us to iterate over `FindingCollection`. It is needed
/// by `kmerge()`.
impl<'a> Iterator for FindingCollectionIterator<'a> {
type Item = &'a Finding<'a>;
fn next(&mut self) -> Option<&'a Finding<'a>> {
let result = if self.index < self.fc.v.len() {
Some(&self.fc.v[self.index])
} else {
None
};
self.index += 1;
result
}
}
/// We consider the "content" of a `FindingCollection`
/// to be `FindingCollection::v` which is a `Vec<Finding>`.
impl<'a> Deref for FindingCollection<'a> {
type Target = Vec<Finding<'a>>;
fn deref(&self) -> &Self::Target {
&self.v
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::finding::Precision;
use crate::finding_collection::FindingCollection;
use crate::mission::Mission;
use crate::scanner::tests::MISSION_ALL_X_USER_DEFINED;
use crate::scanner::tests::MISSION_ASCII;
use std::str;
// To see println!() output in test run, launch
// cargo test -- --nocapture
#[test]
fn test_ascii_emulation() {
let m: &'static Mission = &MISSION_ALL_X_USER_DEFINED;
let mut ss = ScannerState::new(m);
let input = b"abcdefg\x58\x59\x80\x82h\x83ijk\x89\x90";
let fc = FindingCollection::from(&mut ss, Some(0), input, true);
//println!("{:#?}", fc.v);
assert_eq!(fc.first_byte_position, 10_000);
assert_eq!(fc.str_buf_overflow, false);
assert_eq!(fc.v.len(), 2);
assert_eq!(fc.v[0].position, 10_000);
assert_eq!(fc.v[0].position_precision, Precision::Exact);
assert_eq!(fc.v[0].s, "abcdefgXY\u{f780}");
// Next output line.
assert_eq!(fc.v[1].position, 10_000);
assert_eq!(fc.v[1].position_precision, Precision::After);
assert_eq!(fc.v[1].s, "\u{f782}h\u{f783}ijk\u{f789}\u{f790}");
assert_eq!(
// We only compare the first 35 bytes, the others are 0 anyway.
unsafe { str::from_utf8_unchecked(&fc.output_buffer_bytes[..35]) },
"abcdefg\u{58}\u{59}\u{f780}\u{f782}h\u{f783}ijk\u{f789}\u{f790}\
\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}"
);
assert_eq!(ss.consumed_bytes, 10000 + 18);
// false, because we told the `FindingCollection::scan()` this is the last run.
assert_eq!(ss.last_run_str_was_printed_and_is_maybe_cut_str, false);
assert_eq!(ss.last_scan_run_leftover, "");
// Second run.
let m: &'static Mission = &MISSION_ASCII;
let mut ss = ScannerState::new(m);
let input = b"abcdefg\x58\x59\x80\x82h\x83ijk\x89\x90";
let fc = FindingCollection::from(&mut ss, Some(0), input, false);
//println!("{:#?}", fc.v);
assert_eq!(fc.v.len(), 2);
assert_eq!(fc.first_byte_position, 10000);
assert_eq!(fc.str_buf_overflow, false);
assert_eq!(fc.v[0].position, 10_000);
assert_eq!(fc.v[0].position_precision, Precision::Exact);
assert_eq!(fc.v[0].s, "abcdefgXY");
// Next output line.
assert_eq!(fc.v[1].position, 10_000);
assert_eq!(fc.v[1].position_precision, Precision::After);
// Note that `h` is gone.
assert_eq!(fc.v[1].s, "ijk");
assert_eq!(
// We only compare the first 35 bytes, the others are 0 anyway.
unsafe { str::from_utf8_unchecked(&fc.output_buffer_bytes[..35]) },
"abcdefg\u{58}\u{59}\u{f780}\u{f782}h\u{f783}ijk\u{f789}\u{f790}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}"
);
assert_eq!(ss.consumed_bytes, 10000 + 18);
assert_eq!(ss.last_run_str_was_printed_and_is_maybe_cut_str, false);
assert_eq!(ss.last_scan_run_leftover, "");
}
}