use arrow::util::test_util::parquet_test_data;
use bytes::Bytes;
use parquet::arrow::arrow_reader::ArrowReaderBuilder;
use parquet::errors::ParquetError;
use std::collections::HashSet;
use std::path::PathBuf;
/// Names of every entry expected in the `bad_data` directory.
///
/// `test_invalid_files` asserts that the directory listing matches this list
/// exactly, in both directions, so non-parquet entries such as `README.md`
/// must be listed here as well.
static KNOWN_FILES: &[&str] = &[
    "PARQUET-1481.parquet",
    "ARROW-GH-41317.parquet",
    "ARROW-GH-41321.parquet",
    "ARROW-GH-43605.parquet",
    "ARROW-RS-GH-6229-DICTHEADER.parquet",
    "ARROW-RS-GH-6229-LEVELS.parquet",
    "ARROW-GH-45185.parquet",
    "README.md",
];
/// Returns the path of the `bad_data` directory, which is a sibling of the
/// directory reported by `parquet_test_data()`.
///
/// # Panics
///
/// Panics if the parquet test data path has no parent directory.
fn bad_data_dir() -> PathBuf {
    let data_dir = PathBuf::from(parquet_test_data());
    // Step up one level from the data directory, then descend into `bad_data`.
    let testing_root = data_dir.parent().expect("was in parquet-testing/data");
    testing_root.join("bad_data")
}
/// Checks that the contents of the `bad_data` directory and `KNOWN_FILES`
/// agree in both directions: every entry on disk must be listed, and every
/// listed name must exist on disk.
#[test]
fn test_invalid_files() {
    let expected_names: HashSet<&str> = KNOWN_FILES.iter().copied().collect();
    let mut found_names = HashSet::new();

    // Direction 1: anything on disk that is not listed is a file without a test.
    for entry in std::fs::read_dir(bad_data_dir()).unwrap() {
        let path = entry.unwrap().path();
        let file_name = path.file_name().unwrap().to_str().unwrap().to_string();
        assert!(
            expected_names.contains(file_name.as_str()),
            "Found new file in bad_data, please add test: {file_name}"
        );
        found_names.insert(file_name);
    }

    // Direction 2: every listed name must have been seen in the directory.
    for expected_file in expected_names {
        assert!(
            found_names.contains(expected_file),
            "Expected file not found in bad_data directory: {expected_file}"
        );
    }
}
/// Reading `PARQUET-1481.parquet` must fail with an "Unexpected Type" error.
#[test]
fn test_parquet_1481() {
    let message = read_file("PARQUET-1481.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(message, "Parquet error: Unexpected Type -7");
}
/// Reading `ARROW-GH-41321.parquet` must fail with an RLE bit-width error.
#[test]
fn test_arrow_gh_41321() {
    let message = read_file("ARROW-GH-41321.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: Invalid or corrupted RLE bit width 254. Max allowed is 32"
    );
}
/// Reading `ARROW-GH-41317.parquet` must fail with a `StructArrayReader`
/// out-of-sync error.
#[test]
fn test_arrow_gh_41317() {
    let message = read_file("ARROW-GH-41317.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: StructArrayReader out of sync in read_records, expected 5 read, got 2"
    );
}
/// Reading `ARROW-RS-GH-6229-DICTHEADER.parquet` must fail with an integer
/// overflow error rather than succeeding.
#[test]
fn test_arrow_rs_gh_6229_dict_header() {
    let message = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: Integer overflow: out of range integral type conversion attempted"
    );
}
/// Reading `ARROW-RS-GH-6229-LEVELS.parquet` must fail because too few
/// repetition levels are read from the column.
///
/// Only compiled with the `snap` feature (presumably because the fixture is
/// Snappy-compressed; confirm against the fixture's README).
#[test]
#[cfg(feature = "snap")]
fn test_arrow_rs_gh_6229_dict_levels() {
    let message = read_file("ARROW-RS-GH-6229-LEVELS.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: Insufficient repetition levels read from column"
    );
}
/// Reading `ARROW-GH-45185.parquet` must fail because the first repetition
/// level of a batch is not 0.
///
/// Note that the fixture is named `ARROW-GH-45185`, despite the `arrow_rs`
/// prefix in this test's name. Only compiled with the `snap` feature
/// (presumably because the fixture is Snappy-compressed; confirm).
#[test]
#[cfg(feature = "snap")]
fn test_arrow_rs_gh_45185_dict_levels() {
    let message = read_file("ARROW-GH-45185.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: first repetition level of batch must be 0"
    );
}
/// Reads every record batch of `name` (relative to the `bad_data` directory)
/// and returns the total number of rows read.
///
/// # Errors
///
/// Returns the first error hit while building the reader or decoding a
/// batch; reading stops at that point.
///
/// # Panics
///
/// Panics if the file cannot be opened.
fn read_file(name: &str) -> Result<usize, ParquetError> {
    let path = bad_data_dir().join(name);
    println!("Reading file: {path:?}");

    let input = std::fs::File::open(&path).unwrap();
    let mut batches = ArrowReaderBuilder::try_new(input)?.build()?;

    // Sum the row counts, bailing out on the first batch that fails to decode.
    batches.try_fold(0usize, |total, batch| -> Result<usize, ParquetError> {
        Ok(total + batch?.num_rows())
    })
}
/// Skipping rows in `bigdelta.parquet` must surface a "cannot skip miniblock"
/// error on the first item produced by the reader.
///
/// The selection skips 1000 rows and then selects 5. The test requires the
/// reader to yield at least one item, so it cannot pass vacuously when the
/// reader produces nothing at all.
#[test]
fn non_standard_delta_blocks() {
    use parquet::arrow::arrow_reader::{RowSelection, RowSelector};

    let file = Bytes::from_static(include_bytes!("bigdelta.parquet"));
    let selection: RowSelection =
        vec![RowSelector::skip(1000), RowSelector::select(5)].into();

    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(selection)
        .build()
        .unwrap();

    // An empty reader would mean the error path under test never ran.
    let first_batch = reader
        .into_iter()
        .next()
        .expect("reader yielded no batches, so the skip error was never exercised");

    let message = first_batch.unwrap_err().to_string();
    assert!(
        message.contains("cannot skip miniblock of size 128"),
        "unexpected error message: {message}"
    );
}
/// Loads `bad_raw_metadata.bin` through `ParquetMetaDataReader` and checks
/// which page-index configurations load cleanly and which one fails.
///
/// The sequence asserted here is:
/// 1. `try_load` on the raw bytes succeeds;
/// 2. `load_page_index` succeeds with page indexes disabled;
/// 3. `load_page_index` succeeds once offset indexes are enabled;
/// 4. `load_page_index` fails once column indexes are enabled as well.
#[cfg(feature = "async")]
#[tokio::test]
#[allow(deprecated)]
async fn bad_metadata_err() {
    use parquet::file::metadata::ParquetMetaDataReader;

    let raw_metadata = Bytes::from_static(include_bytes!("bad_raw_metadata.bin"));
    let raw_len = raw_metadata.len() as u64;
    let mut cursor = std::io::Cursor::new(&raw_metadata);

    let mut metadata_reader = ParquetMetaDataReader::new();
    metadata_reader.try_load(&mut cursor, raw_len).await.unwrap();

    // Page indexes disabled: loading must succeed.
    metadata_reader = metadata_reader.with_page_indexes(false);
    metadata_reader.load_page_index(&mut cursor).await.unwrap();

    // Offset indexes enabled: loading must still succeed.
    metadata_reader = metadata_reader.with_offset_indexes(true);
    metadata_reader.load_page_index(&mut cursor).await.unwrap();

    // Column indexes enabled too: this is the load that must fail.
    metadata_reader = metadata_reader.with_column_indexes(true);
    let failure = metadata_reader
        .load_page_index(&mut cursor)
        .await
        .unwrap_err();

    assert_eq!(
        failure.to_string(),
        "Parquet error: error converting value, expected 4 bytes got 0"
    );
}