use arrow::util::test_util::parquet_test_data;
use parquet::arrow::arrow_reader::ArrowReaderBuilder;
use parquet::errors::ParquetError;
use std::collections::HashSet;
use std::path::PathBuf;
/// Names of every entry expected in the `bad_data` directory.
///
/// `test_invalid_files` compares this list against the actual directory
/// listing in both directions, so a file added to (or removed from) the
/// directory must be reflected here.
static KNOWN_FILES: &[&str] = &[
    "PARQUET-1481.parquet",
    "ARROW-GH-41317.parquet",
    "ARROW-GH-41321.parquet",
    "ARROW-GH-43605.parquet",
    "ARROW-RS-GH-6229-DICTHEADER.parquet",
    "ARROW-RS-GH-6229-LEVELS.parquet",
    "ARROW-GH-45185.parquet",
    "README.md",
];
/// Returns the path of the `bad_data` directory, located as a sibling of the
/// directory reported by `parquet_test_data()`.
///
/// # Panics
///
/// Panics if the path returned by `parquet_test_data()` has no parent.
fn bad_data_dir() -> PathBuf {
    let data_dir = PathBuf::from(parquet_test_data());
    // Step up one level from the data directory, then descend into `bad_data`.
    let testing_root = data_dir.parent().expect("was in parquet-testing/data");
    testing_root.join("bad_data")
}
/// Verifies that the `bad_data` directory and `KNOWN_FILES` agree:
/// every directory entry must be listed in `KNOWN_FILES`, and every name in
/// `KNOWN_FILES` must exist in the directory.
#[test]
fn test_invalid_files() {
    let known_files: HashSet<&str> = KNOWN_FILES.iter().copied().collect();
    let mut seen_files: HashSet<String> = HashSet::new();

    // Pass 1: every entry on disk must be a known file.
    for entry in std::fs::read_dir(bad_data_dir()).unwrap() {
        let entry_path = entry.unwrap().path();
        let file_name = entry_path
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_string();
        assert!(
            known_files.contains(file_name.as_str()),
            "Found new file in bad_data, please add test: {file_name}"
        );
        seen_files.insert(file_name);
    }

    // Pass 2: every known file must have been seen on disk.
    for expected_file in known_files {
        assert!(
            seen_files.contains(expected_file),
            "Expected file not found in bad_data directory: {expected_file}"
        );
    }
}
/// Reading `PARQUET-1481.parquet` must fail with an
/// "unexpected parquet type" error.
#[test]
fn test_parquet_1481() {
    let message = read_file("PARQUET-1481.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(message, "Parquet error: unexpected parquet type: -7");
}
/// Reading `ARROW-GH-41321.parquet` is currently expected to panic (pinned by
/// `should_panic`) instead of returning an error.
///
/// The assertion on the error text is a placeholder for the message to check
/// once the read returns an error rather than panicking.
#[test]
#[should_panic(expected = "assertion failed: self.current_value.is_some()")]
fn test_arrow_gh_41321() {
    let message = read_file("ARROW-GH-41321.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(message, "TBD (currently panics)");
}
/// Reading `ARROW-GH-41317.parquet` must fail with a "bad data" error.
#[test]
fn test_arrow_gh_41317() {
    let message = read_file("ARROW-GH-41317.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: External: bad data"
    );
}
/// Reading `ARROW-RS-GH-6229-DICTHEADER.parquet` must fail with an integer
/// overflow error.
#[test]
fn test_arrow_rs_gh_6229_dict_header() {
    let message = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: Integer overflow: out of range integral type conversion attempted"
    );
}
/// Reading `ARROW-RS-GH-6229-LEVELS.parquet` must fail with an
/// "Insufficient repetition levels" error.
///
/// Only compiled when the `snap` feature is enabled.
#[test]
#[cfg(feature = "snap")]
fn test_arrow_rs_gh_6229_dict_levels() {
    let message = read_file("ARROW-RS-GH-6229-LEVELS.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: Insufficient repetition levels read from column"
    );
}
/// Reading `ARROW-GH-45185.parquet` must fail because the first repetition
/// level of a batch is not 0.
///
/// Only compiled when the `snap` feature is enabled.
#[test]
#[cfg(feature = "snap")]
fn test_arrow_rs_gh_45185_dict_levels() {
    let message = read_file("ARROW-GH-45185.parquet")
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "External: Parquet argument error: Parquet error: first repetition level of batch must be 0"
    );
}
/// Reads the file `name` from the `bad_data` directory through the Arrow
/// reader and returns the total number of rows across all batches.
///
/// # Errors
///
/// Returns the first error raised while building the reader or while
/// iterating over its batches.
///
/// # Panics
///
/// Panics if the file cannot be opened.
fn read_file(name: &str) -> Result<usize, ParquetError> {
    let path = bad_data_dir().join(name);
    println!("Reading file: {path:?}");

    let file = std::fs::File::open(&path).unwrap();
    let builder = ArrowReaderBuilder::try_new(file)?;
    let reader = builder.build()?;

    // Drain every batch so that errors surfacing mid-file are reported too.
    let mut total_rows = 0;
    for batch in reader {
        total_rows += batch?.num_rows();
    }
    Ok(total_rows)
}
/// Loads the metadata stored in `bad_raw_metadata.bin` and checks that the
/// page-index load fails only once column indexes are requested.
///
/// Only compiled when the `async` feature is enabled.
#[cfg(feature = "async")]
#[tokio::test]
// NOTE(review): presumably needed because some `ParquetMetaDataReader` method
// used below is deprecated — confirm which one.
#[allow(deprecated)]
async fn bad_metadata_err() {
    use bytes::Bytes;
    use parquet::file::metadata::ParquetMetaDataReader;

    let raw_metadata = Bytes::from_static(include_bytes!("bad_raw_metadata.bin"));
    let raw_len = raw_metadata.len() as u64;
    let mut cursor = std::io::Cursor::new(&raw_metadata);

    // Loading the metadata itself is expected to succeed.
    let mut metadata_reader = ParquetMetaDataReader::new();
    metadata_reader
        .try_load(&mut cursor, raw_len)
        .await
        .unwrap();

    // With page indexes disabled, loading the page index succeeds.
    metadata_reader = metadata_reader.with_page_indexes(false);
    metadata_reader.load_page_index(&mut cursor).await.unwrap();

    // Enabling offset indexes still succeeds.
    metadata_reader = metadata_reader.with_offset_indexes(true);
    metadata_reader.load_page_index(&mut cursor).await.unwrap();

    // Enabling column indexes is what surfaces the error.
    metadata_reader = metadata_reader.with_column_indexes(true);
    let err = metadata_reader
        .load_page_index(&mut cursor)
        .await
        .unwrap_err();

    assert_eq!(
        err.to_string(),
        "Parquet error: error converting value, expected 4 bytes got 0"
    );
}