use crate::web::format_detection::FileFormat;
use std::collections::HashSet;
/// Contig-count ceiling: `check_contig_limit` reports an error once a count reaches this value.
pub const MAX_CONTIGS: usize = 100_000;
/// Longest filename `validate_filename` accepts, compared against `str::len` (i.e. bytes).
pub const MAX_FILENAME_LENGTH: usize = 255;
/// Smallest content length, in bytes, that `validate_file_content` accepts (rejects empty input).
pub const MIN_FILE_CONTENT_SIZE: usize = 1;
/// Reports whether `s` has the shape of an MD5 hex digest.
///
/// A string qualifies when it is exactly 32 bytes long and consists solely of
/// ASCII hexadecimal digits; upper- and lowercase digits are both accepted.
#[must_use]
pub fn is_valid_md5(s: &str) -> bool {
    const MD5_HEX_LEN: usize = 32;

    if s.len() != MD5_HEX_LEN {
        return false;
    }
    // Any non-ASCII character contributes bytes >= 0x80, none of which are hex digits.
    s.bytes().all(|byte| byte.is_ascii_hexdigit())
}
/// Returns `s` lowercased when it passes [`is_valid_md5`].
///
/// Yields `None` for anything that is not a 32-character hexadecimal string.
#[must_use]
pub fn normalize_md5(s: &str) -> Option<String> {
    is_valid_md5(s).then(|| s.to_lowercase())
}
/// Returns the digest that `refget_digest::sha512t24u` computes over `sequence`.
///
/// The tests in this file pin the output for `b"ACGT"` to a 32-character string
/// that passes [`is_valid_sha512t24u`].
#[must_use]
pub fn compute_sha512t24u(sequence: &[u8]) -> String {
refget_digest::sha512t24u(sequence)
}
/// Reports whether `s` has the shape of a sha512t24u digest.
///
/// The string must be exactly 32 bytes long and use only ASCII letters, ASCII
/// digits, `-` and `_`.
#[must_use]
pub fn is_valid_sha512t24u(s: &str) -> bool {
    const DIGEST_LEN: usize = 32;

    let in_alphabet =
        |byte: u8| matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_');

    s.len() == DIGEST_LEN && s.bytes().all(in_alphabet)
}
/// Derives a single hex signature from a set of MD5 strings.
///
/// The entries are sorted, joined with `,`, and the MD5 of that joined text is
/// returned in lowercase hex. Sorting makes the result independent of the set's
/// iteration order. An empty set yields an empty string.
#[must_use]
// Keeps the concrete `HashSet<String>` parameter; silences clippy's request to
// make the function generic over the hasher.
#[allow(clippy::implicit_hasher)]
pub fn compute_signature(md5s: &HashSet<String>) -> String {
    if md5s.is_empty() {
        return String::new();
    }

    let mut ordered: Vec<&str> = Vec::with_capacity(md5s.len());
    ordered.extend(md5s.iter().map(String::as_str));
    ordered.sort_unstable();

    let joined = ordered.join(",");
    format!("{:x}", md5::compute(joined.as_bytes()))
}
/// Checks whether one more contig may be added to a collection holding `count`.
///
/// Returns `None` while `count` is below [`MAX_CONTIGS`], and otherwise a
/// human-readable error message.
#[must_use]
pub fn check_contig_limit(count: usize) -> Option<String> {
    if count < MAX_CONTIGS {
        return None;
    }

    let message =
        format!("Too many contigs: adding another would exceed maximum of {MAX_CONTIGS}");
    Some(message)
}
/// Reasons an upload can be rejected by the validators in this module.
#[derive(Debug, thiserror::Error)]
pub enum ValidationError {
/// The filename's `len()` exceeds `MAX_FILENAME_LENGTH`.
#[error("Filename too long: exceeds {MAX_FILENAME_LENGTH} characters")]
FilenameTooLong,
/// The filename contains `..`, a path separator or a control character, has no
/// allowed characters left after sanitization, or is a dot-file without a known extension.
#[error("Invalid filename: contains path traversal or invalid characters")]
InvalidFilename,
/// The filename is empty or consists only of whitespace.
#[error("Empty filename provided")]
EmptyFilename,
/// The content is shorter than `MIN_FILE_CONTENT_SIZE`, or was expected to be text
/// but looks binary or is not valid UTF-8.
#[error("File content appears malformed or invalid")]
InvalidFileContent,
/// The content does not carry the signature `validate_upload` checks for the format.
#[error("File format validation failed")]
FormatValidationFailed,
}
/// Validates a client-supplied filename and returns a sanitized copy of it.
///
/// The raw name is rejected outright when it is blank, longer than
/// [`MAX_FILENAME_LENGTH`] bytes, contains `..`, `/` or `\`, or contains a
/// control character in the range `\0`..=`\x1F`. Otherwise every character
/// other than ASCII letters, ASCII digits, `.`, `-`, `_` and space is dropped,
/// and the remaining text is checked again before it is returned.
///
/// # Errors
///
/// * [`ValidationError::EmptyFilename`] if `filename` is empty or whitespace only.
/// * [`ValidationError::FilenameTooLong`] if `filename.len()` exceeds the limit.
/// * [`ValidationError::InvalidFilename`] if the raw name contains path traversal,
///   a path separator or a control character; if nothing but whitespace is left
///   after sanitization; if sanitization produces a `..` sequence; or if the
///   sanitized name starts with `.` and has no known extension.
pub fn validate_filename(filename: &str) -> Result<String, ValidationError> {
    if filename.trim().is_empty() {
        return Err(ValidationError::EmptyFilename);
    }
    if filename.len() > MAX_FILENAME_LENGTH {
        return Err(ValidationError::FilenameTooLong);
    }

    let has_traversal =
        filename.contains("..") || filename.contains('/') || filename.contains('\\');
    // Covers NUL and the rest of the C0 control range.
    let has_control = filename.chars().any(|c| c <= '\x1F');
    if has_traversal || has_control {
        return Err(ValidationError::InvalidFilename);
    }

    let sanitized: String = filename
        .chars()
        .filter(|&c| c.is_ascii_alphanumeric() || matches!(c, '.' | '-' | '_' | ' '))
        .collect();

    if sanitized.trim().is_empty() {
        return Err(ValidationError::InvalidFilename);
    }
    // Dropping characters can bring two dots together (e.g. "a.@.txt" -> "a..txt"),
    // so the traversal check has to be repeated on the value that is actually returned.
    if sanitized.contains("..") {
        return Err(ValidationError::InvalidFilename);
    }
    // Dot-files are only allowed when they still end in a recognised extension.
    if sanitized.starts_with('.') && !has_known_extension(&sanitized) {
        return Err(ValidationError::InvalidFilename);
    }

    Ok(sanitized)
}
/// Reports whether `filename` ends, ignoring case, with one of the accepted extensions.
///
/// Only the suffix is inspected, so a bare extension such as `.sam` also matches.
fn has_known_extension(filename: &str) -> bool {
    // `.assembly_report.txt` is already matched by `.txt`; it is kept in the list
    // so the accepted set stays the same as before.
    const SAFE_EXTENSIONS: [&str; 10] = [
        ".sam",
        ".bam",
        ".cram",
        ".dict",
        ".vcf",
        ".txt",
        ".tsv",
        ".csv",
        ".gz",
        ".assembly_report.txt",
    ];

    // Lowercase once up front instead of allocating a new string per candidate.
    let lowered = filename.to_lowercase();
    SAFE_EXTENSIONS.iter().any(|ext| lowered.ends_with(ext))
}
/// Performs a lightweight signature check that `content` looks like `expected_format`.
///
/// Binary formats are matched on their leading magic bytes. Text formats are
/// decoded as UTF-8 and matched on marker substrings; content that is not valid
/// UTF-8 never matches a text format. Empty content never matches anything, and
/// [`FileFormat::Auto`] accepts any non-empty content.
///
/// This is a plausibility check only: it does not parse the file.
#[must_use]
pub fn validate_file_format(content: &[u8], expected_format: FileFormat) -> bool {
    if content.is_empty() {
        return false;
    }

    // Decoded on demand so binary formats never pay for UTF-8 validation.
    // Undecodable content becomes "" and therefore fails every text check below.
    let as_text = || std::str::from_utf8(content).unwrap_or("");

    match expected_format {
        // `starts_with` is already false for content shorter than the magic.
        // NOTE(review): only the raw "BAM\x01" magic is accepted; BGZF-compressed
        // bytes would begin with the gzip magic instead — confirm what callers pass.
        FileFormat::Bam => content.starts_with(b"BAM\x01"),
        FileFormat::Cram => content.starts_with(b"CRAM"),
        FileFormat::Vcf => as_text().starts_with("##fileformat=VCF"),
        FileFormat::Sam => {
            let text = as_text();
            ["@SQ", "@HD", "SN:", "LN:"]
                .iter()
                .any(|marker| text.contains(marker))
        }
        FileFormat::Dict => {
            let text = as_text();
            text.contains("@HD") && text.contains("@SQ")
        }
        FileFormat::NcbiReport => {
            let text = as_text();
            text.contains("Sequence-Name") || text.contains("Sequence-Role")
        }
        FileFormat::Tsv => {
            let text = as_text();
            if !text.contains('\t') {
                return false;
            }
            // Lowercase the content once rather than once per keyword.
            let lowered = text.to_lowercase();
            ["length", "sequence", "size"]
                .iter()
                .any(|keyword| lowered.contains(keyword))
        }
        FileFormat::Fai => {
            // Among the first five lines, look for one made of a name followed by
            // exactly four unsigned integer columns.
            as_text().lines().take(5).any(|line| {
                let fields: Vec<&str> = line.split('\t').collect();
                fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
            })
        }
        // Either a plain FASTA header line or the gzip magic bytes.
        FileFormat::Fasta => content.starts_with(b">") || content.starts_with(&[0x1f, 0x8b]),
        FileFormat::Auto => true,
    }
}
/// Runs basic sanity checks on uploaded bytes.
///
/// Content shorter than [`MIN_FILE_CONTENT_SIZE`] is always rejected. When
/// `expected_text` is set, the content must also be valid UTF-8, and content
/// longer than 100 bytes may have at most one twentieth of its bytes outside
/// the ranges 9..=13 and 32..=126.
///
/// # Errors
///
/// Returns [`ValidationError::InvalidFileContent`] when any check fails.
pub fn validate_file_content(content: &[u8], expected_text: bool) -> Result<(), ValidationError> {
    if content.len() < MIN_FILE_CONTENT_SIZE {
        return Err(ValidationError::InvalidFileContent);
    }
    if !expected_text {
        return Ok(());
    }

    // Tab through carriage return, plus the visible ASCII range.
    let is_printable = |byte: u8| matches!(byte, 9..=13 | 32..=126);
    let suspicious = content.iter().filter(|&&byte| !is_printable(byte)).count();

    // Short inputs are exempt from the ratio test but must still decode as UTF-8.
    let looks_binary = content.len() > 100 && suspicious > content.len() / 20;
    if looks_binary || std::str::from_utf8(content).is_err() {
        return Err(ValidationError::InvalidFileContent);
    }
    Ok(())
}
pub fn validate_upload(
filename: Option<&str>,
content: &[u8],
expected_format: FileFormat,
) -> Result<Option<String>, ValidationError> {
let validated_filename = if let Some(name) = filename {
Some(validate_filename(name)?)
} else {
None
};
let is_text_format = matches!(
expected_format,
FileFormat::Sam
| FileFormat::Dict
| FileFormat::Vcf
| FileFormat::NcbiReport
| FileFormat::Tsv
| FileFormat::Auto
);
validate_file_content(content, is_text_format)?;
if expected_format == FileFormat::Auto {
if content.len() >= 4 {
let starts_with_bam = content.starts_with(b"BAM");
let starts_with_cram = content.starts_with(b"CRAM");
if starts_with_bam && !validate_file_format(content, FileFormat::Bam) {
return Err(ValidationError::FormatValidationFailed);
}
if starts_with_cram && !validate_file_format(content, FileFormat::Cram) {
return Err(ValidationError::FormatValidationFailed);
}
}
} else if !validate_file_format(content, expected_format) {
return Err(ValidationError::FormatValidationFailed);
}
Ok(validated_filename)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// MD5 strings must be exactly 32 hex digits, in either case.
    #[test]
    fn test_is_valid_md5() {
        for accepted in [
            "6aef897c3d6ff0c78aff06ac189178dd",
            "AABBCCDD11223344556677889900AABB",
        ] {
            assert!(is_valid_md5(accepted));
        }
        for rejected in [
            "not-an-md5",
            // One character short.
            "6aef897c3d6ff0c78aff06ac189178d",
            // One character long.
            "6aef897c3d6ff0c78aff06ac189178ddd",
            "",
            // Ends in a non-hex character.
            "6aef897c3d6ff0c78aff06ac189178dg",
        ] {
            assert!(!is_valid_md5(rejected));
        }
    }

    /// The digest is deterministic, input-dependent and matches a pinned value.
    #[test]
    fn test_compute_sha512t24u() {
        let acgt = compute_sha512t24u(b"ACGT");
        assert_eq!(acgt.len(), 32);
        assert!(is_valid_sha512t24u(&acgt));
        assert_eq!(acgt, compute_sha512t24u(b"ACGT"));
        assert_ne!(acgt, compute_sha512t24u(b"TGCA"));
        assert_eq!(acgt, "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2");
    }

    /// Digest strings must be 32 characters from the accepted alphabet.
    #[test]
    fn test_is_valid_sha512t24u() {
        assert!(is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2"));
        for rejected in [
            "too-short",
            "",
            "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2X",
            "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw ",
        ] {
            assert!(!is_valid_sha512t24u(rejected));
        }
    }

    /// Valid digests are lowercased; anything else maps to `None`.
    #[test]
    fn test_normalize_md5() {
        let normalized = normalize_md5("6AEF897C3D6FF0C78AFF06AC189178DD");
        assert_eq!(
            normalized,
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );
        assert_eq!(normalize_md5("invalid"), None);
    }

    /// Signatures are 32 characters, repeatable, and empty for an empty set.
    #[test]
    fn test_compute_signature() {
        let mut md5s = HashSet::new();
        md5s.insert("aaaa".repeat(8));
        md5s.insert("bbbb".repeat(8));

        let first = compute_signature(&md5s);
        assert_eq!(first.len(), 32);

        let second = compute_signature(&md5s);
        assert_eq!(first, second);

        let empty: HashSet<String> = HashSet::new();
        assert_eq!(compute_signature(&empty), "");
    }

    /// The limit triggers at `MAX_CONTIGS` itself, not one past it.
    #[test]
    fn test_check_contig_limit() {
        assert!(check_contig_limit(100).is_none());
        assert!(check_contig_limit(MAX_CONTIGS - 1).is_none());
        assert!(check_contig_limit(MAX_CONTIGS).is_some());
        assert!(check_contig_limit(MAX_CONTIGS + 1).is_some());
    }

    /// Ordinary filenames pass validation.
    #[test]
    fn test_validate_filename_safe() {
        for name in ["test.sam", "my-file.bam", "data_file.txt", "sample 123.vcf"] {
            assert!(validate_filename(name).is_ok());
        }
    }

    /// Traversal, control characters, oversize, blank and hidden names are rejected.
    #[test]
    fn test_validate_filename_dangerous() {
        // Path traversal and separators.
        assert!(validate_filename("../etc/passwd").is_err());
        assert!(validate_filename("..\\windows\\system32").is_err());
        assert!(validate_filename("test/../../secret").is_err());

        // Control characters.
        assert!(validate_filename("test\0.txt").is_err());
        assert!(validate_filename("test\x01.txt").is_err());

        // Over the length limit.
        let long_name = "a".repeat(300);
        assert!(validate_filename(&long_name).is_err());

        // Blank names.
        assert!(validate_filename("").is_err());
        assert!(validate_filename(" ").is_err());

        // Dot-file without a known extension.
        assert!(validate_filename(".hidden").is_err());
    }

    /// Disallowed characters are stripped; allowed ones are left untouched.
    #[test]
    fn test_validate_filename_sanitization() {
        let stripped = validate_filename("test@#$%file.txt").unwrap();
        assert_eq!(stripped, "testfile.txt");

        let untouched = validate_filename("my-file_123.sam").unwrap();
        assert_eq!(untouched, "my-file_123.sam");
    }

    /// BAM content must begin with the `BAM\x01` magic.
    #[test]
    fn test_validate_file_format_bam() {
        let with_magic = b"BAM\x01test_content";
        assert!(validate_file_format(with_magic, FileFormat::Bam));

        let without_magic = b"NOTBAM\x01";
        assert!(!validate_file_format(without_magic, FileFormat::Bam));
    }

    /// CRAM content must begin with `CRAM`.
    #[test]
    fn test_validate_file_format_cram() {
        let with_magic = b"CRAMtest_content";
        assert!(validate_file_format(with_magic, FileFormat::Cram));

        let without_magic = b"NOTCRAM";
        assert!(!validate_file_format(without_magic, FileFormat::Cram));
    }

    /// VCF content must begin with the `##fileformat=VCF` header.
    #[test]
    fn test_validate_file_format_vcf() {
        let with_header = b"##fileformat=VCFv4.2\n##contig=<ID=chr1>";
        assert!(validate_file_format(with_header, FileFormat::Vcf));

        let sam_like = b"@SQ\tSN:chr1\tLN:123";
        assert!(!validate_file_format(sam_like, FileFormat::Vcf));
    }

    /// SAM content is recognised by either an `@SQ` or an `@HD` line.
    #[test]
    fn test_validate_file_format_sam() {
        let sq_line = b"@SQ\tSN:chr1\tLN:123456";
        assert!(validate_file_format(sq_line, FileFormat::Sam));

        let hd_line = b"@HD\tVN:1.0\tSO:coordinate";
        assert!(validate_file_format(hd_line, FileFormat::Sam));
    }

    /// Text mode accepts plain text and rejects binary or empty content.
    #[test]
    fn test_validate_file_content_text() {
        let plain_text = b"@SQ\tSN:chr1\tLN:123456\n@SQ\tSN:chr2\tLN:654321";
        assert!(validate_file_content(plain_text, true).is_ok());

        let zeroes = vec![0u8; 1000];
        assert!(validate_file_content(&zeroes, true).is_err());

        assert!(validate_file_content(b"", true).is_err());
    }

    /// Binary mode accepts arbitrary bytes but still rejects empty content.
    #[test]
    fn test_validate_file_content_binary() {
        let arbitrary = vec![0xABu8; 100];
        assert!(validate_file_content(&arbitrary, false).is_ok());

        assert!(validate_file_content(b"", false).is_err());
    }

    /// End-to-end checks: filename handling plus format mismatch detection.
    #[test]
    fn test_validate_upload_complete() {
        let sam_content = b"@SQ\tSN:chr1\tLN:123456";

        // A valid filename is echoed back.
        let named = validate_upload(Some("test.sam"), sam_content, FileFormat::Sam);
        assert!(named.is_ok());
        assert_eq!(named.unwrap().unwrap(), "test.sam");

        // A missing filename is allowed and yields `None`.
        let unnamed = validate_upload(None, sam_content, FileFormat::Sam);
        assert!(unnamed.is_ok());
        assert!(unnamed.unwrap().is_none());

        // A traversal attempt in the filename fails the whole upload.
        let traversal = validate_upload(Some("../etc/passwd"), sam_content, FileFormat::Sam);
        assert!(traversal.is_err());

        // Content that does not match the declared format is rejected.
        let bam_content = b"BAM\x01test";
        let mismatched = validate_upload(Some("test.sam"), bam_content, FileFormat::Sam);
        assert!(mismatched.is_err());
    }

    /// Only names ending in a listed extension count as known.
    #[test]
    fn test_has_known_extension() {
        for known in [".sam", ".bam", ".vcf.gz", "test.assembly_report.txt"] {
            assert!(has_known_extension(known));
        }
        for unknown in [".exe", ".hidden", ".config"] {
            assert!(!has_known_extension(unknown));
        }
    }
}