use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
/// Marker ids that `count_rows_and_chromosomes` looks for on Y-chromosome rows
/// (chromosome `Y` or `24`) of files whose source was detected as 23andMe.
///
/// A marker counts as "called" when its genotype is non-empty and is neither
/// `--` nor `00`; five or more called markers make the inferred sex `Male`.
const MALE_Y_MARKERS: &[&str] = &[
"rs11575897",
"rs2534636",
"i3000043",
"i3000045",
"i4000162",
"rs13303871",
"rs35284970",
"rs3895",
"i4000120",
"i4000121",
];
/// Summary of what could be learned about a (possible) genotype file.
///
/// `detect_genotype_metadata` fills only the first three fields;
/// `analyze_genotype_file` additionally fills the counts and the inferred sex.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenotypeMetadata {
/// `"Genotype"` when the sampled rows look like genotype data, otherwise `"Unknown"`.
pub data_type: String,
/// Provider name derived from the header comments (e.g. `"23andMe"`), or
/// `"Unknown"`; `None` when the file is missing or is not recognised as genotype data.
pub source: Option<String>,
/// Reference build derived from the header comments: `"36"`, `"37"`, `"38"`
/// or `"Unknown"`; `None` when the file is missing or is not recognised as genotype data.
pub grch_version: Option<String>,
/// Number of data rows; only set by `analyze_genotype_file`.
pub row_count: Option<i64>,
/// Number of distinct chromosome labels seen; only set by `analyze_genotype_file`.
pub chromosome_count: Option<i64>,
/// `"Male"`, `"Female"` or `"Unknown"`; only set by `analyze_genotype_file`.
pub inferred_sex: Option<String>,
}
impl Default for GenotypeMetadata {
    /// Builds the "nothing detected" value: `data_type` is `"Unknown"` and
    /// every optional field is `None`.
    fn default() -> Self {
        let data_type = String::from("Unknown");
        Self {
            data_type,
            source: None,
            grch_version: None,
            row_count: None,
            chromosome_count: None,
            inferred_sex: None,
        }
    }
}
/// Sniffs the beginning of a file to decide whether it is a genotype export
/// and, if so, which provider and reference build its header mentions.
///
/// Only the first 50 lines are read, and reading stops early once 10
/// non-empty, non-comment lines have been collected. Lines starting with `#`
/// or `//` are treated as header comments and searched (case-insensitively)
/// for provider and build hints.
///
/// # Returns
///
/// * `GenotypeMetadata::default()` when the path does not exist or the
///   sampled rows do not pass `validate_genotype_structure`.
/// * Otherwise a value with `data_type == "Genotype"`, `source` and
///   `grch_version` filled in, and the count/sex fields left as `None`.
///
/// # Errors
///
/// Fails when the file cannot be opened or a line cannot be read.
pub fn detect_genotype_metadata(file_path: &str) -> Result<GenotypeMetadata> {
    // Upper bound on physical lines inspected, comments and blanks included.
    const MAX_SCANNED_LINES: usize = 50;
    // Number of data rows that is enough to judge the file structure.
    const MAX_SAMPLE_ROWS: usize = 10;

    let path = Path::new(file_path);
    if !path.exists() {
        return Ok(GenotypeMetadata::default());
    }

    let mut header_text = String::new();
    let mut sample_rows: Vec<String> = Vec::new();

    let scanned = BufReader::new(File::open(path)?)
        .lines()
        .take(MAX_SCANNED_LINES);
    for line_result in scanned {
        let line = line_result?;

        let is_comment = line.starts_with('#') || line.starts_with("//");
        if is_comment {
            header_text.push_str(&line);
            header_text.push('\n');
            continue;
        }
        if line.trim().is_empty() {
            continue;
        }

        sample_rows.push(line);
        if sample_rows.len() >= MAX_SAMPLE_ROWS {
            break;
        }
    }

    if !validate_genotype_structure(&sample_rows) {
        return Ok(GenotypeMetadata::default());
    }

    // The detectors expect an already lower-cased header.
    let header_lower = header_text.to_lowercase();
    Ok(GenotypeMetadata {
        data_type: "Genotype".to_string(),
        source: detect_source(&header_lower),
        grch_version: detect_grch_version(&header_lower),
        row_count: None,
        chromosome_count: None,
        inferred_sex: None,
    })
}
/// Runs the quick header detection and then a full pass over the file to add
/// row count, distinct chromosome count and an inferred sex.
///
/// The full pass is performed regardless of what the header detection
/// concluded, so the counts are populated even when `data_type` is
/// `"Unknown"`.
///
/// # Returns
///
/// `GenotypeMetadata::default()` when the path does not exist; otherwise the
/// detected metadata with `row_count`, `chromosome_count` and `inferred_sex`
/// all set to `Some(..)`.
///
/// # Errors
///
/// Fails when the file cannot be opened or read in either pass.
pub fn analyze_genotype_file(file_path: &str) -> Result<GenotypeMetadata> {
    if !Path::new(file_path).exists() {
        return Ok(GenotypeMetadata::default());
    }

    let GenotypeMetadata {
        data_type,
        source,
        grch_version,
        ..
    } = detect_genotype_metadata(file_path)?;

    // The provider name selects the sex-inference strategy used by the scan.
    let (rows, distinct_chromosomes, sex) =
        count_rows_and_chromosomes(file_path, source.as_deref())?;

    Ok(GenotypeMetadata {
        data_type,
        source,
        grch_version,
        row_count: Some(rows as i64),
        chromosome_count: Some(distinct_chromosomes as i64),
        inferred_sex: Some(sex),
    })
}
/// Reads the whole file and returns `(row_count, chromosome_count, inferred_sex)`.
///
/// Lines starting with `#` or `//` and blank lines are skipped. An
/// uncommented column-header row (first tab-separated field equal to `rsid`,
/// ignoring ASCII case, as found in e.g. AncestryDNA exports) is skipped as
/// well, so it is neither counted as a data row nor does its `chromosome`
/// label inflate the chromosome count. Every other line counts as one row;
/// rows with at least four tab-separated fields also contribute their second
/// field to the set of distinct chromosome labels.
///
/// Sex inference:
///
/// * When `source` contains `"23andMe"`: `"Male"` if at least five of
///   `MALE_Y_MARKERS` have a called genotype (non-empty, not `--`, not `00`)
///   on chromosome `Y`/`24`; otherwise `"Female"` if an `X`/`23` chromosome
///   was seen; otherwise `"Unknown"`.
/// * For any other source: `"Female"` only when `X`/`23` is present and
///   `Y`/`24` is absent; otherwise `"Unknown"`.
///
/// # Errors
///
/// Fails when the file cannot be opened or a line cannot be read.
fn count_rows_and_chromosomes(
    file_path: &str,
    source: Option<&str>,
) -> Result<(usize, usize, String)> {
    let reader = BufReader::new(File::open(file_path)?);
    let is_23andme = source.map_or(false, |s| s.contains("23andMe"));

    let mut row_count = 0;
    let mut chromosomes: HashSet<String> = HashSet::new();
    let mut male_markers_called = 0;

    for line_result in reader.lines() {
        let line = line_result?;
        if line.starts_with('#') || line.starts_with("//") || line.trim().is_empty() {
            continue;
        }

        let mut fields = line.split('\t');
        // `split` always yields at least one item; the fallback is never used.
        let rsid = fields.next().unwrap_or("").trim();

        // Column-header row, not data: do not count it.
        if rsid.eq_ignore_ascii_case("rsid") {
            continue;
        }
        row_count += 1;

        // Only rows with rsid, chromosome, position and genotype columns
        // contribute to the chromosome set and the marker tally.
        if let (Some(chr), Some(_position), Some(genotype)) =
            (fields.next(), fields.next(), fields.next())
        {
            let chr = chr.trim();
            let genotype = genotype.trim();

            // Allocate an owned label only the first time it is seen.
            if !chromosomes.contains(chr) {
                chromosomes.insert(chr.to_string());
            }

            let on_y = chr == "Y" || chr == "24";
            let is_called = !genotype.is_empty() && genotype != "--" && genotype != "00";
            if is_23andme && on_y && is_called && MALE_Y_MARKERS.contains(&rsid) {
                male_markers_called += 1;
            }
        }
    }

    let has_x = chromosomes.contains("X") || chromosomes.contains("23");
    let has_y = chromosomes.contains("Y") || chromosomes.contains("24");

    let inferred_sex = if is_23andme {
        if male_markers_called >= 5 {
            "Male"
        } else if has_x {
            "Female"
        } else {
            "Unknown"
        }
    } else if has_x && !has_y {
        // Without the marker heuristic a present Y chromosome is ambiguous,
        // so only "X without Y" is reported.
        "Female"
    } else {
        "Unknown"
    };

    Ok((row_count, chromosomes.len(), inferred_sex.to_string()))
}
/// Decides whether a sample of data lines looks like genotype rows.
///
/// A line is considered valid when it has at least four tab-separated fields
/// and, after trimming each field:
///
/// 1. the first starts with `rs` or `i`,
/// 2. the second passes `is_valid_chromosome`,
/// 3. the third parses as a `u64` position,
/// 4. the fourth passes `is_valid_genotype`.
///
/// Returns `true` when the number of valid lines reaches
/// `floor(0.7 * data_lines.len())`, but never less than one valid line.
/// Returns `false` for an empty sample.
fn validate_genotype_structure(data_lines: &[String]) -> bool {
    if data_lines.is_empty() {
        return false;
    }

    let valid_lines = data_lines
        .iter()
        .filter(|line| {
            let mut fields = line.split('\t');
            match (fields.next(), fields.next(), fields.next(), fields.next()) {
                (Some(rsid), Some(chr), Some(position), Some(genotype)) => {
                    let rsid = rsid.trim();
                    (rsid.starts_with("rs") || rsid.starts_with('i'))
                        && is_valid_chromosome(chr.trim())
                        && position.trim().parse::<u64>().is_ok()
                        && is_valid_genotype(genotype.trim())
                }
                // Fewer than four columns.
                _ => false,
            }
        })
        .count();

    // Integer division floors to 0 for a single sampled line, which would
    // accept a file with no valid row at all; require at least one.
    let threshold = ((data_lines.len() * 7) / 10).max(1);
    valid_lines >= threshold
}
/// Returns `true` when `chr` is an accepted chromosome label.
///
/// Accepted labels are the numbers `1` through `22` and, case-insensitively,
/// `X`, `Y`, `MT` and `M`. Any other label that parses as a `u8` is rejected,
/// which includes the numeric sex-chromosome codes `23` and `24`.
fn is_valid_chromosome(chr: &str) -> bool {
    // Numeric labels are decided purely by range, so a numeric string never
    // reaches the name match below.
    if let Ok(num) = chr.parse::<u8>() {
        return (1..=22).contains(&num);
    }
    matches!(chr.to_uppercase().as_str(), "X" | "Y" | "MT" | "M")
}
/// Returns `true` when `genotype` is a plausible one- or two-symbol call.
///
/// The value is upper-cased first; it must then be one or two bytes long and
/// consist only of `A`, `T`, `G`, `C`, `D`, `I`, `-` or `0`. This covers the
/// no-call spellings `--` and `00`. The empty string is rejected.
fn is_valid_genotype(genotype: &str) -> bool {
    let upper = genotype.to_uppercase();
    let length_ok = (1..=2).contains(&upper.len());
    length_ok
        && upper
            .chars()
            .all(|symbol| matches!(symbol, 'A' | 'T' | 'G' | 'C' | 'D' | 'I' | '-' | '0'))
}
/// Maps header text to a provider name.
///
/// `header_lower` must already be lower-cased; matching is by plain substring.
/// Providers are tried in the order listed below and the first hit wins.
/// Always returns `Some`; the value is `"Unknown"` when nothing matches.
fn detect_source(header_lower: &str) -> Option<String> {
    // (provider name, substrings that identify it), in priority order.
    const KNOWN_SOURCES: &[(&str, &[&str])] = &[
        ("23andMe", &["23andme"]),
        ("AncestryDNA", &["ancestrydna", "ancestry dna"]),
        ("Genes for Good", &["genes for good", "genesforgood"]),
        ("Dynamic DNA", &["dynamic dna", "ddna", "dynamicdnalabs"]),
    ];

    let provider = KNOWN_SOURCES
        .iter()
        .find(|entry| entry.1.iter().any(|needle| header_lower.contains(*needle)))
        .map_or("Unknown", |entry| entry.0);

    Some(provider.to_string())
}
/// Maps header text to a reference-build number.
///
/// `header_lower` must already be lower-cased; matching is by plain substring.
/// Build 36 is checked first, then 37, then 38, and the first hit wins.
/// Always returns `Some`; the value is `"Unknown"` when no build is mentioned.
fn detect_grch_version(header_lower: &str) -> Option<String> {
    let mentions_any =
        |needles: &[&str]| needles.iter().any(|needle| header_lower.contains(*needle));

    let version = if mentions_any(&["build 36", "grch36", "hg18"]) {
        "36"
    } else if mentions_any(&["build 37", "grch37", "hg19"]) {
        "37"
    } else if mentions_any(&["build 38", "grch38", "hg38"]) {
        "38"
    } else {
        "Unknown"
    };

    Some(version.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Chromosome labels: autosomes and named chromosomes pass, everything
    /// else (including the numeric code `23`) is rejected.
    #[test]
    fn test_valid_chromosome() {
        let cases = [
            ("1", true),
            ("22", true),
            ("X", true),
            ("Y", true),
            ("MT", true),
            ("0", false),
            ("23", false),
            ("ABC", false),
        ];
        for &(label, expected) in cases.iter() {
            // Pair the result with its input so a failure names the case.
            assert_eq!((label, is_valid_chromosome(label)), (label, expected));
        }
    }

    /// Genotype calls: one or two allowed symbols pass, the empty string,
    /// over-long values and foreign letters are rejected.
    #[test]
    fn test_valid_genotype() {
        let cases = [
            ("AA", true),
            ("AG", true),
            ("TT", true),
            ("--", true),
            ("00", true),
            ("D", true),
            ("I", true),
            ("DD", true),
            ("A", true),
            ("", false),
            ("AAA", false),
            ("XY", false),
        ];
        for &(call, expected) in cases.iter() {
            // Pair the result with its input so a failure names the case.
            assert_eq!((call, is_valid_genotype(call)), (call, expected));
        }
    }

    /// Provider detection from lower-cased header text.
    #[test]
    fn test_detect_source() {
        let cases = [
            ("this is from 23andme", "23andMe"),
            ("ancestrydna test", "AncestryDNA"),
            ("genes for good data", "Genes for Good"),
            (
                "# this data file generated by dynamic dna (ddna) laboratories",
                "Dynamic DNA",
            ),
            ("data from ddna", "Dynamic DNA"),
            ("https://dynamicdnalabs.com", "Dynamic DNA"),
            ("unknown source", "Unknown"),
        ];
        for &(header, expected) in cases.iter() {
            assert_eq!(detect_source(header), Some(expected.to_string()));
        }
    }

    /// Reference-build detection from lower-cased header text.
    #[test]
    fn test_detect_grch_version() {
        let cases = [
            ("build 36", "36"),
            ("grch37", "37"),
            ("hg38", "38"),
            ("build 38", "38"),
            ("grch38", "38"),
            (
                "chromosomal location realtive to build 38 of the human reference",
                "38",
            ),
            ("no version info", "Unknown"),
        ];
        for &(header, expected) in cases.iter() {
            assert_eq!(detect_grch_version(header), Some(expected.to_string()));
        }
    }
}