use rusqlite::{params_from_iter, Connection, Result};
use std::path::PathBuf;
pub fn generate_newick_tree(db: &PathBuf, input_data: Vec<String>, domain: &str) -> Result<String> {
let conn = Connection::open(db).expect("Failed to open database");
let ranks = process_data(input_data, db).expect("Failed to process data");
let table_name = format!("gtdb_tree_{domain}");
let leaf_nodes = crate::tree::get_leaf_nodes_by_rank(&conn, &table_name, &ranks)?;
let newick_tree = crate::tree::build_pruned_tree(&conn, &table_name, 1, &leaf_nodes)?;
if let Some(root) = newick_tree {
let mut newick = String::new();
crate::tree::write_node_to_newick(&root, &mut newick);
newick.push(';');
Ok(newick)
} else {
Ok("".into())
}
}
pub fn process_data(data: Vec<String>, db: &PathBuf) -> std::io::Result<Vec<String>> {
let mut species = Vec::new();
let mut ncbi_taxids = Vec::new();
let mut ncbi_ids = Vec::new();
let conn = Connection::open(db).expect("Failed to open database");
let valid_prefixes = ["c__", "d__", "f__", "g__", "o__", "p__", "s__"];
for item in data {
if valid_prefixes
.iter()
.any(|&prefix| item.starts_with(prefix))
{
species.push(item.to_string());
} else if item.chars().all(char::is_numeric) {
ncbi_taxids.push(item.to_string());
} else if let Some(captures) = regex::Regex::new(r"(?:[A-Za-z]{2}_)?[A-Za-z]{3}_(\d+\.\d+)")
.unwrap()
.captures(&item)
{
let ncbi_id = captures.get(1).unwrap().as_str();
ncbi_ids.push(ncbi_id.to_string());
} else {
return Err(std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to process item: {}", item),
));
}
}
let mut not_found = Vec::new();
let mut results = Vec::new();
if !species.is_empty() {
let placeholders: Vec<String> = species.iter().map(|_| "?".to_string()).collect();
let query = format!(
"SELECT node FROM genome_taxonomy WHERE node IN ({})",
placeholders.join(", ")
);
let mut stmt = conn.prepare(&query).expect("Failed to prepare query");
let rows: Vec<String> = stmt
.query_map(
params_from_iter(species.iter().map(|s| s.as_str())),
|row| row.get::<_, String>(0),
)
.expect("Failed to query")
.filter_map(Result::ok)
.collect();
if rows.len() != species.len() {
let missing_species: Vec<&String> =
species.iter().filter(|s| !rows.contains(s)).collect();
not_found.extend(missing_species);
} else {
results.extend(species);
}
}
if !ncbi_taxids.is_empty() {
let placeholders: Vec<String> = ncbi_taxids.iter().map(|_| "?".to_string()).collect();
let query = format!(
"SELECT ncbi_taxid, parent FROM genome_taxonomy WHERE ncbi_taxid IN ({})",
placeholders.join(", ")
);
let mut stmt = conn.prepare(&query).expect("Failed to prepare query");
let rows: Vec<(String, String)> = stmt
.query_map(
params_from_iter(ncbi_taxids.iter().map(|s| s.as_str())),
|row| {
let ncbi_taxid: String = row.get(0)?;
let parent: String = row.get(1)?;
Ok((ncbi_taxid, parent)) },
)
.expect("Failed to query")
.filter_map(Result::ok)
.collect();
if rows.len() != ncbi_taxids.len() {
let selected: Vec<_> = rows.iter().map(|r| r.0.clone()).collect();
let missing_species: Vec<&String> =
ncbi_ids.iter().filter(|s| !selected.contains(s)).collect();
not_found.extend(missing_species);
} else {
for row in rows {
results.push(row.1);
}
}
}
if !ncbi_ids.is_empty() {
let placeholders: Vec<String> = ncbi_ids.iter().map(|_| "?".to_string()).collect();
let query = format!(
"SELECT ncbi_id, parent FROM genome_taxonomy WHERE ncbi_id IN ({})",
placeholders.join(", ")
);
let mut stmt = conn.prepare(&query).expect("Failed to prepare query");
let rows: Vec<(String, String)> = stmt
.query_map(
params_from_iter(ncbi_ids.iter().map(|s| s.as_str())),
|row| {
let ncbi_id: String = row.get(0)?;
let parent: String = row.get(1)?;
Ok((ncbi_id, parent)) },
)
.expect("Failed to query")
.filter_map(Result::ok)
.collect();
if rows.len() != ncbi_ids.len() {
let selected: Vec<_> = rows.iter().map(|r| r.0.clone()).collect();
let missing_species: Vec<&String> =
ncbi_ids.iter().filter(|s| !selected.contains(s)).collect();
not_found.extend(missing_species);
} else {
for row in rows {
results.push(row.1);
}
}
}
if !not_found.is_empty() {
return Err(std::io::Error::new(
std::io::ErrorKind::Other,
format!(
"Not found in the database. Missing: {}",
not_found
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>()
.join(", ")
),
));
}
Ok(results)
}