pub mod batch;
pub mod field;
pub use batch::BatchBuilder;
use std::sync::Arc;
use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef};
use crate::{OxbowError, Select};
use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES};
/// A list of parsed fields together with the Arrow schema built from them.
///
/// Instances are created through `Model::new_fasta` / `Model::new_fastq`, or
/// derived from an existing model via `Model::project`.
#[derive(Clone, Debug)]
pub struct Model {
// Parsed fields, in the order their names were supplied to the constructor.
fields: Vec<Field>,
// Shared Arrow schema holding one Arrow field per entry in `fields`, in the same order.
schema: SchemaRef,
}
impl Model {
pub fn new_fasta(fields: Select<String>) -> crate::Result<Self> {
let defaults = || {
FASTA_DEFAULT_FIELD_NAMES
.iter()
.map(|&s| s.to_string())
.collect()
};
let field_names = match fields {
Select::All => defaults(),
Select::Some(names) => names,
Select::Omit => Vec::new(),
};
Self::new(field_names)
}
pub fn new_fastq(fields: Select<String>) -> crate::Result<Self> {
let defaults = || {
FASTQ_DEFAULT_FIELD_NAMES
.iter()
.map(|&s| s.to_string())
.collect()
};
let field_names = match fields {
Select::All => defaults(),
Select::Some(names) => names,
Select::Omit => Vec::new(),
};
Self::new(field_names)
}
fn new(field_names: Vec<String>) -> crate::Result<Self> {
let mut parsed_fields = Vec::new();
for name in &field_names {
let field: Field = name
.parse()
.map_err(|e: std::io::Error| OxbowError::invalid_input(e.to_string()))?;
parsed_fields.push(field);
}
let arrow_fields: Vec<ArrowField> =
parsed_fields.iter().map(|f| f.get_arrow_field()).collect();
let schema = Arc::new(Schema::new(arrow_fields));
Ok(Self {
fields: parsed_fields,
schema,
})
}
pub fn fields(&self) -> &[Field] {
&self.fields
}
pub fn field_names(&self) -> Vec<String> {
self.fields.iter().map(|f| f.name().to_string()).collect()
}
pub fn schema(&self) -> &SchemaRef {
&self.schema
}
pub fn column_names(&self) -> Vec<String> {
self.schema
.fields()
.iter()
.map(|f| f.name().clone())
.collect()
}
pub fn project(&self, columns: &[String]) -> crate::Result<Self> {
let available = self.column_names();
let unknown: Vec<&str> = columns
.iter()
.filter(|c| !available.iter().any(|a| a.eq_ignore_ascii_case(c)))
.map(|c| c.as_str())
.collect();
if !unknown.is_empty() {
return Err(OxbowError::invalid_input(format!(
"Unknown columns: {:?}. Available: {:?}",
unknown, available
)));
}
let projected: Vec<String> = self
.fields
.iter()
.filter(|f| columns.iter().any(|c| c.eq_ignore_ascii_case(f.name())))
.map(|f| f.name().to_string())
.collect();
Self::new(projected)
}
}
/// Two models are equal when they hold the same fields in the same order.
impl PartialEq for Model {
    fn eq(&self, other: &Self) -> bool {
        // The schema is not compared: the constructor in this file builds it
        // from `fields`, so comparing the fields is sufficient.
        self.fields.as_slice() == other.fields.as_slice()
    }
}

impl Eq for Model {}
#[cfg(test)]
mod tests {
    use super::*;

    // `Select::All` expands to the FASTA default field list.
    #[test]
    fn test_fasta_defaults() {
        let model = Model::new_fasta(Select::All).unwrap();
        assert_eq!(model.field_names(), vec!["name", "description", "sequence"]);
        assert_eq!(model.schema().fields().len(), 3);
    }

    // `Select::All` expands to the FASTQ default field list.
    #[test]
    fn test_fastq_defaults() {
        let model = Model::new_fastq(Select::All).unwrap();
        assert_eq!(
            model.field_names(),
            vec!["name", "description", "sequence", "quality"]
        );
        assert_eq!(model.schema().fields().len(), 4);
    }

    // An explicit selection is used as given.
    #[test]
    fn test_custom_fields() {
        let model = Model::new_fastq(Select::Some(vec!["name".into(), "sequence".into()])).unwrap();
        assert_eq!(model.field_names(), vec!["name", "sequence"]);
        assert_eq!(model.schema().fields().len(), 2);
    }

    // A name that does not parse as a field is rejected.
    #[test]
    fn test_invalid_field() {
        let result = Model::new_fasta(Select::Some(vec!["invalid".into()]));
        assert!(result.is_err());
    }

    // `Select::Omit` builds a valid model with no fields and an empty schema.
    #[test]
    fn test_omit_yields_empty_model() {
        let model = Model::new_fastq(Select::Omit).unwrap();
        assert!(model.field_names().is_empty());
        assert_eq!(model.schema().fields().len(), 0);
    }

    // Projection keeps only the requested columns.
    #[test]
    fn test_project() {
        let model = Model::new_fastq(Select::All).unwrap();
        let projected = model.project(&["name".into(), "quality".into()]).unwrap();
        assert_eq!(projected.field_names(), vec!["name", "quality"]);
    }

    // Column matching ignores ASCII case, and the result follows the model's
    // field order rather than the order of the request.
    #[test]
    fn test_project_case_insensitive_keeps_model_order() {
        let model = Model::new_fastq(Select::All).unwrap();
        let projected = model.project(&["QUALITY".into(), "Name".into()]).unwrap();
        assert_eq!(projected.field_names(), vec!["name", "quality"]);
    }

    // Requesting a column the model does not have is an error.
    #[test]
    fn test_project_unknown() {
        let model = Model::new_fasta(Select::All).unwrap();
        let result = model.project(&["nonexistent".into()]);
        assert!(result.is_err());
    }
}