orphos_core/engine.rs
1use std::marker::PhantomData;
2use std::path::Path;
3
4use crate::algorithms::dynamic_programming::eliminate_bad_genes;
5use crate::algorithms::dynamic_programming::predict_genes;
6use crate::algorithms::gene_finding::GeneBuilder;
7use crate::config::OrphosConfig;
8use crate::constants::MIN_SEQUENCE_LENGTH;
9use crate::node::{
10 add_nodes, calculate_dicodon_gene, raw_coding_score, rbs_score, record_gc_bias,
11 record_overlapping_starts, score_nodes, sort_nodes_by_position,
12};
13use crate::results::{OrphosResults, SequenceInfo};
14use crate::sequence::calc_most_gc_frame;
15use crate::sequence::encoded::EncodedSequence;
16use crate::sequence::read_fasta_sequences;
17use crate::training::non_sd_training::train_starts_nonsd;
18use crate::training::sd_training::train_starts_sd;
19use crate::training::should_use_sd;
20use crate::types::Gene;
21use crate::types::{OrphosError, Training};
22
23/// Marker trait for Orphos training state.
24///
25/// This trait is used in the type-state pattern to enforce that training
26/// must be performed before gene prediction. It's implemented by both
27/// [`Untrained`] and [`Trained`] marker types.
28pub trait TrainingState {}
29
30/// Marker type indicating an untrained Orphos instance.
31///
32/// A [`Orphos<Untrained>`] instance can perform training operations
33/// but cannot find genes until training is complete.
34#[derive(Debug, Clone)]
35pub struct Untrained;
36
37/// Marker type indicating a trained Orphos instance.
38///
39/// A [`Orphos<Trained>`] instance has completed training and can
40/// perform gene prediction operations.
41#[derive(Debug, Clone)]
42pub struct Trained;
43
44impl TrainingState for Untrained {}
45impl TrainingState for Trained {}
46
47/// Main Orphos configuration and execution engine.
48///
49/// This struct uses the type-state pattern with the `S` type parameter
50/// to ensure training is performed before gene prediction. The state
51/// transitions from [`Untrained`] to [`Trained`] via the training methods.
52///
53/// # Type Parameters
54///
55/// * `S` - The training state, either [`Untrained`] or [`Trained`]
56///
57/// # Examples
58///
59/// ```rust,no_run
60/// use orphos_core::engine::UntrainedOrphos;
61/// use orphos_core::config::OrphosConfig;
62/// use orphos_core::sequence::encoded::EncodedSequence;
63///
64/// // Create an untrained instance
65/// let mut orphos = UntrainedOrphos::new();
66///
67/// // Encode a sequence
68/// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
69/// let encoded = EncodedSequence::without_masking(sequence);
70///
71/// // Train on the sequence and get results
72/// let trained = orphos.train_single_genome(&encoded)?;
73///
74/// // Use the higher-level API to analyze sequences
75/// use orphos_core::OrphosAnalyzer;
76/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
77/// let results = analyzer.analyze_sequence("ATGAAACGCATTAGCACCACCATT...", None)?;
78/// println!("Found {} genes", results.genes.len());
79/// # Ok::<(), orphos_core::types::OrphosError>(())
80/// ```
81#[derive(Debug, Default)]
82pub struct Orphos<S: TrainingState> {
83 /// Configuration options for gene prediction
84 pub config: OrphosConfig,
85 /// Training data obtained from the genome
86 training: Option<Training>,
87 /// Type-state marker (zero-sized)
88 _state: PhantomData<S>,
89}
90
91/// Type alias for an untrained Orphos instance.
92///
93/// Use this when you need to perform training on a new genome.
94pub type UntrainedOrphos = Orphos<Untrained>;
95
96/// Type alias for a trained Orphos instance.
97///
98/// Use this when you have already trained on a genome and want to find genes.
99pub type TrainedOrphos = Orphos<Trained>;
100
101impl UntrainedOrphos {
102 /// Creates a new untrained Orphos instance with default configuration.
103 ///
104 /// # Examples
105 ///
106 /// ```rust
107 /// use orphos_core::engine::UntrainedOrphos;
108 ///
109 /// let orphos = UntrainedOrphos::new();
110 /// ```
111 pub fn new() -> Self {
112 Self {
113 config: OrphosConfig::default(),
114 training: None,
115 _state: PhantomData,
116 }
117 }
118
119 /// Creates a new untrained Orphos instance with custom configuration.
120 ///
121 /// # Arguments
122 ///
123 /// * `config` - Configuration options for gene prediction
124 ///
125 /// # Errors
126 ///
127 /// Returns [`OrphosError`] if thread pool configuration fails.
128 ///
129 /// # Examples
130 ///
131 /// ```rust
132 /// use orphos_core::engine::UntrainedOrphos;
133 /// use orphos_core::config::{OrphosConfig, OutputFormat};
134 ///
135 /// let config = OrphosConfig {
136 /// closed_ends: true,
137 /// output_format: OutputFormat::Gff,
138 /// ..Default::default()
139 /// };
140 ///
141 /// let orphos = UntrainedOrphos::with_config(config)?;
142 /// # Ok::<(), orphos_core::types::OrphosError>(())
143 /// ```
144 pub fn with_config(config: OrphosConfig) -> Result<Self, OrphosError> {
145 let orphos = Self {
146 config,
147 training: None,
148 _state: PhantomData,
149 };
150
151 if let Some(num_threads) = orphos.config.num_threads {
152 rayon::ThreadPoolBuilder::new()
153 .num_threads(num_threads)
154 .build_global()
155 .map_err(|e| {
156 OrphosError::InvalidSequence(format!("Failed to configure thread pool: {}", e))
157 })?;
158 }
159
160 Ok(orphos)
161 }
162
163 /// Trains the model on a single complete genome sequence.
164 ///
165 /// This method analyzes the sequence to build a statistical model of gene
166 /// characteristics including:
167 /// - Start codon usage (ATG, GTG, TTG)
168 /// - Ribosome binding site motifs
169 /// - Codon usage patterns
170 /// - GC content bias
171 ///
172 /// # Arguments
173 ///
174 /// * `encoded_sequence` - The genome sequence encoded in bitmap format
175 ///
176 /// # Returns
177 ///
178 /// A [`TrainedOrphos`] instance ready for gene prediction.
179 ///
180 /// # Errors
181 ///
182 /// Returns [`OrphosError::InvalidSequence`] if:
183 /// - The sequence is shorter than [`MIN_SEQUENCE_LENGTH`]
184 /// - The sequence contains invalid characters
185 /// - Training fails to converge
186 ///
187 /// # Examples
188 ///
189 /// ```rust,no_run
190 /// use orphos_core::engine::UntrainedOrphos;
191 /// use orphos_core::sequence::encoded::EncodedSequence;
192 ///
193 /// let mut orphos = UntrainedOrphos::new();
194 /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
195 /// let encoded = EncodedSequence::without_masking(sequence);
196 ///
197 /// let trained = orphos.train_single_genome(&encoded)?;
198 /// # Ok::<(), orphos_core::types::OrphosError>(())
199 /// ```
200 pub fn train_single_genome(
201 &mut self,
202 encoded_sequence: &EncodedSequence,
203 ) -> Result<TrainedOrphos, OrphosError> {
204 let sequence_length = encoded_sequence.sequence_length;
205 if sequence_length < MIN_SEQUENCE_LENGTH {
206 return Err(OrphosError::InvalidSequence(format!(
207 "Sequence too short for gene prediction: {} bp (minimum {} bp required)",
208 sequence_length, MIN_SEQUENCE_LENGTH
209 )));
210 }
211
212 if !self.config.quiet {
213 eprintln!(
214 "Training on single genome ({} bp, {:.2}% GC)...",
215 sequence_length,
216 encoded_sequence.gc_content * 100.0
217 );
218 }
219 let mut training = Training {
220 gc_content: 0.0,
221 ..Default::default()
222 };
223 training.uses_shine_dalgarno = false;
224 training.gc_bias_factors = [0.0; 3];
225
226 training.gc_content = encoded_sequence.gc_content;
227
228 let mut nodes = Vec::new();
229 let num_nodes = add_nodes(
230 encoded_sequence,
231 &mut nodes,
232 self.config.closed_ends,
233 &training,
234 )?;
235
236 if !self.config.quiet {
237 eprintln!(
238 "Located {} potential start/stop nodes, closed {}",
239 num_nodes, self.config.closed_ends
240 );
241 }
242
243 sort_nodes_by_position(&mut nodes);
244
245 let gc_frame = calc_most_gc_frame(&encoded_sequence.forward_sequence, sequence_length);
246
247 record_gc_bias(&gc_frame, &mut nodes, &mut training);
248
249 if !self.config.quiet {
250 eprintln!(
251 "Frame bias scores: {:.8} {:.8} {:.8}",
252 training.gc_bias_factors[0],
253 training.gc_bias_factors[1],
254 training.gc_bias_factors[2]
255 );
256 }
257
258 record_overlapping_starts(&mut nodes, &training, false);
259
260 let initial_path = predict_genes(&mut nodes, &training, false).unwrap_or(0);
261 calculate_dicodon_gene(
262 &mut training,
263 &encoded_sequence.forward_sequence,
264 &encoded_sequence.reverse_complement_sequence,
265 sequence_length,
266 &nodes,
267 initial_path,
268 );
269
270 raw_coding_score(
271 &encoded_sequence.forward_sequence,
272 &encoded_sequence.reverse_complement_sequence,
273 sequence_length,
274 &mut nodes,
275 &training,
276 );
277
278 rbs_score(
279 &encoded_sequence.forward_sequence,
280 &encoded_sequence.reverse_complement_sequence,
281 sequence_length,
282 &mut nodes,
283 &training,
284 );
285
286 train_starts_sd(
287 &encoded_sequence.forward_sequence,
288 &encoded_sequence.reverse_complement_sequence,
289 sequence_length,
290 &nodes,
291 &mut training,
292 );
293 training.uses_shine_dalgarno = should_use_sd(&training);
294 if self.config.force_non_sd {
295 training.uses_shine_dalgarno = false;
296 }
297
298 if !training.uses_shine_dalgarno {
299 train_starts_nonsd(
300 &encoded_sequence.forward_sequence,
301 &encoded_sequence.reverse_complement_sequence,
302 sequence_length,
303 &mut nodes,
304 &mut training,
305 );
306 }
307
308 if !self.config.quiet {
309 eprintln!("Training complete!");
310 }
311
312 Ok(Orphos {
313 config: self.config.clone(),
314 training: Some(training),
315 _state: PhantomData,
316 })
317 }
318}
319
320impl TrainedOrphos {
321 /// Creates a new trained Orphos instance with pre-computed training data.
322 ///
323 /// This is useful when you have previously computed training data that you
324 /// want to reuse for gene prediction on multiple sequences.
325 ///
326 /// # Arguments
327 ///
328 /// * `config` - Configuration options for gene prediction
329 /// * `training` - Pre-computed training data
330 ///
331 /// # Examples
332 ///
333 /// ```rust,no_run
334 /// use orphos_core::engine::TrainedOrphos;
335 /// use orphos_core::config::OrphosConfig;
336 /// use orphos_core::types::Training;
337 ///
338 /// let config = OrphosConfig::default();
339 /// let training = Training::default(); // In practice, load from file
340 ///
341 /// let trained = TrainedOrphos::new(config, training);
342 /// ```
343 pub const fn new(config: OrphosConfig, training: Training) -> Self {
344 Self {
345 config,
346 training: Some(training),
347 _state: PhantomData,
348 }
349 }
350
351 /// Finds genes in a single genome sequence using the trained model.
352 ///
353 /// Uses dynamic programming to find the optimal set of non-overlapping genes
354 /// based on the statistical model built during training. This method performs:
355 ///
356 /// 1. Node generation (start/stop codon detection)
357 /// 2. Node scoring (coding potential, RBS, start codon usage)
358 /// 3. Dynamic programming gene selection
359 /// 4. Gene quality filtering
360 /// 5. Start position refinement
361 ///
362 /// # Arguments
363 ///
364 /// * `encoded_sequence` - The genome sequence encoded in bitmap format
365 ///
366 /// # Returns
367 ///
368 /// A vector of [`Gene`] predictions sorted by position. Returns an empty
369 /// vector if no genes are found.
370 ///
371 /// # Errors
372 ///
373 /// Returns [`OrphosError::InvalidSequence`] if:
374 /// - The Orphos instance is not properly trained
375 /// - Node generation fails
376 /// - Sequence contains invalid characters
377 ///
378 /// # Examples
379 ///
380 /// ```rust,no_run
381 /// use orphos_core::engine::UntrainedOrphos;
382 /// use orphos_core::sequence::encoded::EncodedSequence;
383 ///
384 /// let mut orphos = UntrainedOrphos::new();
385 /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
386 /// let encoded = EncodedSequence::without_masking(sequence);
387 ///
388 /// let trained = orphos.train_single_genome(&encoded)?;
389 ///
390 /// // Use OrphosAnalyzer for a higher-level API to find genes
391 /// use orphos_core::OrphosAnalyzer;
392 /// use orphos_core::config::OrphosConfig;
393 /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
394 /// let results = analyzer.analyze_sequence("ATGAAACGCATTAGCACCACCATT...", None)?;
395 /// for gene in &results.genes {
396 /// println!("Gene at {}-{} (strand: {:?})",
397 /// gene.coordinates.begin, gene.coordinates.end, gene.coordinates.strand);
398 /// }
399 /// # Ok::<(), orphos_core::types::OrphosError>(())
400 /// ```
401 fn find_genes_single(
402 &self,
403 encoded_sequence: &EncodedSequence,
404 ) -> Result<Vec<Gene>, OrphosError> {
405 let mut nodes = Vec::new();
406 let training = self
407 .training
408 .as_ref()
409 .ok_or_else(|| OrphosError::InvalidSequence("Orphos is not trained".to_string()))?;
410
411 let _num_nodes = add_nodes(
412 encoded_sequence,
413 &mut nodes,
414 self.config.closed_ends,
415 training,
416 )?;
417
418 sort_nodes_by_position(&mut nodes);
419
420 // Tap the training state right before scoring to ensure parity at inference time
421
422 score_nodes(
423 encoded_sequence,
424 &mut nodes,
425 training,
426 self.config.closed_ends,
427 false,
428 )?;
429
430 record_overlapping_starts(&mut nodes, training, true);
431
432 let gene_path = match predict_genes(&mut nodes, training, true) {
433 Some(path) => path,
434 None => {
435 // No genes found - return empty gene list
436 return Ok(vec![]);
437 }
438 };
439
440 eliminate_bad_genes(&mut nodes, Some(gene_path), training);
441
442 let genes = GeneBuilder::from_nodes(&nodes, gene_path, training, 1)
443 .with_tweaked_starts()
444 .with_annotations()
445 .build();
446
447 Ok(genes)
448 }
449}
450
451/// High-level gene finding analyzer with automatic training.
452///
453/// This struct provides a simplified interface for gene prediction that handles
454/// training automatically. It's the recommended entry point for most users.
455///
456/// Unlike the type-safe [`Orphos`] struct, `OrphosAnalyzer` manages training
457/// internally and provides convenient methods for analyzing sequences from various
458/// sources (files, strings, byte slices).
459///
460/// # Modes
461///
462/// - **Single Genome Mode** (default): Trains on each sequence individually for
463/// optimal accuracy on complete genomes
464/// - **Metagenomic Mode**: Uses pre-computed models for fragmented sequences
465///
466/// # Examples
467///
468/// ## Analyze a sequence string
469///
470/// ```rust,no_run
471/// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
472///
473/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
474///
475/// let sequence = "ATGAAACGCATTAGCACCACCATT...";
476/// let results = analyzer.analyze_sequence(sequence, Some("genome1".to_string()))?;
477///
478/// println!("Found {} genes in {} bp sequence",
479/// results.genes.len(),
480/// results.sequence_info.length);
481/// # Ok::<(), orphos_core::types::OrphosError>(())
482/// ```
483///
484/// ## Analyze a FASTA file
485///
486/// ```rust,no_run
487/// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
488///
489/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
490/// let results = analyzer.analyze_fasta_file("genome.fasta")?;
491///
492/// for result in results {
493/// println!("Sequence: {}", result.sequence_info.header);
494/// println!(" Genes: {}", result.genes.len());
495/// println!(" GC%: {:.2}", result.sequence_info.gc_content * 100.0);
496/// }
497/// # Ok::<(), orphos_core::types::OrphosError>(())
498/// ```
499///
500/// ## With custom configuration
501///
502/// ```rust,no_run
503/// use orphos_core::{OrphosAnalyzer, config::{OrphosConfig, OutputFormat}};
504///
505/// let config = OrphosConfig {
506/// closed_ends: true,
507/// mask_n_runs: true,
508/// output_format: OutputFormat::Gff,
509/// num_threads: Some(4),
510/// ..Default::default()
511/// };
512///
513/// let mut analyzer = OrphosAnalyzer::new(config);
514/// # Ok::<(), orphos_core::types::OrphosError>(())
515/// ```
516#[derive(Debug)]
517pub struct OrphosAnalyzer {
518 /// Configuration options for gene prediction
519 pub config: OrphosConfig,
520}
521
522impl OrphosAnalyzer {
523 /// Creates a new analyzer with the specified configuration.
524 ///
525 /// # Arguments
526 ///
527 /// * `config` - Configuration options for gene prediction
528 ///
529 /// # Examples
530 ///
531 /// ```rust
532 /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
533 ///
534 /// let analyzer = OrphosAnalyzer::new(OrphosConfig::default());
535 /// ```
536 pub const fn new(config: OrphosConfig) -> Self {
537 Self { config }
538 }
539
540 /// Analyzes sequences from a FASTA file.
541 ///
542 /// Reads all sequences from the FASTA file and performs gene prediction on each.
543 /// In single genome mode, each sequence is trained and analyzed independently.
544 ///
545 /// # Arguments
546 ///
547 /// * `path` - Path to the FASTA file
548 ///
549 /// # Returns
550 ///
551 /// A vector of [`OrphosResults`], one for each sequence in the file.
552 ///
553 /// # Errors
554 ///
555 /// Returns [`OrphosError`] if:
556 /// - The file cannot be read
557 /// - The FASTA format is invalid
558 /// - Any sequence fails analysis
559 ///
560 /// # Examples
561 ///
562 /// ```rust,no_run
563 /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
564 ///
565 /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
566 /// let results = analyzer.analyze_fasta_file("genomes.fasta")?;
567 ///
568 /// for (i, result) in results.iter().enumerate() {
569 /// println!("Sequence {}: {} genes", i + 1, result.genes.len());
570 /// }
571 /// # Ok::<(), orphos_core::types::OrphosError>(())
572 /// ```
573 pub fn analyze_fasta_file<P: AsRef<Path>>(
574 &mut self,
575 path: P,
576 ) -> Result<Vec<OrphosResults>, OrphosError> {
577 let sequences = read_fasta_sequences(path.as_ref().to_str().unwrap())?;
578
579 if self.config.metagenomic {
580 Ok(vec![])
581 } else {
582 let mut results = Vec::new();
583 for (header, description, seq_bytes) in sequences {
584 let result = self.analyze_sequence_bytes(&seq_bytes, header, description)?;
585 results.push(result);
586 }
587 Ok(results)
588 }
589 }
590
591 /// Analyzes a single sequence from a string.
592 ///
593 /// Converts the string sequence to bytes and performs gene prediction.
594 /// This is a convenience method for analyzing sequences already loaded
595 /// into memory.
596 ///
597 /// # Arguments
598 ///
599 /// * `sequence` - DNA sequence string (A, T, G, C, N)
600 /// * `header` - Optional sequence identifier (defaults to "Orphos_Seq_1")
601 ///
602 /// # Returns
603 ///
604 /// [`OrphosResults`] containing genes, training data, and sequence info.
605 ///
606 /// # Errors
607 ///
608 /// Returns [`OrphosError`] if:
609 /// - The sequence is too short (< 20,000 bp recommended)
610 /// - Training fails
611 /// - Gene prediction fails
612 ///
613 /// # Examples
614 ///
615 /// ```rust,no_run
616 /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
617 ///
618 /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
619 ///
620 /// let sequence = "ATGAAACGCATTAGCACCACCATT...";
621 /// let results = analyzer.analyze_sequence(sequence, Some("E. coli K12".to_string()))?;
622 ///
623 /// println!("Analyzed: {}", results.sequence_info.header);
624 /// println!("Found {} genes", results.genes.len());
625 /// println!("GC content: {:.2}%", results.sequence_info.gc_content * 100.0);
626 /// # Ok::<(), orphos_core::types::OrphosError>(())
627 /// ```
628 pub fn analyze_sequence(
629 &mut self,
630 sequence: &str,
631 header: Option<String>,
632 ) -> Result<OrphosResults, OrphosError> {
633 let seq_bytes = sequence.as_bytes();
634 let header = header.unwrap_or_else(|| "Orphos_Seq_1".to_string());
635
636 self.analyze_sequence_bytes(seq_bytes, header, None)
637 }
638
639 /// Analyzes a single sequence from raw bytes.
640 ///
641 /// This is the core analysis method used by other convenience methods.
642 /// It handles the complete workflow: encoding, training, and gene prediction.
643 ///
644 /// # Arguments
645 ///
646 /// * `sequence` - Raw DNA sequence bytes (ASCII: A, T, G, C, N)
647 /// * `header` - Sequence identifier for output
648 /// * `description` - Optional sequence description
649 ///
650 /// # Returns
651 ///
652 /// [`OrphosResults`] containing:
653 /// - Predicted genes with coordinates and scores
654 /// - Training parameters used
655 /// - Sequence statistics (length, GC content)
656 ///
657 /// # Errors
658 ///
659 /// Returns [`OrphosError`] if:
660 /// - The sequence is too short for reliable training
661 /// - Sequence encoding fails
662 /// - Training or prediction fails
663 ///
664 /// # Examples
665 ///
666 /// ```rust,no_run
667 /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
668 ///
669 /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
670 ///
671 /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
672 /// let results = analyzer.analyze_sequence_bytes(
673 /// sequence,
674 /// "sequence1".to_string(),
675 /// Some("E. coli genome".to_string())
676 /// )?;
677 ///
678 /// println!("Found {} genes", results.genes.len());
679 /// # Ok::<(), orphos_core::types::OrphosError>(())
680 /// ```
681 pub fn analyze_sequence_bytes(
682 &mut self,
683 sequence: &[u8],
684 header: String,
685 description: Option<String>,
686 ) -> Result<OrphosResults, OrphosError> {
687 let sequence_length = sequence.len();
688 let encoded_sequence = self.encode_sequence(sequence);
689
690 let mut untrained_orphos = UntrainedOrphos::with_config(self.config.clone())?;
691 let trained_orphos = untrained_orphos.train_single_genome(&encoded_sequence)?;
692 let genes = trained_orphos.find_genes_single(&encoded_sequence)?;
693
694 Ok(OrphosResults {
695 genes: genes.clone(),
696 training_used: trained_orphos.training.unwrap_or_default(),
697 sequence_info: SequenceInfo {
698 length: sequence_length,
699 gc_content: encoded_sequence.gc_content,
700 num_genes: genes.len(),
701 header,
702 description,
703 },
704 metagenomic_model: if self.config.metagenomic {
705 Some("Best".to_string())
706 } else {
707 None
708 },
709 })
710 }
711
712 /// Encodes a sequence to bitmap format for efficient processing.
713 ///
714 /// Converts the DNA sequence into a compact binary representation that
715 /// enables fast nucleotide lookups. Optionally masks runs of N characters.
716 ///
717 /// # Arguments
718 ///
719 /// * `sequence` - Raw DNA sequence bytes
720 ///
721 /// # Returns
722 ///
723 /// An [`EncodedSequence`] with forward and reverse-complement strands
724 /// encoded in bitmap format.
725 fn encode_sequence(&self, sequence: &[u8]) -> EncodedSequence {
726 if self.config.mask_n_runs {
727 EncodedSequence::with_masking(sequence)
728 } else {
729 EncodedSequence::without_masking(sequence)
730 }
731 }
732}
733
734#[cfg(test)]
735mod tests {
736 use super::*;
737 use crate::config::OutputFormat;
738 use crate::constants::TEST_SEQUENCE_REPEAT_FACTOR;
739 use std::env;
740 use std::fs;
741
742 // Helper function to create a simple test sequence
743 fn create_test_sequence() -> Vec<u8> {
744 // Simple DNA sequence with some genes
745 "ATGAAACGTAAATAG".as_bytes().to_vec()
746 }
747
748 // Helper function to create a longer test sequence for training
749 fn create_training_sequence() -> Vec<u8> {
750 // A longer sequence suitable for training (> 20,000 bp recommended)
751 let basic_gene = "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG";
752 basic_gene
753 .repeat(TEST_SEQUENCE_REPEAT_FACTOR)
754 .as_bytes()
755 .to_vec()
756 }
757
758 // Helper function to create an EncodedSequence for testing
759 fn create_encoded_sequence_for_test(seq: &[u8]) -> EncodedSequence {
760 EncodedSequence::without_masking(seq)
761 }
762
763 #[test]
764 fn test_training_state_traits() {
765 // Test that training state markers implement required traits
766 let _untrained: Untrained = Untrained;
767 let _trained: Trained = Trained;
768
769 // Test that they can be cloned and debugged
770 let untrained_clone = _untrained.clone();
771 let trained_clone = _trained.clone();
772
773 assert_eq!(format!("{:?}", untrained_clone), "Untrained");
774 assert_eq!(format!("{:?}", trained_clone), "Trained");
775 }
776
777 #[test]
778 fn test_untrained_orphos_new() {
779 let orphos = UntrainedOrphos::new();
780
781 assert!(!orphos.config.metagenomic);
782 assert!(!orphos.config.closed_ends);
783 assert!(!orphos.config.mask_n_runs);
784 assert!(!orphos.config.force_non_sd);
785 assert!(!orphos.config.quiet);
786 assert_eq!(orphos.config.output_format, OutputFormat::Genbank);
787 assert!(orphos.training.is_none());
788 }
789
790 #[test]
791 fn test_untrained_orphos_with_config() {
792 let config = OrphosConfig {
793 metagenomic: true,
794 closed_ends: true,
795 quiet: true,
796 ..OrphosConfig::default()
797 };
798
799 let result = UntrainedOrphos::with_config(config.clone());
800 assert!(result.is_ok());
801
802 let orphos = result.unwrap();
803 assert!(orphos.config.metagenomic);
804 assert!(orphos.config.closed_ends);
805 assert!(orphos.config.quiet);
806 assert!(orphos.training.is_none());
807 }
808
809 #[test]
810 fn test_untrained_orphos_with_thread_config() {
811 let config = OrphosConfig {
812 num_threads: Some(2),
813 ..OrphosConfig::default()
814 };
815
816 let result = UntrainedOrphos::with_config(config);
817 // Thread configuration might fail depending on system
818 // Expected to potentially fail
819 let _ = result;
820 }
821
822 #[test]
823 fn test_untrained_orphos_with_invalid_thread_config() {
824 let config = OrphosConfig {
825 num_threads: Some(0),
826 ..OrphosConfig::default()
827 };
828
829 let result = UntrainedOrphos::with_config(config);
830 // Thread configuration might fail depending on system
831 // Rayon might handle it gracefully or it might fail
832 let _ = result;
833 }
834
835 #[test]
836 fn test_trained_orphos_new() {
837 let config = OrphosConfig::default();
838 let training = Training::default();
839
840 let orphos = TrainedOrphos::new(config, training);
841
842 assert!(orphos.training.is_some());
843 }
844
845 #[test]
846 fn test_train_single_genome_basic() {
847 let mut orphos = UntrainedOrphos::new();
848 let sequence = create_training_sequence();
849 let encoded_sequence = create_encoded_sequence_for_test(&sequence);
850
851 let result = orphos.train_single_genome(&encoded_sequence);
852
853 assert!(result.is_ok());
854 let trained = result.unwrap();
855 assert!(trained.training.is_some());
856 }
857
858 #[test]
859 fn test_train_single_genome_quiet_mode() {
860 let config = OrphosConfig {
861 quiet: true,
862 ..OrphosConfig::default()
863 };
864 let mut orphos = UntrainedOrphos::with_config(config).unwrap();
865
866 let sequence = create_training_sequence();
867 let encoded_sequence = create_encoded_sequence_for_test(&sequence);
868
869 let result = orphos.train_single_genome(&encoded_sequence);
870
871 assert!(result.is_ok());
872 }
873
874 #[test]
875 fn test_train_single_genome_force_non_sd() {
876 let config = OrphosConfig {
877 force_non_sd: true,
878 ..OrphosConfig::default()
879 };
880 let mut orphos = UntrainedOrphos::with_config(config).unwrap();
881
882 let sequence = create_training_sequence();
883 let encoded_sequence = create_encoded_sequence_for_test(&sequence);
884
885 let result = orphos.train_single_genome(&encoded_sequence);
886
887 assert!(result.is_ok());
888 let trained = result.unwrap();
889 let training = trained.training.unwrap();
890 assert!(!training.uses_shine_dalgarno); // Should be forced to false
891 }
892
893 #[test]
894 fn test_trained_orphos_find_genes_single() {
895 // First create a trained orphos
896 let mut orphos = UntrainedOrphos::new();
897 let sequence = create_training_sequence();
898 let encoded_sequence = create_encoded_sequence_for_test(&sequence);
899
900 let trained = orphos.train_single_genome(&encoded_sequence).unwrap();
901
902 // Now test gene finding
903 let test_seq = create_test_sequence();
904 let test_encoded_sequence = create_encoded_sequence_for_test(&test_seq);
905
906 let result = trained.find_genes_single(&test_encoded_sequence);
907
908 assert!(result.is_ok());
909 let _genes = result.unwrap();
910 // Should find genes in the sequence - number could be 0 for very short sequences
911 }
912
913 #[test]
914 fn test_trained_orphos_find_genes_without_training() {
915 let config = OrphosConfig::default();
916 let orphos = TrainedOrphos {
917 config,
918 training: None,
919 _state: PhantomData,
920 };
921
922 let test_seq = create_test_sequence();
923 let test_encoded_sequence = create_encoded_sequence_for_test(&test_seq);
924
925 let result = orphos.find_genes_single(&test_encoded_sequence);
926
927 assert!(result.is_err());
928 if let Err(OrphosError::InvalidSequence(msg)) = result {
929 assert!(msg.contains("not trained"));
930 } else {
931 panic!("Expected InvalidSequence error");
932 }
933 }
934
935 #[test]
936 fn test_orphos_analyzer_new() {
937 let config = OrphosConfig::default();
938 let analyzer = OrphosAnalyzer::new(config.clone());
939
940 assert_eq!(analyzer.config.metagenomic, config.metagenomic);
941 assert_eq!(analyzer.config.closed_ends, config.closed_ends);
942 }
943
944 #[test]
945 fn test_analyze_sequence_basic() {
946 let config = OrphosConfig::default();
947 let mut analyzer = OrphosAnalyzer::new(config);
948
949 let sequence =
950 "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
951
952 let result = analyzer.analyze_sequence(&sequence, None);
953 assert!(result.is_ok());
954
955 let analysis = result.unwrap();
956 assert_eq!(analysis.sequence_info.header, "Orphos_Seq_1");
957 assert_eq!(analysis.sequence_info.length, sequence.len());
958 assert!(
959 analysis.sequence_info.gc_content >= 0.0 && analysis.sequence_info.gc_content <= 1.0
960 );
961 assert!(analysis.metagenomic_model.is_none());
962 }
963
964 #[test]
965 fn test_analyze_sequence_with_header() {
966 let config = OrphosConfig::default();
967 let mut analyzer = OrphosAnalyzer::new(config);
968
969 let sequence =
970 "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
971 let header = Some("test_sequence".to_string());
972
973 let result = analyzer.analyze_sequence(&sequence, header);
974 assert!(result.is_ok());
975
976 let analysis = result.unwrap();
977 assert_eq!(analysis.sequence_info.header, "test_sequence");
978 }
979
980 #[test]
981 fn test_analyze_sequence_bytes() {
982 let config = OrphosConfig::default();
983 let mut analyzer = OrphosAnalyzer::new(config);
984
985 let sequence =
986 "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
987 let seq_bytes = sequence.as_bytes();
988 let header = "test_sequence".to_string();
989 let description = Some("Test description".to_string());
990
991 let result =
992 analyzer.analyze_sequence_bytes(seq_bytes, header.clone(), description.clone());
993 assert!(result.is_ok());
994
995 let analysis = result.unwrap();
996 assert_eq!(analysis.sequence_info.header, header);
997 assert_eq!(analysis.sequence_info.description, description);
998 assert_eq!(analysis.sequence_info.length, seq_bytes.len());
999 }
1000
1001 #[test]
1002 fn test_analyze_sequence_metagenomic_config() {
1003 let config = OrphosConfig {
1004 metagenomic: true,
1005 ..OrphosConfig::default()
1006 };
1007 let mut analyzer = OrphosAnalyzer::new(config);
1008
1009 let sequence =
1010 "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1011
1012 let result = analyzer.analyze_sequence(&sequence, None);
1013 assert!(result.is_ok());
1014
1015 let analysis = result.unwrap();
1016 assert!(analysis.metagenomic_model.is_some());
1017 assert_eq!(analysis.metagenomic_model.unwrap(), "Best");
1018 }
1019
1020 // #[test]
1021 // fn test_encode_sequence_basic() {
1022 // let config = OrphosConfig::default();
1023 // let analyzer = OrphosAnalyzer::new(config);
1024
1025 // let sequence = b"ATCG";
1026 // let mut encoded = vec![0u8; (sequence.len() * 2).div_ceil(8)];
1027 // let mut unknown = vec![0u8; sequence.len().div_ceil(8)];
1028 // let mut masks = Vec::new();
1029
1030 // let result = analyzer.encode_sequence(sequence, &mut encoded, &mut unknown, &mut masks);
1031 // assert!(result.is_ok());
1032
1033 // let gc_content = result.unwrap();
1034 // assert!((0.0..=1.0).contains(&gc_content));
1035 // }
1036
1037 // #[test]
1038 // fn test_encode_sequence_with_mask_n_runs() {
1039 // let config = OrphosConfig {
1040 // mask_n_runs: true,
1041 // ..OrphosConfig::default()
1042 // };
1043 // let analyzer = OrphosAnalyzer::new(config);
1044
1045 // let sequence = b"ATCGNNNNGCAT";
1046 // let mut encoded = vec![0u8; (sequence.len() * 2).div_ceil(8)];
1047 // let mut unknown = vec![0u8; sequence.len().div_ceil(8)];
1048 // let mut masks = Vec::new();
1049
1050 // let result = analyzer.encode_sequence(sequence, &mut encoded, &mut unknown, &mut masks);
1051 // assert!(result.is_ok());
1052
1053 // // N-run masking behavior depends on implementation - might not always create masks
1054 // // So we just check that it doesn't crash
1055 // }
1056
1057 #[test]
1058 fn test_analyze_fasta_file_not_found() {
1059 let config = OrphosConfig::default();
1060 let mut analyzer = OrphosAnalyzer::new(config);
1061
1062 let result = analyzer.analyze_fasta_file("nonexistent_file.fa");
1063 assert!(result.is_err());
1064 }
1065
1066 #[test]
1067 fn test_analyze_fasta_file_metagenomic() {
1068 let config = OrphosConfig {
1069 metagenomic: true,
1070 ..OrphosConfig::default()
1071 };
1072 let mut analyzer = OrphosAnalyzer::new(config);
1073
1074 // Create a temporary FASTA file
1075 let fasta_content = ">test_seq\nATCG\n";
1076 let temp_dir = env::temp_dir();
1077 let temp_file = temp_dir.join("test_metagenomic.fa");
1078 fs::write(&temp_file, fasta_content).unwrap();
1079
1080 let result = analyzer.analyze_fasta_file(&temp_file);
1081 assert!(result.is_ok());
1082
1083 let results = result.unwrap();
1084 assert!(results.is_empty()); // Metagenomic mode returns empty for now
1085
1086 let _ = fs::remove_file(temp_file);
1087 }
1088
1089 #[test]
1090 fn test_analyze_fasta_file_single_genome() {
1091 let config = OrphosConfig::default();
1092 let mut analyzer = OrphosAnalyzer::new(config);
1093
1094 // Create a temporary FASTA file with a longer sequence for training
1095 let sequence =
1096 "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1097 let fasta_content = format!(">test_seq\n{}\n", sequence);
1098 let temp_dir = env::temp_dir();
1099 let temp_file = temp_dir.join("test_single_genome.fa");
1100 fs::write(&temp_file, fasta_content).unwrap();
1101
1102 let result = analyzer.analyze_fasta_file(&temp_file);
1103 assert!(result.is_ok());
1104
1105 let results = result.unwrap();
1106 assert_eq!(results.len(), 1);
1107 assert_eq!(results[0].sequence_info.header, "test_seq");
1108
1109 let _ = fs::remove_file(temp_file);
1110 }
1111
1112 #[test]
1113 fn test_analyze_empty_sequence() {
1114 let config = OrphosConfig::default();
1115 let mut analyzer = OrphosAnalyzer::new(config);
1116
1117 let sequence = "";
1118 let result = analyzer.analyze_sequence(sequence, None);
1119
1120 assert!(result.is_err());
1121 if let Err(e) = result {
1122 // Should be an InvalidSequence error about being too short
1123 match e {
1124 OrphosError::InvalidSequence(msg) => {
1125 assert!(msg.contains("too short"));
1126 }
1127 _ => panic!("Expected InvalidSequence error for empty sequence"),
1128 }
1129 }
1130 }
1131
1132 #[test]
1133 fn test_analyze_very_short_sequence() {
1134 let config = OrphosConfig::default();
1135 let mut analyzer = OrphosAnalyzer::new(config);
1136
1137 let sequence = "ATG"; // Very short sequence (3 bp)
1138 let result = analyzer.analyze_sequence(sequence, None);
1139
1140 assert!(result.is_err());
1141 if let Err(e) = result {
1142 // Should be an InvalidSequence error about being too short
1143 match e {
1144 OrphosError::InvalidSequence(msg) => {
1145 assert!(msg.contains("too short"));
1146 }
1147 _ => panic!("Expected InvalidSequence error for very short sequence"),
1148 }
1149 }
1150 }
1151
1152 #[test]
1153 fn test_config_cloning() {
1154 let config1 = OrphosConfig::default();
1155 let orphos1 = UntrainedOrphos::with_config(config1.clone()).unwrap();
1156
1157 let config2 = orphos1.config.clone();
1158 let _orphos2 = UntrainedOrphos::with_config(config2).unwrap();
1159 }
1160
1161 #[test]
1162 fn test_debug_formatting() {
1163 let orphos = UntrainedOrphos::new();
1164 let debug_str = format!("{:?}", orphos);
1165 assert!(debug_str.contains("Orphos"));
1166 assert!(debug_str.contains("config"));
1167
1168 let analyzer = OrphosAnalyzer::new(OrphosConfig::default());
1169 let debug_str2 = format!("{:?}", analyzer);
1170 assert!(debug_str2.contains("OrphosAnalyzer"));
1171 assert!(debug_str2.contains("config"));
1172 }
1173
1174 #[test]
1175 fn test_type_aliases() {
1176 // Test that type aliases work correctly
1177 let _untrained: UntrainedOrphos = UntrainedOrphos::new();
1178 let _trained: TrainedOrphos =
1179 TrainedOrphos::new(OrphosConfig::default(), Training::default());
1180
1181 assert_eq!(
1182 std::any::type_name::<UntrainedOrphos>(),
1183 std::any::type_name::<Orphos<Untrained>>()
1184 );
1185 assert_eq!(
1186 std::any::type_name::<TrainedOrphos>(),
1187 std::any::type_name::<Orphos<Trained>>()
1188 );
1189 }
1190
1191 #[test]
1192 fn test_training_state_phantom_data() {
1193 let untrained = UntrainedOrphos::new();
1194 let trained = TrainedOrphos::new(OrphosConfig::default(), Training::default());
1195
1196 assert_eq!(std::mem::size_of_val(&untrained._state), 0);
1197 assert_eq!(std::mem::size_of_val(&trained._state), 0);
1198 }
1199
1200 #[test]
1201 fn test_analyzer_multiple_sequences() {
1202 let config = OrphosConfig::default();
1203 let mut analyzer = OrphosAnalyzer::new(config);
1204
1205 let sequence1 =
1206 "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(150);
1207 let sequence2 =
1208 "ATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATAG".repeat(200);
1209
1210 let result1 = analyzer.analyze_sequence(&sequence1, Some("seq1".to_string()));
1211 let result2 = analyzer.analyze_sequence(&sequence2, Some("seq2".to_string()));
1212
1213 assert!(result1.is_ok());
1214 assert!(result2.is_ok());
1215
1216 let analysis1 = result1.unwrap();
1217 let analysis2 = result2.unwrap();
1218
1219 assert_eq!(analysis1.sequence_info.header, "seq1");
1220 assert_eq!(analysis2.sequence_info.header, "seq2");
1221 assert_ne!(
1222 analysis1.sequence_info.length,
1223 analysis2.sequence_info.length
1224 );
1225 }
1226
1227 #[test]
1228 fn test_error_handling_edge_cases() {
1229 let config = OrphosConfig::default();
1230 let mut analyzer = OrphosAnalyzer::new(config);
1231
1232 // Test with sequence containing invalid characters
1233 let invalid_sequence = "ATCGXYZ123";
1234 let result = analyzer.analyze_sequence(invalid_sequence, None);
1235
1236 // Should either handle gracefully or return appropriate error
1237 let _ = result;
1238 }
1239}