orphos_core/
engine.rs

1use std::marker::PhantomData;
2use std::path::Path;
3
4use crate::algorithms::dynamic_programming::eliminate_bad_genes;
5use crate::algorithms::dynamic_programming::predict_genes;
6use crate::algorithms::gene_finding::GeneBuilder;
7use crate::config::OrphosConfig;
8use crate::constants::MIN_SEQUENCE_LENGTH;
9use crate::node::{
10    add_nodes, calculate_dicodon_gene, raw_coding_score, rbs_score, record_gc_bias,
11    record_overlapping_starts, score_nodes, sort_nodes_by_position,
12};
13use crate::results::{OrphosResults, SequenceInfo};
14use crate::sequence::calc_most_gc_frame;
15use crate::sequence::encoded::EncodedSequence;
16use crate::sequence::read_fasta_sequences;
17use crate::training::non_sd_training::train_starts_nonsd;
18use crate::training::sd_training::train_starts_sd;
19use crate::training::should_use_sd;
20use crate::types::Gene;
21use crate::types::{OrphosError, Training};
22
23/// Marker trait for Orphos training state.
24///
25/// This trait is used in the type-state pattern to enforce that training
26/// must be performed before gene prediction. It's implemented by both
27/// [`Untrained`] and [`Trained`] marker types.
28pub trait TrainingState {}
29
30/// Marker type indicating an untrained Orphos instance.
31///
32/// A [`Orphos<Untrained>`] instance can perform training operations
33/// but cannot find genes until training is complete.
34#[derive(Debug, Clone)]
35pub struct Untrained;
36
37/// Marker type indicating a trained Orphos instance.
38///
39/// A [`Orphos<Trained>`] instance has completed training and can
40/// perform gene prediction operations.
41#[derive(Debug, Clone)]
42pub struct Trained;
43
44impl TrainingState for Untrained {}
45impl TrainingState for Trained {}
46
47/// Main Orphos configuration and execution engine.
48///
49/// This struct uses the type-state pattern with the `S` type parameter
50/// to ensure training is performed before gene prediction. The state
51/// transitions from [`Untrained`] to [`Trained`] via the training methods.
52///
53/// # Type Parameters
54///
55/// * `S` - The training state, either [`Untrained`] or [`Trained`]
56///
57/// # Examples
58///
59/// ```rust,no_run
60/// use orphos_core::engine::UntrainedOrphos;
61/// use orphos_core::config::OrphosConfig;
62/// use orphos_core::sequence::encoded::EncodedSequence;
63///
64/// // Create an untrained instance
65/// let mut orphos = UntrainedOrphos::new();
66///
67/// // Encode a sequence
68/// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
69/// let encoded = EncodedSequence::without_masking(sequence);
70///
71/// // Train on the sequence and get results
72/// let trained = orphos.train_single_genome(&encoded)?;
73///
74/// // Use the higher-level API to analyze sequences
75/// use orphos_core::OrphosAnalyzer;
76/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
77/// let results = analyzer.analyze_sequence("ATGAAACGCATTAGCACCACCATT...", None)?;
78/// println!("Found {} genes", results.genes.len());
79/// # Ok::<(), orphos_core::types::OrphosError>(())
80/// ```
81#[derive(Debug, Default)]
82pub struct Orphos<S: TrainingState> {
83    /// Configuration options for gene prediction
84    pub config: OrphosConfig,
85    /// Training data obtained from the genome
86    training: Option<Training>,
87    /// Type-state marker (zero-sized)
88    _state: PhantomData<S>,
89}
90
91/// Type alias for an untrained Orphos instance.
92///
93/// Use this when you need to perform training on a new genome.
94pub type UntrainedOrphos = Orphos<Untrained>;
95
96/// Type alias for a trained Orphos instance.
97///
98/// Use this when you have already trained on a genome and want to find genes.
99pub type TrainedOrphos = Orphos<Trained>;
100
101impl UntrainedOrphos {
102    /// Creates a new untrained Orphos instance with default configuration.
103    ///
104    /// # Examples
105    ///
106    /// ```rust
107    /// use orphos_core::engine::UntrainedOrphos;
108    ///
109    /// let orphos = UntrainedOrphos::new();
110    /// ```
111    pub fn new() -> Self {
112        Self {
113            config: OrphosConfig::default(),
114            training: None,
115            _state: PhantomData,
116        }
117    }
118
119    /// Creates a new untrained Orphos instance with custom configuration.
120    ///
121    /// # Arguments
122    ///
123    /// * `config` - Configuration options for gene prediction
124    ///
125    /// # Errors
126    ///
127    /// Returns [`OrphosError`] if thread pool configuration fails.
128    ///
129    /// # Examples
130    ///
131    /// ```rust
132    /// use orphos_core::engine::UntrainedOrphos;
133    /// use orphos_core::config::{OrphosConfig, OutputFormat};
134    ///
135    /// let config = OrphosConfig {
136    ///     closed_ends: true,
137    ///     output_format: OutputFormat::Gff,
138    ///     ..Default::default()
139    /// };
140    ///
141    /// let orphos = UntrainedOrphos::with_config(config)?;
142    /// # Ok::<(), orphos_core::types::OrphosError>(())
143    /// ```
144    pub fn with_config(config: OrphosConfig) -> Result<Self, OrphosError> {
145        let orphos = Self {
146            config,
147            training: None,
148            _state: PhantomData,
149        };
150
151        if let Some(num_threads) = orphos.config.num_threads {
152            rayon::ThreadPoolBuilder::new()
153                .num_threads(num_threads)
154                .build_global()
155                .map_err(|e| {
156                    OrphosError::InvalidSequence(format!("Failed to configure thread pool: {}", e))
157                })?;
158        }
159
160        Ok(orphos)
161    }
162
163    /// Trains the model on a single complete genome sequence.
164    ///
165    /// This method analyzes the sequence to build a statistical model of gene
166    /// characteristics including:
167    /// - Start codon usage (ATG, GTG, TTG)
168    /// - Ribosome binding site motifs
169    /// - Codon usage patterns
170    /// - GC content bias
171    ///
172    /// # Arguments
173    ///
174    /// * `encoded_sequence` - The genome sequence encoded in bitmap format
175    ///
176    /// # Returns
177    ///
178    /// A [`TrainedOrphos`] instance ready for gene prediction.
179    ///
180    /// # Errors
181    ///
182    /// Returns [`OrphosError::InvalidSequence`] if:
183    /// - The sequence is shorter than [`MIN_SEQUENCE_LENGTH`]
184    /// - The sequence contains invalid characters
185    /// - Training fails to converge
186    ///
187    /// # Examples
188    ///
189    /// ```rust,no_run
190    /// use orphos_core::engine::UntrainedOrphos;
191    /// use orphos_core::sequence::encoded::EncodedSequence;
192    ///
193    /// let mut orphos = UntrainedOrphos::new();
194    /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
195    /// let encoded = EncodedSequence::without_masking(sequence);
196    ///
197    /// let trained = orphos.train_single_genome(&encoded)?;
198    /// # Ok::<(), orphos_core::types::OrphosError>(())
199    /// ```
200    pub fn train_single_genome(
201        &mut self,
202        encoded_sequence: &EncodedSequence,
203    ) -> Result<TrainedOrphos, OrphosError> {
204        let sequence_length = encoded_sequence.sequence_length;
205        if sequence_length < MIN_SEQUENCE_LENGTH {
206            return Err(OrphosError::InvalidSequence(format!(
207                "Sequence too short for gene prediction: {} bp (minimum {} bp required)",
208                sequence_length, MIN_SEQUENCE_LENGTH
209            )));
210        }
211
212        if !self.config.quiet {
213            eprintln!(
214                "Training on single genome ({} bp, {:.2}% GC)...",
215                sequence_length,
216                encoded_sequence.gc_content * 100.0
217            );
218        }
219        let mut training = Training {
220            gc_content: 0.0,
221            ..Default::default()
222        };
223        training.uses_shine_dalgarno = false;
224        training.gc_bias_factors = [0.0; 3];
225
226        training.gc_content = encoded_sequence.gc_content;
227
228        let mut nodes = Vec::new();
229        let num_nodes = add_nodes(
230            encoded_sequence,
231            &mut nodes,
232            self.config.closed_ends,
233            &training,
234        )?;
235
236        if !self.config.quiet {
237            eprintln!(
238                "Located {} potential start/stop nodes, closed {}",
239                num_nodes, self.config.closed_ends
240            );
241        }
242
243        sort_nodes_by_position(&mut nodes);
244
245        let gc_frame = calc_most_gc_frame(&encoded_sequence.forward_sequence, sequence_length);
246
247        record_gc_bias(&gc_frame, &mut nodes, &mut training);
248
249        if !self.config.quiet {
250            eprintln!(
251                "Frame bias scores: {:.8} {:.8} {:.8}",
252                training.gc_bias_factors[0],
253                training.gc_bias_factors[1],
254                training.gc_bias_factors[2]
255            );
256        }
257
258        record_overlapping_starts(&mut nodes, &training, false);
259
260        let initial_path = predict_genes(&mut nodes, &training, false).unwrap_or(0);
261        calculate_dicodon_gene(
262            &mut training,
263            &encoded_sequence.forward_sequence,
264            &encoded_sequence.reverse_complement_sequence,
265            sequence_length,
266            &nodes,
267            initial_path,
268        );
269
270        raw_coding_score(
271            &encoded_sequence.forward_sequence,
272            &encoded_sequence.reverse_complement_sequence,
273            sequence_length,
274            &mut nodes,
275            &training,
276        );
277
278        rbs_score(
279            &encoded_sequence.forward_sequence,
280            &encoded_sequence.reverse_complement_sequence,
281            sequence_length,
282            &mut nodes,
283            &training,
284        );
285
286        train_starts_sd(
287            &encoded_sequence.forward_sequence,
288            &encoded_sequence.reverse_complement_sequence,
289            sequence_length,
290            &nodes,
291            &mut training,
292        );
293        training.uses_shine_dalgarno = should_use_sd(&training);
294        if self.config.force_non_sd {
295            training.uses_shine_dalgarno = false;
296        }
297
298        if !training.uses_shine_dalgarno {
299            train_starts_nonsd(
300                &encoded_sequence.forward_sequence,
301                &encoded_sequence.reverse_complement_sequence,
302                sequence_length,
303                &mut nodes,
304                &mut training,
305            );
306        }
307
308        if !self.config.quiet {
309            eprintln!("Training complete!");
310        }
311
312        Ok(Orphos {
313            config: self.config.clone(),
314            training: Some(training),
315            _state: PhantomData,
316        })
317    }
318}
319
320impl TrainedOrphos {
321    /// Creates a new trained Orphos instance with pre-computed training data.
322    ///
323    /// This is useful when you have previously computed training data that you
324    /// want to reuse for gene prediction on multiple sequences.
325    ///
326    /// # Arguments
327    ///
328    /// * `config` - Configuration options for gene prediction
329    /// * `training` - Pre-computed training data
330    ///
331    /// # Examples
332    ///
333    /// ```rust,no_run
334    /// use orphos_core::engine::TrainedOrphos;
335    /// use orphos_core::config::OrphosConfig;
336    /// use orphos_core::types::Training;
337    ///
338    /// let config = OrphosConfig::default();
339    /// let training = Training::default(); // In practice, load from file
340    ///
341    /// let trained = TrainedOrphos::new(config, training);
342    /// ```
343    pub const fn new(config: OrphosConfig, training: Training) -> Self {
344        Self {
345            config,
346            training: Some(training),
347            _state: PhantomData,
348        }
349    }
350
351    /// Finds genes in a single genome sequence using the trained model.
352    ///
353    /// Uses dynamic programming to find the optimal set of non-overlapping genes
354    /// based on the statistical model built during training. This method performs:
355    ///
356    /// 1. Node generation (start/stop codon detection)
357    /// 2. Node scoring (coding potential, RBS, start codon usage)
358    /// 3. Dynamic programming gene selection
359    /// 4. Gene quality filtering
360    /// 5. Start position refinement
361    ///
362    /// # Arguments
363    ///
364    /// * `encoded_sequence` - The genome sequence encoded in bitmap format
365    ///
366    /// # Returns
367    ///
368    /// A vector of [`Gene`] predictions sorted by position. Returns an empty
369    /// vector if no genes are found.
370    ///
371    /// # Errors
372    ///
373    /// Returns [`OrphosError::InvalidSequence`] if:
374    /// - The Orphos instance is not properly trained
375    /// - Node generation fails
376    /// - Sequence contains invalid characters
377    ///
378    /// # Examples
379    ///
380    /// ```rust,no_run
381    /// use orphos_core::engine::UntrainedOrphos;
382    /// use orphos_core::sequence::encoded::EncodedSequence;
383    ///
384    /// let mut orphos = UntrainedOrphos::new();
385    /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
386    /// let encoded = EncodedSequence::without_masking(sequence);
387    ///
388    /// let trained = orphos.train_single_genome(&encoded)?;
389    ///
390    /// // Use OrphosAnalyzer for a higher-level API to find genes
391    /// use orphos_core::OrphosAnalyzer;
392    /// use orphos_core::config::OrphosConfig;
393    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
394    /// let results = analyzer.analyze_sequence("ATGAAACGCATTAGCACCACCATT...", None)?;
395    /// for gene in &results.genes {
396    ///     println!("Gene at {}-{} (strand: {:?})",
397    ///              gene.coordinates.begin, gene.coordinates.end, gene.coordinates.strand);
398    /// }
399    /// # Ok::<(), orphos_core::types::OrphosError>(())
400    /// ```
401    fn find_genes_single(
402        &self,
403        encoded_sequence: &EncodedSequence,
404    ) -> Result<Vec<Gene>, OrphosError> {
405        let mut nodes = Vec::new();
406        let training = self
407            .training
408            .as_ref()
409            .ok_or_else(|| OrphosError::InvalidSequence("Orphos is not trained".to_string()))?;
410
411        let _num_nodes = add_nodes(
412            encoded_sequence,
413            &mut nodes,
414            self.config.closed_ends,
415            training,
416        )?;
417
418        sort_nodes_by_position(&mut nodes);
419
420        // Tap the training state right before scoring to ensure parity at inference time
421
422        score_nodes(
423            encoded_sequence,
424            &mut nodes,
425            training,
426            self.config.closed_ends,
427            false,
428        )?;
429
430        record_overlapping_starts(&mut nodes, training, true);
431
432        let gene_path = match predict_genes(&mut nodes, training, true) {
433            Some(path) => path,
434            None => {
435                // No genes found - return empty gene list
436                return Ok(vec![]);
437            }
438        };
439
440        eliminate_bad_genes(&mut nodes, Some(gene_path), training);
441
442        let genes = GeneBuilder::from_nodes(&nodes, gene_path, training, 1)
443            .with_tweaked_starts()
444            .with_annotations()
445            .build();
446
447        Ok(genes)
448    }
449}
450
451/// High-level gene finding analyzer with automatic training.
452///
453/// This struct provides a simplified interface for gene prediction that handles
454/// training automatically. It's the recommended entry point for most users.
455///
456/// Unlike the type-safe [`Orphos`] struct, `OrphosAnalyzer` manages training
457/// internally and provides convenient methods for analyzing sequences from various
458/// sources (files, strings, byte slices).
459///
460/// # Modes
461///
462/// - **Single Genome Mode** (default): Trains on each sequence individually for
463///   optimal accuracy on complete genomes
464/// - **Metagenomic Mode**: Uses pre-computed models for fragmented sequences
465///
466/// # Examples
467///
468/// ## Analyze a sequence string
469///
470/// ```rust,no_run
471/// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
472///
473/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
474///
475/// let sequence = "ATGAAACGCATTAGCACCACCATT...";
476/// let results = analyzer.analyze_sequence(sequence, Some("genome1".to_string()))?;
477///
478/// println!("Found {} genes in {} bp sequence",
479///          results.genes.len(),
480///          results.sequence_info.length);
481/// # Ok::<(), orphos_core::types::OrphosError>(())
482/// ```
483///
484/// ## Analyze a FASTA file
485///
486/// ```rust,no_run
487/// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
488///
489/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
490/// let results = analyzer.analyze_fasta_file("genome.fasta")?;
491///
492/// for result in results {
493///     println!("Sequence: {}", result.sequence_info.header);
494///     println!("  Genes: {}", result.genes.len());
495///     println!("  GC%: {:.2}", result.sequence_info.gc_content * 100.0);
496/// }
497/// # Ok::<(), orphos_core::types::OrphosError>(())
498/// ```
499///
500/// ## With custom configuration
501///
502/// ```rust,no_run
503/// use orphos_core::{OrphosAnalyzer, config::{OrphosConfig, OutputFormat}};
504///
505/// let config = OrphosConfig {
506///     closed_ends: true,
507///     mask_n_runs: true,
508///     output_format: OutputFormat::Gff,
509///     num_threads: Some(4),
510///     ..Default::default()
511/// };
512///
513/// let mut analyzer = OrphosAnalyzer::new(config);
514/// # Ok::<(), orphos_core::types::OrphosError>(())
515/// ```
516#[derive(Debug)]
517pub struct OrphosAnalyzer {
518    /// Configuration options for gene prediction
519    pub config: OrphosConfig,
520}
521
522impl OrphosAnalyzer {
523    /// Creates a new analyzer with the specified configuration.
524    ///
525    /// # Arguments
526    ///
527    /// * `config` - Configuration options for gene prediction
528    ///
529    /// # Examples
530    ///
531    /// ```rust
532    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
533    ///
534    /// let analyzer = OrphosAnalyzer::new(OrphosConfig::default());
535    /// ```
536    pub const fn new(config: OrphosConfig) -> Self {
537        Self { config }
538    }
539
540    /// Analyzes sequences from a FASTA file.
541    ///
542    /// Reads all sequences from the FASTA file and performs gene prediction on each.
543    /// In single genome mode, each sequence is trained and analyzed independently.
544    ///
545    /// # Arguments
546    ///
547    /// * `path` - Path to the FASTA file
548    ///
549    /// # Returns
550    ///
551    /// A vector of [`OrphosResults`], one for each sequence in the file.
552    ///
553    /// # Errors
554    ///
555    /// Returns [`OrphosError`] if:
556    /// - The file cannot be read
557    /// - The FASTA format is invalid
558    /// - Any sequence fails analysis
559    ///
560    /// # Examples
561    ///
562    /// ```rust,no_run
563    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
564    ///
565    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
566    /// let results = analyzer.analyze_fasta_file("genomes.fasta")?;
567    ///
568    /// for (i, result) in results.iter().enumerate() {
569    ///     println!("Sequence {}: {} genes", i + 1, result.genes.len());
570    /// }
571    /// # Ok::<(), orphos_core::types::OrphosError>(())
572    /// ```
573    pub fn analyze_fasta_file<P: AsRef<Path>>(
574        &mut self,
575        path: P,
576    ) -> Result<Vec<OrphosResults>, OrphosError> {
577        let sequences = read_fasta_sequences(path.as_ref().to_str().unwrap())?;
578
579        if self.config.metagenomic {
580            Ok(vec![])
581        } else {
582            let mut results = Vec::new();
583            for (header, description, seq_bytes) in sequences {
584                let result = self.analyze_sequence_bytes(&seq_bytes, header, description)?;
585                results.push(result);
586            }
587            Ok(results)
588        }
589    }
590
591    /// Analyzes a single sequence from a string.
592    ///
593    /// Converts the string sequence to bytes and performs gene prediction.
594    /// This is a convenience method for analyzing sequences already loaded
595    /// into memory.
596    ///
597    /// # Arguments
598    ///
599    /// * `sequence` - DNA sequence string (A, T, G, C, N)
600    /// * `header` - Optional sequence identifier (defaults to "Orphos_Seq_1")
601    ///
602    /// # Returns
603    ///
604    /// [`OrphosResults`] containing genes, training data, and sequence info.
605    ///
606    /// # Errors
607    ///
608    /// Returns [`OrphosError`] if:
609    /// - The sequence is too short (< 20,000 bp recommended)
610    /// - Training fails
611    /// - Gene prediction fails
612    ///
613    /// # Examples
614    ///
615    /// ```rust,no_run
616    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
617    ///
618    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
619    ///
620    /// let sequence = "ATGAAACGCATTAGCACCACCATT...";
621    /// let results = analyzer.analyze_sequence(sequence, Some("E. coli K12".to_string()))?;
622    ///
623    /// println!("Analyzed: {}", results.sequence_info.header);
624    /// println!("Found {} genes", results.genes.len());
625    /// println!("GC content: {:.2}%", results.sequence_info.gc_content * 100.0);
626    /// # Ok::<(), orphos_core::types::OrphosError>(())
627    /// ```
628    pub fn analyze_sequence(
629        &mut self,
630        sequence: &str,
631        header: Option<String>,
632    ) -> Result<OrphosResults, OrphosError> {
633        let seq_bytes = sequence.as_bytes();
634        let header = header.unwrap_or_else(|| "Orphos_Seq_1".to_string());
635
636        self.analyze_sequence_bytes(seq_bytes, header, None)
637    }
638
639    /// Analyzes a single sequence from raw bytes.
640    ///
641    /// This is the core analysis method used by other convenience methods.
642    /// It handles the complete workflow: encoding, training, and gene prediction.
643    ///
644    /// # Arguments
645    ///
646    /// * `sequence` - Raw DNA sequence bytes (ASCII: A, T, G, C, N)
647    /// * `header` - Sequence identifier for output
648    /// * `description` - Optional sequence description
649    ///
650    /// # Returns
651    ///
652    /// [`OrphosResults`] containing:
653    /// - Predicted genes with coordinates and scores
654    /// - Training parameters used
655    /// - Sequence statistics (length, GC content)
656    ///
657    /// # Errors
658    ///
659    /// Returns [`OrphosError`] if:
660    /// - The sequence is too short for reliable training
661    /// - Sequence encoding fails
662    /// - Training or prediction fails
663    ///
664    /// # Examples
665    ///
666    /// ```rust,no_run
667    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
668    ///
669    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
670    ///
671    /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
672    /// let results = analyzer.analyze_sequence_bytes(
673    ///     sequence,
674    ///     "sequence1".to_string(),
675    ///     Some("E. coli genome".to_string())
676    /// )?;
677    ///
678    /// println!("Found {} genes", results.genes.len());
679    /// # Ok::<(), orphos_core::types::OrphosError>(())
680    /// ```
681    pub fn analyze_sequence_bytes(
682        &mut self,
683        sequence: &[u8],
684        header: String,
685        description: Option<String>,
686    ) -> Result<OrphosResults, OrphosError> {
687        let sequence_length = sequence.len();
688        let encoded_sequence = self.encode_sequence(sequence);
689
690        let mut untrained_orphos = UntrainedOrphos::with_config(self.config.clone())?;
691        let trained_orphos = untrained_orphos.train_single_genome(&encoded_sequence)?;
692        let genes = trained_orphos.find_genes_single(&encoded_sequence)?;
693
694        Ok(OrphosResults {
695            genes: genes.clone(),
696            training_used: trained_orphos.training.unwrap_or_default(),
697            sequence_info: SequenceInfo {
698                length: sequence_length,
699                gc_content: encoded_sequence.gc_content,
700                num_genes: genes.len(),
701                header,
702                description,
703            },
704            metagenomic_model: if self.config.metagenomic {
705                Some("Best".to_string())
706            } else {
707                None
708            },
709        })
710    }
711
712    /// Encodes a sequence to bitmap format for efficient processing.
713    ///
714    /// Converts the DNA sequence into a compact binary representation that
715    /// enables fast nucleotide lookups. Optionally masks runs of N characters.
716    ///
717    /// # Arguments
718    ///
719    /// * `sequence` - Raw DNA sequence bytes
720    ///
721    /// # Returns
722    ///
723    /// An [`EncodedSequence`] with forward and reverse-complement strands
724    /// encoded in bitmap format.
725    fn encode_sequence(&self, sequence: &[u8]) -> EncodedSequence {
726        if self.config.mask_n_runs {
727            EncodedSequence::with_masking(sequence)
728        } else {
729            EncodedSequence::without_masking(sequence)
730        }
731    }
732}
733
734#[cfg(test)]
735mod tests {
736    use super::*;
737    use crate::config::OutputFormat;
738    use crate::constants::TEST_SEQUENCE_REPEAT_FACTOR;
739    use std::env;
740    use std::fs;
741
742    // Helper function to create a simple test sequence
743    fn create_test_sequence() -> Vec<u8> {
744        // Simple DNA sequence with some genes
745        "ATGAAACGTAAATAG".as_bytes().to_vec()
746    }
747
748    // Helper function to create a longer test sequence for training
749    fn create_training_sequence() -> Vec<u8> {
750        // A longer sequence suitable for training (> 20,000 bp recommended)
751        let basic_gene = "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG";
752        basic_gene
753            .repeat(TEST_SEQUENCE_REPEAT_FACTOR)
754            .as_bytes()
755            .to_vec()
756    }
757
758    // Helper function to create an EncodedSequence for testing
759    fn create_encoded_sequence_for_test(seq: &[u8]) -> EncodedSequence {
760        EncodedSequence::without_masking(seq)
761    }
762
763    #[test]
764    fn test_training_state_traits() {
765        // Test that training state markers implement required traits
766        let _untrained: Untrained = Untrained;
767        let _trained: Trained = Trained;
768
769        // Test that they can be cloned and debugged
770        let untrained_clone = _untrained.clone();
771        let trained_clone = _trained.clone();
772
773        assert_eq!(format!("{:?}", untrained_clone), "Untrained");
774        assert_eq!(format!("{:?}", trained_clone), "Trained");
775    }
776
777    #[test]
778    fn test_untrained_orphos_new() {
779        let orphos = UntrainedOrphos::new();
780
781        assert!(!orphos.config.metagenomic);
782        assert!(!orphos.config.closed_ends);
783        assert!(!orphos.config.mask_n_runs);
784        assert!(!orphos.config.force_non_sd);
785        assert!(!orphos.config.quiet);
786        assert_eq!(orphos.config.output_format, OutputFormat::Genbank);
787        assert!(orphos.training.is_none());
788    }
789
790    #[test]
791    fn test_untrained_orphos_with_config() {
792        let config = OrphosConfig {
793            metagenomic: true,
794            closed_ends: true,
795            quiet: true,
796            ..OrphosConfig::default()
797        };
798
799        let result = UntrainedOrphos::with_config(config.clone());
800        assert!(result.is_ok());
801
802        let orphos = result.unwrap();
803        assert!(orphos.config.metagenomic);
804        assert!(orphos.config.closed_ends);
805        assert!(orphos.config.quiet);
806        assert!(orphos.training.is_none());
807    }
808
809    #[test]
810    fn test_untrained_orphos_with_thread_config() {
811        let config = OrphosConfig {
812            num_threads: Some(2),
813            ..OrphosConfig::default()
814        };
815
816        let result = UntrainedOrphos::with_config(config);
817        // Thread configuration might fail depending on system
818        // Expected to potentially fail
819        let _ = result;
820    }
821
822    #[test]
823    fn test_untrained_orphos_with_invalid_thread_config() {
824        let config = OrphosConfig {
825            num_threads: Some(0),
826            ..OrphosConfig::default()
827        };
828
829        let result = UntrainedOrphos::with_config(config);
830        // Thread configuration might fail depending on system
831        // Rayon might handle it gracefully or it might fail
832        let _ = result;
833    }
834
835    #[test]
836    fn test_trained_orphos_new() {
837        let config = OrphosConfig::default();
838        let training = Training::default();
839
840        let orphos = TrainedOrphos::new(config, training);
841
842        assert!(orphos.training.is_some());
843    }
844
845    #[test]
846    fn test_train_single_genome_basic() {
847        let mut orphos = UntrainedOrphos::new();
848        let sequence = create_training_sequence();
849        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
850
851        let result = orphos.train_single_genome(&encoded_sequence);
852
853        assert!(result.is_ok());
854        let trained = result.unwrap();
855        assert!(trained.training.is_some());
856    }
857
858    #[test]
859    fn test_train_single_genome_quiet_mode() {
860        let config = OrphosConfig {
861            quiet: true,
862            ..OrphosConfig::default()
863        };
864        let mut orphos = UntrainedOrphos::with_config(config).unwrap();
865
866        let sequence = create_training_sequence();
867        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
868
869        let result = orphos.train_single_genome(&encoded_sequence);
870
871        assert!(result.is_ok());
872    }
873
874    #[test]
875    fn test_train_single_genome_force_non_sd() {
876        let config = OrphosConfig {
877            force_non_sd: true,
878            ..OrphosConfig::default()
879        };
880        let mut orphos = UntrainedOrphos::with_config(config).unwrap();
881
882        let sequence = create_training_sequence();
883        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
884
885        let result = orphos.train_single_genome(&encoded_sequence);
886
887        assert!(result.is_ok());
888        let trained = result.unwrap();
889        let training = trained.training.unwrap();
890        assert!(!training.uses_shine_dalgarno); // Should be forced to false
891    }
892
893    #[test]
894    fn test_trained_orphos_find_genes_single() {
895        // First create a trained orphos
896        let mut orphos = UntrainedOrphos::new();
897        let sequence = create_training_sequence();
898        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
899
900        let trained = orphos.train_single_genome(&encoded_sequence).unwrap();
901
902        // Now test gene finding
903        let test_seq = create_test_sequence();
904        let test_encoded_sequence = create_encoded_sequence_for_test(&test_seq);
905
906        let result = trained.find_genes_single(&test_encoded_sequence);
907
908        assert!(result.is_ok());
909        let _genes = result.unwrap();
910        // Should find genes in the sequence - number could be 0 for very short sequences
911    }
912
913    #[test]
914    fn test_trained_orphos_find_genes_without_training() {
915        let config = OrphosConfig::default();
916        let orphos = TrainedOrphos {
917            config,
918            training: None,
919            _state: PhantomData,
920        };
921
922        let test_seq = create_test_sequence();
923        let test_encoded_sequence = create_encoded_sequence_for_test(&test_seq);
924
925        let result = orphos.find_genes_single(&test_encoded_sequence);
926
927        assert!(result.is_err());
928        if let Err(OrphosError::InvalidSequence(msg)) = result {
929            assert!(msg.contains("not trained"));
930        } else {
931            panic!("Expected InvalidSequence error");
932        }
933    }
934
935    #[test]
936    fn test_orphos_analyzer_new() {
937        let config = OrphosConfig::default();
938        let analyzer = OrphosAnalyzer::new(config.clone());
939
940        assert_eq!(analyzer.config.metagenomic, config.metagenomic);
941        assert_eq!(analyzer.config.closed_ends, config.closed_ends);
942    }
943
944    #[test]
945    fn test_analyze_sequence_basic() {
946        let config = OrphosConfig::default();
947        let mut analyzer = OrphosAnalyzer::new(config);
948
949        let sequence =
950            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
951
952        let result = analyzer.analyze_sequence(&sequence, None);
953        assert!(result.is_ok());
954
955        let analysis = result.unwrap();
956        assert_eq!(analysis.sequence_info.header, "Orphos_Seq_1");
957        assert_eq!(analysis.sequence_info.length, sequence.len());
958        assert!(
959            analysis.sequence_info.gc_content >= 0.0 && analysis.sequence_info.gc_content <= 1.0
960        );
961        assert!(analysis.metagenomic_model.is_none());
962    }
963
964    #[test]
965    fn test_analyze_sequence_with_header() {
966        let config = OrphosConfig::default();
967        let mut analyzer = OrphosAnalyzer::new(config);
968
969        let sequence =
970            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
971        let header = Some("test_sequence".to_string());
972
973        let result = analyzer.analyze_sequence(&sequence, header);
974        assert!(result.is_ok());
975
976        let analysis = result.unwrap();
977        assert_eq!(analysis.sequence_info.header, "test_sequence");
978    }
979
980    #[test]
981    fn test_analyze_sequence_bytes() {
982        let config = OrphosConfig::default();
983        let mut analyzer = OrphosAnalyzer::new(config);
984
985        let sequence =
986            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
987        let seq_bytes = sequence.as_bytes();
988        let header = "test_sequence".to_string();
989        let description = Some("Test description".to_string());
990
991        let result =
992            analyzer.analyze_sequence_bytes(seq_bytes, header.clone(), description.clone());
993        assert!(result.is_ok());
994
995        let analysis = result.unwrap();
996        assert_eq!(analysis.sequence_info.header, header);
997        assert_eq!(analysis.sequence_info.description, description);
998        assert_eq!(analysis.sequence_info.length, seq_bytes.len());
999    }
1000
1001    #[test]
1002    fn test_analyze_sequence_metagenomic_config() {
1003        let config = OrphosConfig {
1004            metagenomic: true,
1005            ..OrphosConfig::default()
1006        };
1007        let mut analyzer = OrphosAnalyzer::new(config);
1008
1009        let sequence =
1010            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1011
1012        let result = analyzer.analyze_sequence(&sequence, None);
1013        assert!(result.is_ok());
1014
1015        let analysis = result.unwrap();
1016        assert!(analysis.metagenomic_model.is_some());
1017        assert_eq!(analysis.metagenomic_model.unwrap(), "Best");
1018    }
1019
1020    // #[test]
1021    // fn test_encode_sequence_basic() {
1022    //     let config = OrphosConfig::default();
1023    //     let analyzer = OrphosAnalyzer::new(config);
1024
1025    //     let sequence = b"ATCG";
1026    //     let mut encoded = vec![0u8; (sequence.len() * 2).div_ceil(8)];
1027    //     let mut unknown = vec![0u8; sequence.len().div_ceil(8)];
1028    //     let mut masks = Vec::new();
1029
1030    //     let result = analyzer.encode_sequence(sequence, &mut encoded, &mut unknown, &mut masks);
1031    //     assert!(result.is_ok());
1032
1033    //     let gc_content = result.unwrap();
1034    //     assert!((0.0..=1.0).contains(&gc_content));
1035    // }
1036
1037    // #[test]
1038    // fn test_encode_sequence_with_mask_n_runs() {
1039    //     let config = OrphosConfig {
1040    //         mask_n_runs: true,
1041    //         ..OrphosConfig::default()
1042    //     };
1043    //     let analyzer = OrphosAnalyzer::new(config);
1044
1045    //     let sequence = b"ATCGNNNNGCAT";
1046    //     let mut encoded = vec![0u8; (sequence.len() * 2).div_ceil(8)];
1047    //     let mut unknown = vec![0u8; sequence.len().div_ceil(8)];
1048    //     let mut masks = Vec::new();
1049
1050    //     let result = analyzer.encode_sequence(sequence, &mut encoded, &mut unknown, &mut masks);
1051    //     assert!(result.is_ok());
1052
1053    //     // N-run masking behavior depends on implementation - might not always create masks
1054    //     // So we just check that it doesn't crash
1055    // }
1056
1057    #[test]
1058    fn test_analyze_fasta_file_not_found() {
1059        let config = OrphosConfig::default();
1060        let mut analyzer = OrphosAnalyzer::new(config);
1061
1062        let result = analyzer.analyze_fasta_file("nonexistent_file.fa");
1063        assert!(result.is_err());
1064    }
1065
1066    #[test]
1067    fn test_analyze_fasta_file_metagenomic() {
1068        let config = OrphosConfig {
1069            metagenomic: true,
1070            ..OrphosConfig::default()
1071        };
1072        let mut analyzer = OrphosAnalyzer::new(config);
1073
1074        // Create a temporary FASTA file
1075        let fasta_content = ">test_seq\nATCG\n";
1076        let temp_dir = env::temp_dir();
1077        let temp_file = temp_dir.join("test_metagenomic.fa");
1078        fs::write(&temp_file, fasta_content).unwrap();
1079
1080        let result = analyzer.analyze_fasta_file(&temp_file);
1081        assert!(result.is_ok());
1082
1083        let results = result.unwrap();
1084        assert!(results.is_empty()); // Metagenomic mode returns empty for now
1085
1086        let _ = fs::remove_file(temp_file);
1087    }
1088
1089    #[test]
1090    fn test_analyze_fasta_file_single_genome() {
1091        let config = OrphosConfig::default();
1092        let mut analyzer = OrphosAnalyzer::new(config);
1093
1094        // Create a temporary FASTA file with a longer sequence for training
1095        let sequence =
1096            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1097        let fasta_content = format!(">test_seq\n{}\n", sequence);
1098        let temp_dir = env::temp_dir();
1099        let temp_file = temp_dir.join("test_single_genome.fa");
1100        fs::write(&temp_file, fasta_content).unwrap();
1101
1102        let result = analyzer.analyze_fasta_file(&temp_file);
1103        assert!(result.is_ok());
1104
1105        let results = result.unwrap();
1106        assert_eq!(results.len(), 1);
1107        assert_eq!(results[0].sequence_info.header, "test_seq");
1108
1109        let _ = fs::remove_file(temp_file);
1110    }
1111
1112    #[test]
1113    fn test_analyze_empty_sequence() {
1114        let config = OrphosConfig::default();
1115        let mut analyzer = OrphosAnalyzer::new(config);
1116
1117        let sequence = "";
1118        let result = analyzer.analyze_sequence(sequence, None);
1119
1120        assert!(result.is_err());
1121        if let Err(e) = result {
1122            // Should be an InvalidSequence error about being too short
1123            match e {
1124                OrphosError::InvalidSequence(msg) => {
1125                    assert!(msg.contains("too short"));
1126                }
1127                _ => panic!("Expected InvalidSequence error for empty sequence"),
1128            }
1129        }
1130    }
1131
1132    #[test]
1133    fn test_analyze_very_short_sequence() {
1134        let config = OrphosConfig::default();
1135        let mut analyzer = OrphosAnalyzer::new(config);
1136
1137        let sequence = "ATG"; // Very short sequence (3 bp)
1138        let result = analyzer.analyze_sequence(sequence, None);
1139
1140        assert!(result.is_err());
1141        if let Err(e) = result {
1142            // Should be an InvalidSequence error about being too short
1143            match e {
1144                OrphosError::InvalidSequence(msg) => {
1145                    assert!(msg.contains("too short"));
1146                }
1147                _ => panic!("Expected InvalidSequence error for very short sequence"),
1148            }
1149        }
1150    }
1151
1152    #[test]
1153    fn test_config_cloning() {
1154        let config1 = OrphosConfig::default();
1155        let orphos1 = UntrainedOrphos::with_config(config1.clone()).unwrap();
1156
1157        let config2 = orphos1.config.clone();
1158        let _orphos2 = UntrainedOrphos::with_config(config2).unwrap();
1159    }
1160
1161    #[test]
1162    fn test_debug_formatting() {
1163        let orphos = UntrainedOrphos::new();
1164        let debug_str = format!("{:?}", orphos);
1165        assert!(debug_str.contains("Orphos"));
1166        assert!(debug_str.contains("config"));
1167
1168        let analyzer = OrphosAnalyzer::new(OrphosConfig::default());
1169        let debug_str2 = format!("{:?}", analyzer);
1170        assert!(debug_str2.contains("OrphosAnalyzer"));
1171        assert!(debug_str2.contains("config"));
1172    }
1173
1174    #[test]
1175    fn test_type_aliases() {
1176        // Test that type aliases work correctly
1177        let _untrained: UntrainedOrphos = UntrainedOrphos::new();
1178        let _trained: TrainedOrphos =
1179            TrainedOrphos::new(OrphosConfig::default(), Training::default());
1180
1181        assert_eq!(
1182            std::any::type_name::<UntrainedOrphos>(),
1183            std::any::type_name::<Orphos<Untrained>>()
1184        );
1185        assert_eq!(
1186            std::any::type_name::<TrainedOrphos>(),
1187            std::any::type_name::<Orphos<Trained>>()
1188        );
1189    }
1190
1191    #[test]
1192    fn test_training_state_phantom_data() {
1193        let untrained = UntrainedOrphos::new();
1194        let trained = TrainedOrphos::new(OrphosConfig::default(), Training::default());
1195
1196        assert_eq!(std::mem::size_of_val(&untrained._state), 0);
1197        assert_eq!(std::mem::size_of_val(&trained._state), 0);
1198    }
1199
1200    #[test]
1201    fn test_analyzer_multiple_sequences() {
1202        let config = OrphosConfig::default();
1203        let mut analyzer = OrphosAnalyzer::new(config);
1204
1205        let sequence1 =
1206            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(150);
1207        let sequence2 =
1208            "ATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATAG".repeat(200);
1209
1210        let result1 = analyzer.analyze_sequence(&sequence1, Some("seq1".to_string()));
1211        let result2 = analyzer.analyze_sequence(&sequence2, Some("seq2".to_string()));
1212
1213        assert!(result1.is_ok());
1214        assert!(result2.is_ok());
1215
1216        let analysis1 = result1.unwrap();
1217        let analysis2 = result2.unwrap();
1218
1219        assert_eq!(analysis1.sequence_info.header, "seq1");
1220        assert_eq!(analysis2.sequence_info.header, "seq2");
1221        assert_ne!(
1222            analysis1.sequence_info.length,
1223            analysis2.sequence_info.length
1224        );
1225    }
1226
1227    #[test]
1228    fn test_error_handling_edge_cases() {
1229        let config = OrphosConfig::default();
1230        let mut analyzer = OrphosAnalyzer::new(config);
1231
1232        // Test with sequence containing invalid characters
1233        let invalid_sequence = "ATCGXYZ123";
1234        let result = analyzer.analyze_sequence(invalid_sequence, None);
1235
1236        // Should either handle gracefully or return appropriate error
1237        let _ = result;
1238    }
1239}