Skip to main content

orphos_core/
engine.rs

1use std::marker::PhantomData;
2use std::path::Path;
3
4use crate::algorithms::dynamic_programming::eliminate_bad_genes;
5use crate::algorithms::dynamic_programming::predict_genes;
6use crate::algorithms::gene_finding::GeneBuilder;
7use crate::config::OrphosConfig;
8use crate::constants::MIN_SEQUENCE_LENGTH;
9use crate::metagenomic::bins;
10use crate::metagenomic::get_preset_training_ref;
11use crate::node::{
12    add_nodes, calculate_dicodon_gene, raw_coding_score, rbs_score, record_gc_bias,
13    record_overlapping_starts, reset_node_scores, score_nodes, sort_nodes_by_position,
14};
15use crate::results::{OrphosResults, SequenceInfo};
16use crate::sequence::calc_most_gc_frame;
17use crate::sequence::encoded::EncodedSequence;
18use crate::sequence::read_fasta_sequences;
19use crate::training::non_sd_training::train_starts_nonsd;
20use crate::training::sd_training::train_starts_sd;
21use crate::training::should_use_sd;
22use crate::types::Gene;
23use crate::types::{CodonType, Node, OrphosError, Training};
24use bio::bio_types::strand::Strand;
25
26/// Marker trait for Orphos training state.
27///
28/// This trait is used in the type-state pattern to enforce that training
29/// must be performed before gene prediction. It's implemented by both
30/// [`Untrained`] and [`Trained`] marker types.
31pub trait TrainingState {}
32
33/// Marker type indicating an untrained Orphos instance.
34///
35/// A [`Orphos<Untrained>`] instance can perform training operations
36/// but cannot find genes until training is complete.
37#[derive(Debug, Clone)]
38pub struct Untrained;
39
40/// Marker type indicating a trained Orphos instance.
41///
42/// A [`Orphos<Trained>`] instance has completed training and can
43/// perform gene prediction operations.
44#[derive(Debug, Clone)]
45pub struct Trained;
46
47impl TrainingState for Untrained {}
48impl TrainingState for Trained {}
49
50/// Main Orphos configuration and execution engine.
51///
52/// This struct uses the type-state pattern with the `S` type parameter
53/// to ensure training is performed before gene prediction. The state
54/// transitions from [`Untrained`] to [`Trained`] via the training methods.
55///
56/// # Type Parameters
57///
58/// * `S` - The training state, either [`Untrained`] or [`Trained`]
59///
60/// # Examples
61///
62/// ```rust,no_run
63/// use orphos_core::engine::UntrainedOrphos;
64/// use orphos_core::config::OrphosConfig;
65/// use orphos_core::sequence::encoded::EncodedSequence;
66///
67/// // Create an untrained instance
68/// let mut orphos = UntrainedOrphos::new();
69///
70/// // Encode a sequence
71/// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
72/// let encoded = EncodedSequence::without_masking(sequence);
73///
74/// // Train on the sequence and get results
75/// let trained = orphos.train_single_genome(&encoded)?;
76///
77/// // Use the higher-level API to analyze sequences
78/// use orphos_core::OrphosAnalyzer;
79/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
80/// let results = analyzer.analyze_sequence("ATGAAACGCATTAGCACCACCATT...", None)?;
81/// println!("Found {} genes", results.genes.len());
82/// # Ok::<(), orphos_core::types::OrphosError>(())
83/// ```
84#[derive(Debug, Default)]
85pub struct Orphos<S: TrainingState> {
86    /// Configuration options for gene prediction
87    pub config: OrphosConfig,
88    /// Training data obtained from the genome
89    training: Option<Training>,
90    /// Type-state marker (zero-sized)
91    _state: PhantomData<S>,
92}
93
94/// Type alias for an untrained Orphos instance.
95///
96/// Use this when you need to perform training on a new genome.
97pub type UntrainedOrphos = Orphos<Untrained>;
98
99/// Type alias for a trained Orphos instance.
100///
101/// Use this when you have already trained on a genome and want to find genes.
102pub type TrainedOrphos = Orphos<Trained>;
103
104impl UntrainedOrphos {
105    /// Creates a new untrained Orphos instance with default configuration.
106    ///
107    /// # Examples
108    ///
109    /// ```rust
110    /// use orphos_core::engine::UntrainedOrphos;
111    ///
112    /// let orphos = UntrainedOrphos::new();
113    /// ```
114    pub fn new() -> Self {
115        Self {
116            config: OrphosConfig::default(),
117            training: None,
118            _state: PhantomData,
119        }
120    }
121
122    /// Creates a new untrained Orphos instance with custom configuration.
123    ///
124    /// # Arguments
125    ///
126    /// * `config` - Configuration options for gene prediction
127    ///
128    /// # Errors
129    ///
130    /// Returns [`OrphosError`] if thread pool configuration fails.
131    ///
132    /// # Examples
133    ///
134    /// ```rust
135    /// use orphos_core::engine::UntrainedOrphos;
136    /// use orphos_core::config::{OrphosConfig, OutputFormat};
137    ///
138    /// let config = OrphosConfig {
139    ///     closed_ends: true,
140    ///     output_format: OutputFormat::Gff,
141    ///     ..Default::default()
142    /// };
143    ///
144    /// let orphos = UntrainedOrphos::with_config(config)?;
145    /// # Ok::<(), orphos_core::types::OrphosError>(())
146    /// ```
147    pub fn with_config(config: OrphosConfig) -> Result<Self, OrphosError> {
148        if config.circular && config.closed_ends {
149            return Err(OrphosError::InvalidSequence(
150                "Invalid configuration: circular and closed_ends cannot both be true".to_string(),
151            ));
152        }
153
154        let orphos = Self {
155            config,
156            training: None,
157            _state: PhantomData,
158        };
159
160        if let Some(num_threads) = orphos.config.num_threads {
161            rayon::ThreadPoolBuilder::new()
162                .num_threads(num_threads)
163                .build_global()
164                .map_err(|e| {
165                    OrphosError::InvalidSequence(format!("Failed to configure thread pool: {}", e))
166                })?;
167        }
168
169        Ok(orphos)
170    }
171
172    /// Trains the model on a single complete genome sequence.
173    ///
174    /// This method analyzes the sequence to build a statistical model of gene
175    /// characteristics including:
176    /// - Start codon usage (ATG, GTG, TTG)
177    /// - Ribosome binding site motifs
178    /// - Codon usage patterns
179    /// - GC content bias
180    ///
181    /// # Arguments
182    ///
183    /// * `encoded_sequence` - The genome sequence encoded in bitmap format
184    ///
185    /// # Returns
186    ///
187    /// A [`TrainedOrphos`] instance ready for gene prediction.
188    ///
189    /// # Errors
190    ///
191    /// Returns [`OrphosError::InvalidSequence`] if:
192    /// - The sequence is shorter than [`MIN_SEQUENCE_LENGTH`]
193    /// - The sequence contains invalid characters
194    /// - Training fails to converge
195    ///
196    /// # Examples
197    ///
198    /// ```rust,no_run
199    /// use orphos_core::engine::UntrainedOrphos;
200    /// use orphos_core::sequence::encoded::EncodedSequence;
201    ///
202    /// let mut orphos = UntrainedOrphos::new();
203    /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
204    /// let encoded = EncodedSequence::without_masking(sequence);
205    ///
206    /// let trained = orphos.train_single_genome(&encoded)?;
207    /// # Ok::<(), orphos_core::types::OrphosError>(())
208    /// ```
209    pub fn train_single_genome(
210        &mut self,
211        encoded_sequence: &EncodedSequence,
212    ) -> Result<TrainedOrphos, OrphosError> {
213        let sequence_length = encoded_sequence.sequence_length;
214        if sequence_length < MIN_SEQUENCE_LENGTH {
215            return Err(OrphosError::InvalidSequence(format!(
216                "Sequence too short for gene prediction: {} bp (minimum {} bp required)",
217                sequence_length, MIN_SEQUENCE_LENGTH
218            )));
219        }
220
221        if !self.config.quiet {
222            eprintln!(
223                "Training on single genome ({} bp, {:.2}% GC)...",
224                sequence_length,
225                encoded_sequence.gc_content * 100.0
226            );
227        }
228        let mut training = Training {
229            gc_content: 0.0,
230            ..Default::default()
231        };
232        training.uses_shine_dalgarno = false;
233        training.gc_bias_factors = [0.0; 3];
234
235        training.gc_content = encoded_sequence.gc_content;
236
237        let mut nodes = Vec::new();
238        let num_nodes = add_nodes(
239            encoded_sequence,
240            &mut nodes,
241            self.config.closed_ends,
242            self.config.circular,
243            &training,
244        )?;
245
246        if !self.config.quiet {
247            eprintln!(
248                "Located {} potential start/stop nodes, closed {}",
249                num_nodes, self.config.closed_ends
250            );
251        }
252
253        sort_nodes_by_position(&mut nodes);
254
255        let gc_frame = calc_most_gc_frame(&encoded_sequence.forward_sequence, sequence_length);
256
257        record_gc_bias(&gc_frame, &mut nodes, &mut training);
258
259        if !self.config.quiet {
260            eprintln!(
261                "Frame bias scores: {:.8} {:.8} {:.8}",
262                training.gc_bias_factors[0],
263                training.gc_bias_factors[1],
264                training.gc_bias_factors[2]
265            );
266        }
267
268        record_overlapping_starts(&mut nodes, &training, false);
269
270        let initial_path = predict_genes(&mut nodes, &training, false).unwrap_or(0);
271        calculate_dicodon_gene(
272            &mut training,
273            &encoded_sequence.forward_sequence,
274            &encoded_sequence.reverse_complement_sequence,
275            sequence_length,
276            &nodes,
277            initial_path,
278        );
279
280        raw_coding_score(
281            &encoded_sequence.forward_sequence,
282            &encoded_sequence.reverse_complement_sequence,
283            sequence_length,
284            &mut nodes,
285            &training,
286        );
287
288        rbs_score(
289            &encoded_sequence.forward_sequence,
290            &encoded_sequence.reverse_complement_sequence,
291            sequence_length,
292            &mut nodes,
293            &training,
294        );
295
296        train_starts_sd(
297            &encoded_sequence.forward_sequence,
298            &encoded_sequence.reverse_complement_sequence,
299            sequence_length,
300            &nodes,
301            &mut training,
302        );
303        training.uses_shine_dalgarno = should_use_sd(&training);
304        if self.config.force_non_sd {
305            training.uses_shine_dalgarno = false;
306        }
307
308        if !training.uses_shine_dalgarno {
309            train_starts_nonsd(
310                &encoded_sequence.forward_sequence,
311                &encoded_sequence.reverse_complement_sequence,
312                sequence_length,
313                &mut nodes,
314                &mut training,
315            );
316        }
317
318        if !self.config.quiet {
319            eprintln!("Training complete!");
320        }
321
322        Ok(Orphos {
323            config: self.config.clone(),
324            training: Some(training),
325            _state: PhantomData,
326        })
327    }
328
329    pub fn train_meta_genome(
330        &mut self,
331        _encoded_sequence: &EncodedSequence,
332    ) -> Result<TrainedOrphos, OrphosError> {
333        if !self.config.quiet {
334            eprintln!("Request: Metagenomic, Phase: Training");
335            eprintln!("Initializing training files...");
336        }
337        // initialize metagenomic bins;
338        if !self.config.quiet {
339            eprintln!("Metagenomic training initialized.");
340        }
341        Ok(Orphos {
342            config: self.config.clone(),
343            training: None,
344            _state: PhantomData,
345        })
346    }
347}
348
349impl TrainedOrphos {
350    /// Creates a new trained Orphos instance with pre-computed training data.
351    ///
352    /// This is useful when you have previously computed training data that you
353    /// want to reuse for gene prediction on multiple sequences.
354    ///
355    /// # Arguments
356    ///
357    /// * `config` - Configuration options for gene prediction
358    /// * `training` - Pre-computed training data
359    ///
360    /// # Examples
361    ///
362    /// ```rust,no_run
363    /// use orphos_core::engine::TrainedOrphos;
364    /// use orphos_core::config::OrphosConfig;
365    /// use orphos_core::types::Training;
366    ///
367    /// let config = OrphosConfig::default();
368    /// let training = Training::default(); // In practice, load from file
369    ///
370    /// let trained = TrainedOrphos::new(config, training);
371    /// ```
372    pub const fn new(config: OrphosConfig, training: Training) -> Self {
373        Self {
374            config,
375            training: Some(training),
376            _state: PhantomData,
377        }
378    }
379
380    /// Finds genes in a single genome sequence using the trained model.
381    ///
382    /// Uses dynamic programming to find the optimal set of non-overlapping genes
383    /// based on the statistical model built during training. This method performs:
384    ///
385    /// 1. Node generation (start/stop codon detection)
386    /// 2. Node scoring (coding potential, RBS, start codon usage)
387    /// 3. Dynamic programming gene selection
388    /// 4. Gene quality filtering
389    /// 5. Start position refinement
390    ///
391    /// # Arguments
392    ///
393    /// * `encoded_sequence` - The genome sequence encoded in bitmap format
394    ///
395    /// # Returns
396    ///
397    /// A vector of [`Gene`] predictions sorted by position. Returns an empty
398    /// vector if no genes are found.
399    ///
400    /// # Errors
401    ///
402    /// Returns [`OrphosError::InvalidSequence`] if:
403    /// - The Orphos instance is not properly trained
404    /// - Node generation fails
405    /// - Sequence contains invalid characters
406    ///
407    /// # Examples
408    ///
409    /// ```rust,no_run
410    /// use orphos_core::engine::UntrainedOrphos;
411    /// use orphos_core::sequence::encoded::EncodedSequence;
412    ///
413    /// let mut orphos = UntrainedOrphos::new();
414    /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
415    /// let encoded = EncodedSequence::without_masking(sequence);
416    ///
417    /// let trained = orphos.train_single_genome(&encoded)?;
418    ///
419    /// // Use OrphosAnalyzer for a higher-level API to find genes
420    /// use orphos_core::OrphosAnalyzer;
421    /// use orphos_core::config::OrphosConfig;
422    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
423    /// let results = analyzer.analyze_sequence("ATGAAACGCATTAGCACCACCATT...", None)?;
424    /// for gene in &results.genes {
425    ///     println!("Gene at {}-{} (strand: {:?})",
426    ///              gene.coordinates.begin, gene.coordinates.end, gene.coordinates.strand);
427    /// }
428    /// # Ok::<(), orphos_core::types::OrphosError>(())
429    /// ```
430    fn find_genes_single(
431        &self,
432        encoded_sequence: &EncodedSequence,
433    ) -> Result<Vec<Gene>, OrphosError> {
434        let mut nodes = Vec::new();
435        let training = self
436            .training
437            .as_ref()
438            .ok_or_else(|| OrphosError::InvalidSequence("Orphos is not trained".to_string()))?;
439
440        let _num_nodes = add_nodes(
441            encoded_sequence,
442            &mut nodes,
443            self.config.closed_ends,
444            self.config.circular,
445            training,
446        )?;
447
448        sort_nodes_by_position(&mut nodes);
449
450        // Tap the training state right before scoring to ensure parity at inference time
451
452        score_nodes(
453            encoded_sequence,
454            &mut nodes,
455            training,
456            self.config.closed_ends,
457            false,
458        )?;
459
460        record_overlapping_starts(&mut nodes, training, true);
461
462        let gene_path = match predict_genes(&mut nodes, training, true) {
463            Some(path) => path,
464            None => {
465                // No genes found - return empty gene list
466                return Ok(vec![]);
467            }
468        };
469
470        eliminate_bad_genes(&mut nodes, Some(gene_path), training);
471
472        let genes = GeneBuilder::from_nodes(&nodes, gene_path, training, 1)
473            .with_tweaked_starts()
474            .with_annotations()
475            .build();
476
477        Ok(genes)
478    }
479
480    fn find_genes_meta(
481        &self,
482        encoded_sequence: &EncodedSequence,
483    ) -> Result<(Vec<Gene>, usize), OrphosError> {
484        if !self.config.quiet {
485            eprintln!("Request: Metagenomic, Phase: Gene Finding");
486        }
487        let mut low = 0.88495 * encoded_sequence.gc_content - 0.0102337;
488        if low > 0.65 {
489            low = 0.65;
490        }
491        let mut high = 0.86596 * encoded_sequence.gc_content + 0.1131991;
492        if high < 0.35 {
493            high = 0.35;
494        }
495        let mut max_score = -100.0;
496        let mut max_phase = 0;
497
498        let mut nodes = Vec::new();
499        let mut genes = Vec::new();
500
501        for (i, _bin) in bins().iter().enumerate() {
502            let preset_training = get_preset_training_ref(i).unwrap();
503            if i == 0
504                || preset_training.translation_table
505                    != get_preset_training_ref(i - 1).unwrap().translation_table
506            {
507                let _num_nodes = add_nodes(
508                    encoded_sequence,
509                    &mut nodes,
510                    self.config.closed_ends,
511                    self.config.circular,
512                    preset_training,
513                )?;
514                sort_nodes_by_position(&mut nodes);
515            }
516            if preset_training.gc_content < low || preset_training.gc_content > high {
517                continue;
518            }
519            reset_node_scores(&mut nodes);
520            score_nodes(
521                encoded_sequence,
522                &mut nodes,
523                preset_training,
524                self.config.closed_ends,
525                true,
526            )?;
527            record_overlapping_starts(&mut nodes, preset_training, true);
528            let gene_path = predict_genes(&mut nodes, preset_training, true);
529            if let Some(path) = gene_path
530                && let Some(node) = nodes.get(path)
531                && node.scores.total_score > max_score
532            {
533                max_phase = i;
534                max_score = node.scores.total_score;
535                eliminate_bad_genes(&mut nodes, gene_path, preset_training);
536                genes = GeneBuilder::from_nodes(&nodes, path, preset_training, 1)
537                    .with_tweaked_starts()
538                    .with_annotations()
539                    .build();
540            }
541            // Recover the nodes for the best of the runs.
542        }
543        nodes.clear();
544        let best_training = get_preset_training_ref(max_phase).unwrap();
545        let _ = add_nodes(
546            encoded_sequence,
547            &mut nodes,
548            self.config.closed_ends,
549            self.config.circular,
550            best_training,
551        )?;
552        sort_nodes_by_position(&mut nodes);
553        score_nodes(
554            encoded_sequence,
555            &mut nodes,
556            best_training,
557            self.config.closed_ends,
558            true,
559        )?;
560        // Update display_score from re-scored nodes for GFF column 6
561        // This matches Prodigal's behavior where column 6 uses re-scored values
562        // but the attribute scores stay from the original scoring during the loop
563        update_display_scores(&mut genes, &nodes);
564        Ok((genes, max_phase))
565    }
566}
567
568/// Update gene display_score from re-scored nodes for GFF column 6 output.
569/// Matches Prodigal's behavior where column 6 uses re-scored values.
570fn update_display_scores(genes: &mut [Gene], nodes: &[Node]) {
571    for gene in genes.iter_mut() {
572        // Find the start node by matching position and strand
573        let target_pos = if gene.coordinates.strand == Strand::Forward {
574            gene.coordinates.begin.saturating_sub(1)
575        } else {
576            gene.coordinates.end.saturating_sub(1)
577        };
578
579        if let Some(start_node) = nodes.iter().find(|n| {
580            n.position.index == target_pos
581                && n.position.strand == gene.coordinates.strand
582                && n.position.codon_type != CodonType::Stop
583        }) {
584            gene.display_score =
585                Some(start_node.scores.coding_score + start_node.scores.start_score);
586        }
587    }
588}
589
590/// High-level gene finding analyzer with automatic training.
591///
592/// This struct provides a simplified interface for gene prediction that handles
593/// training automatically. It's the recommended entry point for most users.
594///
595/// Unlike the type-safe [`Orphos`] struct, `OrphosAnalyzer` manages training
596/// internally and provides convenient methods for analyzing sequences from various
597/// sources (files, strings, byte slices).
598///
599/// # Modes
600///
601/// - **Single Genome Mode** (default): Trains on each sequence individually for
602///   optimal accuracy on complete genomes
603/// - **Metagenomic Mode**: Uses pre-computed models for fragmented sequences
604///
605/// # Examples
606///
607/// ## Analyze a sequence string
608///
609/// ```rust,no_run
610/// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
611///
612/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
613///
614/// let sequence = "ATGAAACGCATTAGCACCACCATT...";
615/// let results = analyzer.analyze_sequence(sequence, Some("genome1".to_string()))?;
616///
617/// println!("Found {} genes in {} bp sequence",
618///          results.genes.len(),
619///          results.sequence_info.length);
620/// # Ok::<(), orphos_core::types::OrphosError>(())
621/// ```
622///
623/// ## Analyze a FASTA file
624///
625/// ```rust,no_run
626/// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
627///
628/// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
629/// let results = analyzer.analyze_fasta_file("genome.fasta")?;
630///
631/// for result in results {
632///     println!("Sequence: {}", result.sequence_info.header);
633///     println!("  Genes: {}", result.genes.len());
634///     println!("  GC%: {:.2}", result.sequence_info.gc_content * 100.0);
635/// }
636/// # Ok::<(), orphos_core::types::OrphosError>(())
637/// ```
638///
639/// ## With custom configuration
640///
641/// ```rust,no_run
642/// use orphos_core::{OrphosAnalyzer, config::{OrphosConfig, OutputFormat}};
643///
644/// let config = OrphosConfig {
645///     closed_ends: true,
646///     mask_n_runs: true,
647///     output_format: OutputFormat::Gff,
648///     num_threads: Some(4),
649///     ..Default::default()
650/// };
651///
652/// let mut analyzer = OrphosAnalyzer::new(config);
653/// # Ok::<(), orphos_core::types::OrphosError>(())
654/// ```
655#[derive(Debug)]
656pub struct OrphosAnalyzer {
657    /// Configuration options for gene prediction
658    pub config: OrphosConfig,
659}
660
661impl OrphosAnalyzer {
662    /// Creates a new analyzer with the specified configuration.
663    ///
664    /// # Arguments
665    ///
666    /// * `config` - Configuration options for gene prediction
667    ///
668    /// # Examples
669    ///
670    /// ```rust
671    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
672    ///
673    /// let analyzer = OrphosAnalyzer::new(OrphosConfig::default());
674    /// ```
675    pub const fn new(config: OrphosConfig) -> Self {
676        Self { config }
677    }
678
679    /// Analyzes sequences from a FASTA file.
680    ///
681    /// Reads all sequences from the FASTA file and performs gene prediction on each.
682    /// In single genome mode, each sequence is trained and analyzed independently.
683    ///
684    /// # Arguments
685    ///
686    /// * `path` - Path to the FASTA file
687    ///
688    /// # Returns
689    ///
690    /// A vector of [`OrphosResults`], one for each sequence in the file.
691    ///
692    /// # Errors
693    ///
694    /// Returns [`OrphosError`] if:
695    /// - The file cannot be read
696    /// - The FASTA format is invalid
697    /// - Any sequence fails analysis
698    ///
699    /// # Examples
700    ///
701    /// ```rust,no_run
702    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
703    ///
704    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
705    /// let results = analyzer.analyze_fasta_file("genomes.fasta")?;
706    ///
707    /// for (i, result) in results.iter().enumerate() {
708    ///     println!("Sequence {}: {} genes", i + 1, result.genes.len());
709    /// }
710    /// # Ok::<(), orphos_core::types::OrphosError>(())
711    /// ```
712    pub fn analyze_fasta_file<P: AsRef<Path>>(
713        &mut self,
714        path: P,
715    ) -> Result<Vec<OrphosResults>, OrphosError> {
716        let sequences = read_fasta_sequences(path.as_ref().to_str().unwrap())?;
717
718        let mut results = Vec::new();
719        for (header, description, seq_bytes) in sequences {
720            let result = self.analyze_sequence_bytes(&seq_bytes, header, description)?;
721            results.push(result);
722        }
723        Ok(results)
724    }
725
726    /// Analyzes a single sequence from a string.
727    ///
728    /// Converts the string sequence to bytes and performs gene prediction.
729    /// This is a convenience method for analyzing sequences already loaded
730    /// into memory.
731    ///
732    /// # Arguments
733    ///
734    /// * `sequence` - DNA sequence string (A, T, G, C, N)
735    /// * `header` - Optional sequence identifier (defaults to "Orphos_Seq_1")
736    ///
737    /// # Returns
738    ///
739    /// [`OrphosResults`] containing genes, training data, and sequence info.
740    ///
741    /// # Errors
742    ///
743    /// Returns [`OrphosError`] if:
744    /// - The sequence is too short (< 20,000 bp recommended)
745    /// - Training fails
746    /// - Gene prediction fails
747    ///
748    /// # Examples
749    ///
750    /// ```rust,no_run
751    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
752    ///
753    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
754    ///
755    /// let sequence = "ATGAAACGCATTAGCACCACCATT...";
756    /// let results = analyzer.analyze_sequence(sequence, Some("E. coli K12".to_string()))?;
757    ///
758    /// println!("Analyzed: {}", results.sequence_info.header);
759    /// println!("Found {} genes", results.genes.len());
760    /// println!("GC content: {:.2}%", results.sequence_info.gc_content * 100.0);
761    /// # Ok::<(), orphos_core::types::OrphosError>(())
762    /// ```
763    pub fn analyze_sequence(
764        &mut self,
765        sequence: &str,
766        header: Option<String>,
767    ) -> Result<OrphosResults, OrphosError> {
768        let seq_bytes = sequence.as_bytes();
769        let header = header.unwrap_or_else(|| "Orphos_Seq_1".to_string());
770
771        self.analyze_sequence_bytes(seq_bytes, header, None)
772    }
773
774    /// Analyzes a single sequence from raw bytes.
775    ///
776    /// This is the core analysis method used by other convenience methods.
777    /// It handles the complete workflow: encoding, training, and gene prediction.
778    ///
779    /// # Arguments
780    ///
781    /// * `sequence` - Raw DNA sequence bytes (ASCII: A, T, G, C, N)
782    /// * `header` - Sequence identifier for output
783    /// * `description` - Optional sequence description
784    ///
785    /// # Returns
786    ///
787    /// [`OrphosResults`] containing:
788    /// - Predicted genes with coordinates and scores
789    /// - Training parameters used
790    /// - Sequence statistics (length, GC content)
791    ///
792    /// # Errors
793    ///
794    /// Returns [`OrphosError`] if:
795    /// - The sequence is too short for reliable training
796    /// - Sequence encoding fails
797    /// - Training or prediction fails
798    ///
799    /// # Examples
800    ///
801    /// ```rust,no_run
802    /// use orphos_core::{OrphosAnalyzer, config::OrphosConfig};
803    ///
804    /// let mut analyzer = OrphosAnalyzer::new(OrphosConfig::default());
805    ///
806    /// let sequence = b"ATGAAACGCATTAGCACCACCATT...";
807    /// let results = analyzer.analyze_sequence_bytes(
808    ///     sequence,
809    ///     "sequence1".to_string(),
810    ///     Some("E. coli genome".to_string())
811    /// )?;
812    ///
813    /// println!("Found {} genes", results.genes.len());
814    /// # Ok::<(), orphos_core::types::OrphosError>(())
815    /// ```
816    pub fn analyze_sequence_bytes(
817        &mut self,
818        sequence: &[u8],
819        header: String,
820        description: Option<String>,
821    ) -> Result<OrphosResults, OrphosError> {
822        let sequence_length = sequence.len();
823        let encoded_sequence = self.encode_sequence(sequence);
824
825        let mut untrained_orphos = UntrainedOrphos::with_config(self.config.clone())?;
826        let trained_orphos = if self.config.metagenomic {
827            untrained_orphos.train_meta_genome(&encoded_sequence)?
828        } else {
829            untrained_orphos.train_single_genome(&encoded_sequence)?
830        };
831        let (genes, metagenomic_model, best_training) = if self.config.metagenomic {
832            let (genes, best_phase) = trained_orphos.find_genes_meta(&encoded_sequence)?;
833            let bin = &bins()[best_phase];
834            let training = get_preset_training_ref(best_phase).unwrap();
835            let model_desc = format!(
836                "{}|{}|{}|{:.1}|{}|{}",
837                bin.id,
838                bin.name,
839                bin.domain,
840                bin.gc_percent,
841                training.translation_table,
842                if training.uses_shine_dalgarno { 1 } else { 0 }
843            );
844            (genes, Some(model_desc), training.clone())
845        } else {
846            let genes = trained_orphos.find_genes_single(&encoded_sequence)?;
847            let training = trained_orphos.training.clone().unwrap_or_default();
848            (genes, None, training)
849        };
850
851        Ok(OrphosResults {
852            genes: genes.clone(),
853            training_used: best_training,
854            sequence_info: SequenceInfo {
855                length: sequence_length,
856                gc_content: encoded_sequence.gc_content,
857                num_genes: genes.len(),
858                header,
859                description,
860            },
861            metagenomic_model,
862        })
863    }
864
865    /// Encodes a sequence to bitmap format for efficient processing.
866    ///
867    /// Converts the DNA sequence into a compact binary representation that
868    /// enables fast nucleotide lookups. Optionally masks runs of N characters.
869    ///
870    /// # Arguments
871    ///
872    /// * `sequence` - Raw DNA sequence bytes
873    ///
874    /// # Returns
875    ///
876    /// An [`EncodedSequence`] with forward and reverse-complement strands
877    /// encoded in bitmap format.
878    fn encode_sequence(&self, sequence: &[u8]) -> EncodedSequence {
879        if self.config.mask_n_runs {
880            EncodedSequence::with_masking(sequence)
881        } else {
882            EncodedSequence::without_masking(sequence)
883        }
884    }
885}
886
887#[cfg(test)]
888mod tests {
889    use super::*;
890    use crate::config::OutputFormat;
891    use crate::constants::TEST_SEQUENCE_REPEAT_FACTOR;
892    use std::env;
893    use std::fs;
894
895    // Helper function to create a simple test sequence
896    fn create_test_sequence() -> Vec<u8> {
897        // Simple DNA sequence with some genes
898        "ATGAAACGTAAATAG".as_bytes().to_vec()
899    }
900
901    // Helper function to create a longer test sequence for training
902    fn create_training_sequence() -> Vec<u8> {
903        // A longer sequence suitable for training (> 20,000 bp recommended)
904        let basic_gene = "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG";
905        basic_gene
906            .repeat(TEST_SEQUENCE_REPEAT_FACTOR)
907            .as_bytes()
908            .to_vec()
909    }
910
911    // Helper function to create an EncodedSequence for testing
912    fn create_encoded_sequence_for_test(seq: &[u8]) -> EncodedSequence {
913        EncodedSequence::without_masking(seq)
914    }
915
916    #[test]
917    fn test_training_state_traits() {
918        // Test that training state markers implement required traits
919        let _untrained: Untrained = Untrained;
920        let _trained: Trained = Trained;
921
922        // Test that they can be cloned and debugged
923        let untrained_clone = _untrained.clone();
924        let trained_clone = _trained.clone();
925
926        assert_eq!(format!("{:?}", untrained_clone), "Untrained");
927        assert_eq!(format!("{:?}", trained_clone), "Trained");
928    }
929
930    #[test]
931    fn test_untrained_orphos_new() {
932        let orphos = UntrainedOrphos::new();
933
934        assert!(!orphos.config.metagenomic);
935        assert!(!orphos.config.closed_ends);
936        assert!(!orphos.config.mask_n_runs);
937        assert!(!orphos.config.force_non_sd);
938        assert!(!orphos.config.quiet);
939        assert_eq!(orphos.config.output_format, OutputFormat::Genbank);
940        assert!(orphos.training.is_none());
941    }
942
943    #[test]
944    fn test_untrained_orphos_with_config() {
945        let config = OrphosConfig {
946            metagenomic: true,
947            closed_ends: true,
948            quiet: true,
949            ..OrphosConfig::default()
950        };
951
952        let result = UntrainedOrphos::with_config(config.clone());
953        assert!(result.is_ok());
954
955        let orphos = result.unwrap();
956        assert!(orphos.config.metagenomic);
957        assert!(orphos.config.closed_ends);
958        assert!(orphos.config.quiet);
959        assert!(orphos.training.is_none());
960    }
961
962    #[test]
963    fn test_untrained_orphos_with_thread_config() {
964        let config = OrphosConfig {
965            num_threads: Some(2),
966            ..OrphosConfig::default()
967        };
968
969        let result = UntrainedOrphos::with_config(config);
970        // Thread configuration might fail depending on system
971        // Expected to potentially fail
972        let _ = result;
973    }
974
975    #[test]
976    fn test_untrained_orphos_with_invalid_thread_config() {
977        let config = OrphosConfig {
978            num_threads: Some(0),
979            ..OrphosConfig::default()
980        };
981
982        let result = UntrainedOrphos::with_config(config);
983        // Thread configuration might fail depending on system
984        // Rayon might handle it gracefully or it might fail
985        let _ = result;
986    }
987
988    #[test]
989    fn test_untrained_orphos_with_conflicting_topology_config() {
990        let config = OrphosConfig {
991            circular: true,
992            closed_ends: true,
993            ..OrphosConfig::default()
994        };
995
996        let result = UntrainedOrphos::with_config(config);
997        assert!(result.is_err());
998        if let Err(OrphosError::InvalidSequence(msg)) = result {
999            assert!(msg.contains("circular"));
1000            assert!(msg.contains("closed_ends"));
1001        } else {
1002            panic!("Expected InvalidSequence error");
1003        }
1004    }
1005
1006    #[test]
1007    fn test_trained_orphos_new() {
1008        let config = OrphosConfig::default();
1009        let training = Training::default();
1010
1011        let orphos = TrainedOrphos::new(config, training);
1012
1013        assert!(orphos.training.is_some());
1014    }
1015
1016    #[test]
1017    fn test_train_single_genome_basic() {
1018        let mut orphos = UntrainedOrphos::new();
1019        let sequence = create_training_sequence();
1020        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
1021
1022        let result = orphos.train_single_genome(&encoded_sequence);
1023
1024        assert!(result.is_ok());
1025        let trained = result.unwrap();
1026        assert!(trained.training.is_some());
1027    }
1028
1029    #[test]
1030    fn test_train_single_genome_quiet_mode() {
1031        let config = OrphosConfig {
1032            quiet: true,
1033            ..OrphosConfig::default()
1034        };
1035        let mut orphos = UntrainedOrphos::with_config(config).unwrap();
1036
1037        let sequence = create_training_sequence();
1038        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
1039
1040        let result = orphos.train_single_genome(&encoded_sequence);
1041
1042        assert!(result.is_ok());
1043    }
1044
1045    #[test]
1046    fn test_train_single_genome_force_non_sd() {
1047        let config = OrphosConfig {
1048            force_non_sd: true,
1049            ..OrphosConfig::default()
1050        };
1051        let mut orphos = UntrainedOrphos::with_config(config).unwrap();
1052
1053        let sequence = create_training_sequence();
1054        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
1055
1056        let result = orphos.train_single_genome(&encoded_sequence);
1057
1058        assert!(result.is_ok());
1059        let trained = result.unwrap();
1060        let training = trained.training.unwrap();
1061        assert!(!training.uses_shine_dalgarno); // Should be forced to false
1062    }
1063
1064    #[test]
1065    fn test_trained_orphos_find_genes_single() {
1066        // First create a trained orphos
1067        let mut orphos = UntrainedOrphos::new();
1068        let sequence = create_training_sequence();
1069        let encoded_sequence = create_encoded_sequence_for_test(&sequence);
1070
1071        let trained = orphos.train_single_genome(&encoded_sequence).unwrap();
1072
1073        // Now test gene finding
1074        let test_seq = create_test_sequence();
1075        let test_encoded_sequence = create_encoded_sequence_for_test(&test_seq);
1076
1077        let result = trained.find_genes_single(&test_encoded_sequence);
1078
1079        assert!(result.is_ok());
1080        let _genes = result.unwrap();
1081        // Should find genes in the sequence - number could be 0 for very short sequences
1082    }
1083
1084    #[test]
1085    fn test_trained_orphos_find_genes_without_training() {
1086        let config = OrphosConfig::default();
1087        let orphos = TrainedOrphos {
1088            config,
1089            training: None,
1090            _state: PhantomData,
1091        };
1092
1093        let test_seq = create_test_sequence();
1094        let test_encoded_sequence = create_encoded_sequence_for_test(&test_seq);
1095
1096        let result = orphos.find_genes_single(&test_encoded_sequence);
1097
1098        assert!(result.is_err());
1099        if let Err(OrphosError::InvalidSequence(msg)) = result {
1100            assert!(msg.contains("not trained"));
1101        } else {
1102            panic!("Expected InvalidSequence error");
1103        }
1104    }
1105
1106    #[test]
1107    fn test_orphos_analyzer_new() {
1108        let config = OrphosConfig::default();
1109        let analyzer = OrphosAnalyzer::new(config.clone());
1110
1111        assert_eq!(analyzer.config.metagenomic, config.metagenomic);
1112        assert_eq!(analyzer.config.closed_ends, config.closed_ends);
1113    }
1114
1115    #[test]
1116    fn test_analyze_sequence_basic() {
1117        let config = OrphosConfig::default();
1118        let mut analyzer = OrphosAnalyzer::new(config);
1119
1120        let sequence =
1121            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1122
1123        let result = analyzer.analyze_sequence(&sequence, None);
1124        assert!(result.is_ok());
1125
1126        let analysis = result.unwrap();
1127        assert_eq!(analysis.sequence_info.header, "Orphos_Seq_1");
1128        assert_eq!(analysis.sequence_info.length, sequence.len());
1129        assert!(
1130            analysis.sequence_info.gc_content >= 0.0 && analysis.sequence_info.gc_content <= 1.0
1131        );
1132        assert!(analysis.metagenomic_model.is_none());
1133    }
1134
1135    #[test]
1136    fn test_analyze_sequence_with_header() {
1137        let config = OrphosConfig::default();
1138        let mut analyzer = OrphosAnalyzer::new(config);
1139
1140        let sequence =
1141            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1142        let header = Some("test_sequence".to_string());
1143
1144        let result = analyzer.analyze_sequence(&sequence, header);
1145        assert!(result.is_ok());
1146
1147        let analysis = result.unwrap();
1148        assert_eq!(analysis.sequence_info.header, "test_sequence");
1149    }
1150
1151    #[test]
1152    fn test_analyze_sequence_bytes() {
1153        let config = OrphosConfig::default();
1154        let mut analyzer = OrphosAnalyzer::new(config);
1155
1156        let sequence =
1157            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1158        let seq_bytes = sequence.as_bytes();
1159        let header = "test_sequence".to_string();
1160        let description = Some("Test description".to_string());
1161
1162        let result =
1163            analyzer.analyze_sequence_bytes(seq_bytes, header.clone(), description.clone());
1164        assert!(result.is_ok());
1165
1166        let analysis = result.unwrap();
1167        assert_eq!(analysis.sequence_info.header, header);
1168        assert_eq!(analysis.sequence_info.description, description);
1169        assert_eq!(analysis.sequence_info.length, seq_bytes.len());
1170    }
1171
1172    #[test]
1173    fn test_analyze_sequence_metagenomic_config() {
1174        let config = OrphosConfig {
1175            metagenomic: true,
1176            ..OrphosConfig::default()
1177        };
1178        let mut analyzer = OrphosAnalyzer::new(config);
1179
1180        let sequence =
1181            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1182
1183        let result = analyzer.analyze_sequence(&sequence, None);
1184        assert!(result.is_ok());
1185
1186        let analysis = result.unwrap();
1187        assert!(analysis.metagenomic_model.is_some());
1188        // Metagenomic model string format: "id|name|domain|gc_percent|transl_table|uses_sd"
1189        let model = analysis.metagenomic_model.unwrap();
1190        assert!(
1191            model.contains("|"),
1192            "Expected metagenomic model description with | separator"
1193        );
1194    }
1195
1196    // #[test]
1197    // fn test_encode_sequence_basic() {
1198    //     let config = OrphosConfig::default();
1199    //     let analyzer = OrphosAnalyzer::new(config);
1200
1201    //     let sequence = b"ATCG";
1202    //     let mut encoded = vec![0u8; (sequence.len() * 2).div_ceil(8)];
1203    //     let mut unknown = vec![0u8; sequence.len().div_ceil(8)];
1204    //     let mut masks = Vec::new();
1205
1206    //     let result = analyzer.encode_sequence(sequence, &mut encoded, &mut unknown, &mut masks);
1207    //     assert!(result.is_ok());
1208
1209    //     let gc_content = result.unwrap();
1210    //     assert!((0.0..=1.0).contains(&gc_content));
1211    // }
1212
1213    // #[test]
1214    // fn test_encode_sequence_with_mask_n_runs() {
1215    //     let config = OrphosConfig {
1216    //         mask_n_runs: true,
1217    //         ..OrphosConfig::default()
1218    //     };
1219    //     let analyzer = OrphosAnalyzer::new(config);
1220
1221    //     let sequence = b"ATCGNNNNGCAT";
1222    //     let mut encoded = vec![0u8; (sequence.len() * 2).div_ceil(8)];
1223    //     let mut unknown = vec![0u8; sequence.len().div_ceil(8)];
1224    //     let mut masks = Vec::new();
1225
1226    //     let result = analyzer.encode_sequence(sequence, &mut encoded, &mut unknown, &mut masks);
1227    //     assert!(result.is_ok());
1228
1229    //     // N-run masking behavior depends on implementation - might not always create masks
1230    //     // So we just check that it doesn't crash
1231    // }
1232
1233    #[test]
1234    fn test_analyze_fasta_file_not_found() {
1235        let config = OrphosConfig::default();
1236        let mut analyzer = OrphosAnalyzer::new(config);
1237
1238        let result = analyzer.analyze_fasta_file("nonexistent_file.fa");
1239        assert!(result.is_err());
1240    }
1241
1242    #[test]
1243    fn test_analyze_fasta_file_metagenomic() {
1244        let config = OrphosConfig {
1245            metagenomic: true,
1246            ..OrphosConfig::default()
1247        };
1248        let mut analyzer = OrphosAnalyzer::new(config);
1249
1250        // Create a temporary FASTA file
1251        let fasta_content = ">test_seq\nATCG\n";
1252        let temp_dir = env::temp_dir();
1253        let temp_file = temp_dir.join("test_metagenomic.fa");
1254        fs::write(&temp_file, fasta_content).unwrap();
1255
1256        let result = analyzer.analyze_fasta_file(&temp_file);
1257        assert!(result.is_ok());
1258
1259        let results = result.unwrap();
1260        assert_eq!(results.len(), 1); // One result per sequence in the file
1261        assert!(results[0].genes.is_empty()); // But no genes found in 4bp sequence
1262
1263        let _ = fs::remove_file(temp_file);
1264    }
1265
1266    #[test]
1267    fn test_analyze_fasta_file_single_genome() {
1268        let config = OrphosConfig::default();
1269        let mut analyzer = OrphosAnalyzer::new(config);
1270
1271        // Create a temporary FASTA file with a longer sequence for training
1272        let sequence =
1273            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(300);
1274        let fasta_content = format!(">test_seq\n{}\n", sequence);
1275        let temp_dir = env::temp_dir();
1276        let temp_file = temp_dir.join("test_single_genome.fa");
1277        fs::write(&temp_file, fasta_content).unwrap();
1278
1279        let result = analyzer.analyze_fasta_file(&temp_file);
1280        assert!(result.is_ok());
1281
1282        let results = result.unwrap();
1283        assert_eq!(results.len(), 1);
1284        assert_eq!(results[0].sequence_info.header, "test_seq");
1285
1286        let _ = fs::remove_file(temp_file);
1287    }
1288
1289    #[test]
1290    fn test_analyze_empty_sequence() {
1291        let config = OrphosConfig::default();
1292        let mut analyzer = OrphosAnalyzer::new(config);
1293
1294        let sequence = "";
1295        let result = analyzer.analyze_sequence(sequence, None);
1296
1297        assert!(result.is_err());
1298        if let Err(e) = result {
1299            // Should be an InvalidSequence error about being too short
1300            match e {
1301                OrphosError::InvalidSequence(msg) => {
1302                    assert!(msg.contains("too short"));
1303                }
1304                _ => panic!("Expected InvalidSequence error for empty sequence"),
1305            }
1306        }
1307    }
1308
1309    #[test]
1310    fn test_analyze_very_short_sequence() {
1311        let config = OrphosConfig::default();
1312        let mut analyzer = OrphosAnalyzer::new(config);
1313
1314        let sequence = "ATG"; // Very short sequence (3 bp)
1315        let result = analyzer.analyze_sequence(sequence, None);
1316
1317        assert!(result.is_err());
1318        if let Err(e) = result {
1319            // Should be an InvalidSequence error about being too short
1320            match e {
1321                OrphosError::InvalidSequence(msg) => {
1322                    assert!(msg.contains("too short"));
1323                }
1324                _ => panic!("Expected InvalidSequence error for very short sequence"),
1325            }
1326        }
1327    }
1328
1329    #[test]
1330    fn test_config_cloning() {
1331        let config1 = OrphosConfig::default();
1332        let orphos1 = UntrainedOrphos::with_config(config1.clone()).unwrap();
1333
1334        let config2 = orphos1.config.clone();
1335        let _orphos2 = UntrainedOrphos::with_config(config2).unwrap();
1336    }
1337
1338    #[test]
1339    fn test_debug_formatting() {
1340        let orphos = UntrainedOrphos::new();
1341        let debug_str = format!("{:?}", orphos);
1342        assert!(debug_str.contains("Orphos"));
1343        assert!(debug_str.contains("config"));
1344
1345        let analyzer = OrphosAnalyzer::new(OrphosConfig::default());
1346        let debug_str2 = format!("{:?}", analyzer);
1347        assert!(debug_str2.contains("OrphosAnalyzer"));
1348        assert!(debug_str2.contains("config"));
1349    }
1350
1351    #[test]
1352    fn test_type_aliases() {
1353        // Test that type aliases work correctly
1354        let _untrained: UntrainedOrphos = UntrainedOrphos::new();
1355        let _trained: TrainedOrphos =
1356            TrainedOrphos::new(OrphosConfig::default(), Training::default());
1357
1358        assert_eq!(
1359            std::any::type_name::<UntrainedOrphos>(),
1360            std::any::type_name::<Orphos<Untrained>>()
1361        );
1362        assert_eq!(
1363            std::any::type_name::<TrainedOrphos>(),
1364            std::any::type_name::<Orphos<Trained>>()
1365        );
1366    }
1367
1368    #[test]
1369    fn test_training_state_phantom_data() {
1370        let untrained = UntrainedOrphos::new();
1371        let trained = TrainedOrphos::new(OrphosConfig::default(), Training::default());
1372
1373        assert_eq!(std::mem::size_of_val(&untrained._state), 0);
1374        assert_eq!(std::mem::size_of_val(&trained._state), 0);
1375    }
1376
1377    #[test]
1378    fn test_analyzer_multiple_sequences() {
1379        let config = OrphosConfig::default();
1380        let mut analyzer = OrphosAnalyzer::new(config);
1381
1382        let sequence1 =
1383            "ATGAAACGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTCGTAAATAG".repeat(150);
1384        let sequence2 =
1385            "ATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATAG".repeat(200);
1386
1387        let result1 = analyzer.analyze_sequence(&sequence1, Some("seq1".to_string()));
1388        let result2 = analyzer.analyze_sequence(&sequence2, Some("seq2".to_string()));
1389
1390        assert!(result1.is_ok());
1391        assert!(result2.is_ok());
1392
1393        let analysis1 = result1.unwrap();
1394        let analysis2 = result2.unwrap();
1395
1396        assert_eq!(analysis1.sequence_info.header, "seq1");
1397        assert_eq!(analysis2.sequence_info.header, "seq2");
1398        assert_ne!(
1399            analysis1.sequence_info.length,
1400            analysis2.sequence_info.length
1401        );
1402    }
1403
1404    #[test]
1405    fn test_error_handling_edge_cases() {
1406        let config = OrphosConfig::default();
1407        let mut analyzer = OrphosAnalyzer::new(config);
1408
1409        // Test with sequence containing invalid characters
1410        let invalid_sequence = "ATCGXYZ123";
1411        let result = analyzer.analyze_sequence(invalid_sequence, None);
1412
1413        // Should either handle gracefully or return appropriate error
1414        let _ = result;
1415    }
1416}