1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
// =============================================================================
// Orphos constants
// =============================================================================
// =============================================================================
// Version
// =============================================================================
/// Orphos version, as a string.
pub const VERSION: &str = "1.0.0";
// =============================================================================
// Sequence processing
// =============================================================================
/// Upper bound on the length of an input sequence, in base pairs.
pub const MAX_SEQUENCE_LENGTH: usize = 32_000_000;
/// Longest line accepted while parsing input.
pub const MAX_LINE_LENGTH: usize = 10_000;
/// Size of the sequence masks used to filter regions.
pub const MASK_SIZE: usize = 50;
/// Shortest sequence on which gene prediction is attempted.
pub const MIN_SEQUENCE_LENGTH: usize = 96;
/// Number of reading frames analyzed, for the forward and the reverse direction.
///
/// NOTE(review): the value 3 looks like a per-strand count rather than a total over
/// both strands — confirm how callers combine it with strand direction.
pub const READING_FRAMES: usize = 3;
/// Number of base pairs in one codon.
pub const CODON_LENGTH: usize = 3;
/// Width of the sliding window used in sequence analysis.
pub const SLIDING_WINDOW_SIZE: usize = 120;
/// Chunk size used by the SIMD sequence-analysis code.
pub const CHUNK_SIZE: usize = 32;
// =============================================================================
// Gene length and overlap
// =============================================================================
/// Shortest length, in base pairs, accepted for a normal gene.
pub const MINIMUM_GENE_LENGTH: usize = 90;
/// Shortest length, in base pairs, accepted for a gene that extends to a sequence edge.
pub const MINIMUM_EDGE_GENE_LENGTH: usize = 60;
/// Largest overlap permitted between two genes on the same strand.
pub const MAXIMUM_SAME_OVERLAP: usize = 60;
/// Largest overlap permitted between two genes on opposite strands.
///
/// NOTE(review): typed `i32` while the neighbouring overlap limits are `usize`;
/// presumably it is compared against signed distances — confirm at the call sites
/// before unifying the types.
pub const MAXIMUM_OPPOSITE_OVERLAP: i32 = 200;
/// Overlap ceiling used by the optimization algorithms.
pub const MAXIMUM_OVERLAP: usize = 60;
/// Offset applied when processing stop codons.
pub const STOP_CODON_OFFSET: usize = 2;
/// Factor by which gene overlaps are penalized.
pub const OVERLAP_PENALTY_FACTOR: f64 = 0.15;
// =============================================================================
// Node and gene limits
// =============================================================================
/// Upper bound on the number of nodes (start/stop positions) considered.
pub const STT_NOD: usize = 100_000;
/// Upper bound on the number of genes predicted for a single sequence.
pub const MAXIMUM_GENES: usize = 50_000;
/// A second gene-count ceiling, used in different contexts.
///
/// NOTE(review): smaller than [`MAXIMUM_GENES`]; which limit applies where is not
/// visible from this file — confirm before merging the two.
pub const MAX_GENES: usize = 30_000;
/// Greatest distance between nodes taken into account when scoring.
pub const MAX_NODE_DIST: usize = 500;
// =============================================================================
// Motifs and ribosome binding sites
// =============================================================================
/// Length of the dicodon sequences used for gene scoring.
pub const DICODON_SIZE: usize = 6;
/// Count of distinct dicodons: 4^6 = 4096.
pub const NUM_DICODONS: usize = 4096;
/// Shortest motif considered in pattern matching.
pub const MIN_MOTIF_LENGTH: usize = 3;
/// Longest motif considered in pattern matching.
pub const MAX_MOTIF_LENGTH: usize = 6;
/// Smallest distance from the gene start at which motifs are searched.
pub const MIN_DISTANCE_FROM_START: usize = 4;
/// Largest distance considered when detecting a ribosome binding site.
pub const MAX_RIBOSOME_DISTANCE: usize = 15;
/// Upstream distance covered when searching for RBS motifs.
pub const RBS_UPSTREAM_DISTANCE: usize = 20;
/// Downstream distance covered when searching for RBS motifs.
pub const RBS_DOWNSTREAM_DISTANCE: usize = 6;
/// Distance threshold used when considering operon structure.
pub const OPERON_DISTANCE: f64 = 60.0;
// =============================================================================
// Scoring
// =============================================================================
/// Lower bound on log-likelihood values used in scoring.
pub const MIN_LOG_LIKELIHOOD: f64 = -5.0;
/// Upper bound on log-likelihood values used in scoring.
pub const MAX_LOG_LIKELIHOOD: f64 = 5.0;
/// Threshold for the minimum motif score.
pub const MIN_MOTIF_SCORE: f64 = -4.0;
/// Starting value of the maximum score during optimization.
pub const INITIAL_MAX_SCORE: f64 = -100.0;
/// Offset applied to the motif threshold when adjusting scores.
pub const MOTIF_THRESHOLD_OFFSET: f64 = 0.69;
/// Smallest cumulative score used in sequence analysis.
pub const MIN_CUMULATIVE_SCORE: f64 = 6.0;
/// Bonus added to the score of genes that extend to a sequence edge.
pub const EDGE_BONUS: f64 = 0.74;
/// Upstream score penalty for edge genes.
pub const EDGE_UPSTREAM: f64 = -1.0;
/// Score penalty applied in metagenomic mode.
pub const METAGENOMIC_PENALTY: f64 = 7.5;
/// Divisor for the coefficient in the metagenomic penalty calculation.
pub const METAGENOMIC_PENALTY_DIVISOR: f64 = 2700.0;
/// Sentinel value for coding scores.
pub const CODING_SCORE_SENTINEL: f64 = -10_000.0;
// =============================================================================
// Search windows and thresholds
// =============================================================================
/// Size of the window searched during node analysis.
pub const NODE_SEARCH_WINDOW: usize = 500;
/// Alternative search window used for optimization.
pub const SEARCH_WINDOW: usize = 100;
/// Cut-off used to classify a gene as short.
pub const SHORT_GENE_THRESHOLD: usize = 250;
/// Length cut-off used in metagenomic analysis.
pub const METAGENOMIC_LENGTH_THRESHOLD: usize = 3000;
/// Shortest gene length considered in metagenomic analysis.
pub const MIN_META_GENE_LENGTH: usize = 120;
/// Length factor entering the metagenomic minimum calculation.
pub const METAGENOMIC_MIN_LENGTH_FACTOR: usize = 1500;
/// Extent of the upstream region scanned for motifs.
pub const UPSTREAM_SCAN_RANGE: usize = 45;
/// Position at which the skipped stretch begins during upstream scanning.
pub const UPSTREAM_SKIP_START: usize = 2;
/// Position at which the skipped stretch ends during upstream scanning.
pub const UPSTREAM_SKIP_END: usize = 15;
/// Cut-off applied when evaluating coding scores.
pub const CODING_SCORE_THRESHOLD: f64 = 5.0;
/// Cut-off on the length factor used in gene scoring.
pub const LENGTH_FACTOR_THRESHOLD: f64 = 3.0;
/// Multiplier applied in length-factor calculations.
pub const LENGTH_FACTOR_MULTIPLIER: f64 = 0.5;
/// Cut-off used to detect an edge position.
pub const EDGE_POSITION_THRESHOLD: usize = 2;
/// Offset applied in edge-position calculations.
pub const EDGE_POSITION_OFFSET: usize = 3;
/// Weight given to upstream composition when scoring.
pub const UPSTREAM_COMPOSITION_WEIGHT: f64 = 0.4;
/// Cut-off for no-motif detection.
pub const NO_MOTIF_THRESHOLD: f64 = -0.5;
/// Penalty applied to negative scores.
pub const NEGATIVE_SCORE_PENALTY: f64 = 0.5;
/// Smallest gene size, in codons.
pub const MIN_GENE_SIZE_CODONS: i32 = 80;
/// Largest gene size, in codons.
pub const MAX_GENE_SIZE_CODONS: i32 = 1000;
/// Scaling factor used in gene-size calculations.
///
/// NOTE(review): numerically equal to `MAX_GENE_SIZE_CODONS - MIN_GENE_SIZE_CODONS`
/// (1000 - 80); presumably the three must change together — confirm at the call sites.
pub const GENE_SIZE_SCALING_FACTOR: f64 = 920.0;
// =============================================================================
// Training and model parameters
// =============================================================================
/// Iteration cap for training in the Shine-Dalgarno analysis.
pub const MAX_TRAINING_ITERATIONS_SD: usize = 10;
/// Iteration cap for training in the non-Shine-Dalgarno analysis.
pub const MAX_TRAINING_ITERATIONS_NONSD: usize = 20;
/// Score threshold that training starts from.
pub const INITIAL_SCORE_THRESHOLD: f64 = 35.0;
/// Divisor applied to the threshold across training iterations.
pub const THRESHOLD_DIVISOR: f64 = 2.0;
/// Gene-ratio cut-off used to validate training.
pub const GENE_RATIO_THRESHOLD: f64 = 2000.0;
/// Coverage cut-off used in upstream motif analysis.
pub const UPSTREAM_MOTIF_COVERAGE_THRESHOLD: f64 = 0.2;
/// Lowest GC content handled by the analysis.
pub const MIN_GC_CONTENT: f64 = 0.1;
/// Highest GC content handled by the analysis.
pub const MAX_GC_CONTENT: f64 = 0.9;
/// Low GC frequency threshold.
///
/// NOTE(review): this value (0.45) is larger than [`HIGH_GC_FREQ`] (0.05); either the
/// names are swapped or "low"/"high" does not describe the magnitude — confirm
/// against the call sites.
pub const LOW_GC_FREQ: f64 = 0.45;
/// High GC frequency threshold.
pub const HIGH_GC_FREQ: f64 = 0.05;
/// Lower clamp applied to weights.
pub const WEIGHT_CLAMP_MIN: f64 = -4.0;
/// Upper clamp applied to weights.
pub const WEIGHT_CLAMP_MAX: f64 = 4.0;
/// High cut-off used when evaluating RBS weights.
pub const RBS_WEIGHT_THRESHOLD_HIGH: f64 = 1.0;
/// Low cut-off used when evaluating RBS weights.
pub const RBS_WEIGHT_THRESHOLD_LOW: f64 = -0.5;
/// Cut-off for a strong RBS weight.
pub const RBS_WEIGHT_STRONG_THRESHOLD: f64 = 2.0;
/// Count of codon types (ATG, GTG, TTG).
pub const NUM_CODON_TYPES: usize = 3;
/// Count of RBS weight patterns.
pub const NUM_RBS_WEIGHTS: usize = 28;
/// Count of nucleotide bases (A, T, G, C).
pub const NUM_BASES: usize = 4;
/// Count of upstream positions analyzed.
pub const UPSTREAM_POSITIONS: usize = 32;
/// Count of motif size categories.
pub const NUM_MOTIF_SIZES: usize = 4;
/// Largest motif index value.
pub const MAX_MOTIF_INDEX: usize = 4096;
/// Highest confidence score reported for a gene prediction.
pub const MAX_CONFIDENCE_SCORE: f64 = 99.99;
/// Expected no-stop probability used in genetic code analysis.
pub const EXPECTED_NO_STOP_PROB: f64 = 0.953;
/// Repeat factor used to build longer test sequences.
pub const TEST_SEQUENCE_REPEAT_FACTOR: usize = 300;
/// Position at which upstream analysis begins.
pub const UPSTREAM_START_POS: usize = 1;
/// Position at which upstream analysis ends.
pub const UPSTREAM_END_POS: usize = 45;
/// AT frequency parameter used for low GC content.
pub const GC_LOW_AT_FREQ: f64 = 0.9;
/// GC frequency parameter used for low GC content.
pub const GC_LOW_GC_FREQ: f64 = 0.1;
/// AT frequency parameter used for high GC content.
pub const GC_HIGH_AT_FREQ: f64 = 0.1;
/// GC frequency parameter used for high GC content.
pub const GC_HIGH_GC_FREQ: f64 = 0.9;
// =============================================================================
// Bit masks and lookup tables
// =============================================================================
/// Mask for the lower bits in hamming distance calculations.
///
/// Selects the low bit of every 2-bit pair (bits 0, 2, 4, …, 62).
pub const LOWER_BITS: u64 = 0x5555_5555_5555_5555;
/// Mask for the upper bits in hamming distance calculations.
///
/// Selects the high bit of every 2-bit pair (bits 1, 3, 5, …, 63), i.e.
/// `0xAAAA_AAAA_AAAA_AAAA`, written as the complement of [`LOWER_BITS`].
pub const UPPER_BITS: u64 = !LOWER_BITS;
// NOTE(review): the type annotation and initializer of this constant are absent as the
// line appears here, so the item does not parse as written — restore both from the
// original definition.
/// Nucleotide lookup table for unpacking.
pub const NUCLEOTIDE_LOOKUP: = ;
// NOTE(review): the type annotation and initializer of this constant are absent as the
// line appears here, so the item does not parse as written — restore both from the
// original definition.
/// Character lookup for sequence display.
pub const NUCLEOTIDE_LETTERS: = ;
// =============================================================================
// RBS descriptions and default weights
// =============================================================================
// NOTE(review): the type annotation and initializer of this constant are absent as the
// line appears here, so the item does not parse as written — restore both from the
// original definition.
/// Shine-Dalgarno motif descriptions for annotation.
pub const RBS_DESCRIPTIONS: = ;
/// Start weight factor used by default in both training and scoring.
pub const DEFAULT_START_WEIGHT_FACTOR: f64 = 4.35;