// orphos_core/constants.rs
// =============================================================================
// Orphos core constants
// =============================================================================

// =============================================================================
// Version
// =============================================================================

/// Version string for Orphos.
pub const VERSION: &str = "1.0.0";

// =============================================================================
// Sequence processing
// =============================================================================

/// Maximum allowed sequence length in base pairs.
pub const MAX_SEQUENCE_LENGTH: usize = 32_000_000;

/// Maximum line length for input parsing.
pub const MAX_LINE_LENGTH: usize = 10_000;

/// Size of sequence masks for filtering regions.
pub const MASK_SIZE: usize = 50;

/// Minimum sequence length required for gene prediction.
pub const MIN_SEQUENCE_LENGTH: usize = 96;

/// Number of reading frames to analyze.
///
/// NOTE(review): the value is 3, so this is presumably the frame count per
/// strand rather than forward and reverse combined — confirm against callers.
pub const READING_FRAMES: usize = 3;

/// Length of a codon in base pairs.
pub const CODON_LENGTH: usize = 3;

/// Sliding window size for sequence analysis.
pub const SLIDING_WINDOW_SIZE: usize = 120;

/// SIMD processing chunk size for sequence analysis.
pub const CHUNK_SIZE: usize = 32;

// =============================================================================
// Gene length and overlap constraints
// =============================================================================

/// Minimum gene length in base pairs for normal genes.
pub const MINIMUM_GENE_LENGTH: usize = 90;

/// Minimum gene length in base pairs for genes extending to sequence edges.
pub const MINIMUM_EDGE_GENE_LENGTH: usize = 60;

/// Maximum allowed overlap between genes on the same strand.
pub const MAXIMUM_SAME_OVERLAP: usize = 60;

/// Maximum allowed overlap between genes on opposite strands.
///
/// NOTE(review): typed `i32`, unlike the `usize` overlap limits next to it;
/// presumably compared against signed coordinates — confirm before unifying.
pub const MAXIMUM_OPPOSITE_OVERLAP: i32 = 200;

/// Maximum overlap for optimization algorithms.
///
/// Holds the same value (60) as `MAXIMUM_SAME_OVERLAP`.
pub const MAXIMUM_OVERLAP: usize = 60;

/// Offset for stop codon processing.
pub const STOP_CODON_OFFSET: usize = 2;

/// Penalty factor for gene overlaps.
pub const OVERLAP_PENALTY_FACTOR: f64 = 0.15;

// =============================================================================
// Node and gene count limits
// =============================================================================

/// Maximum number of nodes (start/stop positions) to consider.
pub const STT_NOD: usize = 100_000;

/// Maximum number of genes that can be predicted in a single sequence.
pub const MAXIMUM_GENES: usize = 50_000;

/// Alternative maximum gene count constant (used in different contexts).
///
/// NOTE(review): differs from `MAXIMUM_GENES` (30_000 vs 50_000); confirm
/// which call sites rely on which limit before merging the two.
pub const MAX_GENES: usize = 30_000;

/// Maximum distance between nodes for scoring.
pub const MAX_NODE_DIST: usize = 500;

// =============================================================================
// Dicodon, motif and RBS geometry
// =============================================================================

/// Size of dicodon sequences for gene scoring.
pub const DICODON_SIZE: usize = 6;

/// Total number of possible dicodons (4^6 = 4096).
///
/// Derived from `DICODON_SIZE`: 4^n equals 2^(2n), hence the shift.
pub const NUM_DICODONS: usize = 1 << (2 * DICODON_SIZE);

/// Minimum motif length for pattern matching.
pub const MIN_MOTIF_LENGTH: usize = 3;

/// Maximum motif length for pattern matching.
pub const MAX_MOTIF_LENGTH: usize = 6;

/// Minimum distance from gene start for motif searching.
pub const MIN_DISTANCE_FROM_START: usize = 4;

/// Maximum distance for ribosome binding site detection.
pub const MAX_RIBOSOME_DISTANCE: usize = 15;

/// Upstream distance for RBS motif searching.
pub const RBS_UPSTREAM_DISTANCE: usize = 20;

/// Downstream distance for RBS motif searching.
pub const RBS_DOWNSTREAM_DISTANCE: usize = 6;

/// Distance threshold for operon structure consideration.
pub const OPERON_DISTANCE: f64 = 60.0;

// =============================================================================
// Scoring
// =============================================================================

/// Minimum log likelihood value for scoring.
pub const MIN_LOG_LIKELIHOOD: f64 = -5.0;

/// Maximum log likelihood value for scoring.
pub const MAX_LOG_LIKELIHOOD: f64 = 5.0;

/// Minimum motif score threshold.
pub const MIN_MOTIF_SCORE: f64 = -4.0;

/// Initial maximum score for optimization.
pub const INITIAL_MAX_SCORE: f64 = -100.0;

/// Motif threshold offset for scoring adjustments.
pub const MOTIF_THRESHOLD_OFFSET: f64 = 0.69;

/// Minimum cumulative score for sequence analysis.
pub const MIN_CUMULATIVE_SCORE: f64 = 6.0;

/// Score bonus applied to genes extending to sequence edges.
pub const EDGE_BONUS: f64 = 0.74;

/// Upstream score penalty for edge genes.
pub const EDGE_UPSTREAM: f64 = -1.00;

/// Score penalty applied in metagenomic mode.
pub const METAGENOMIC_PENALTY: f64 = 7.5;

/// Coefficient divisor for metagenomic penalty calculation.
pub const METAGENOMIC_PENALTY_DIVISOR: f64 = 2700.0;

/// Sentinel value for coding scores.
pub const CODING_SCORE_SENTINEL: f64 = -10000.0;

// =============================================================================
// Search windows and length thresholds
// =============================================================================

/// Search window size for node analysis.
pub const NODE_SEARCH_WINDOW: usize = 500;

/// Alternative search window for optimization.
pub const SEARCH_WINDOW: usize = 100;

/// Threshold for short gene classification.
pub const SHORT_GENE_THRESHOLD: usize = 250;

/// Length threshold for metagenomic analysis.
pub const METAGENOMIC_LENGTH_THRESHOLD: usize = 3000;

/// Minimum gene length for metagenomic analysis.
pub const MIN_META_GENE_LENGTH: usize = 120;

/// Length factor for metagenomic minimum calculation.
pub const METAGENOMIC_MIN_LENGTH_FACTOR: usize = 1500;

/// Range for upstream motif scanning.
pub const UPSTREAM_SCAN_RANGE: usize = 45;

/// Starting position to skip in upstream scanning.
pub const UPSTREAM_SKIP_START: usize = 2;

/// Ending position to skip in upstream scanning.
pub const UPSTREAM_SKIP_END: usize = 15;

/// Threshold for coding score evaluation.
pub const CODING_SCORE_THRESHOLD: f64 = 5.0;

/// Length factor threshold for gene scoring.
pub const LENGTH_FACTOR_THRESHOLD: f64 = 3.0;

/// Multiplier for length factor calculations.
pub const LENGTH_FACTOR_MULTIPLIER: f64 = 0.5;

/// Threshold for edge position detection.
pub const EDGE_POSITION_THRESHOLD: usize = 2;

/// Offset for edge position calculations.
pub const EDGE_POSITION_OFFSET: usize = 3;

/// Weight factor for upstream composition scoring.
pub const UPSTREAM_COMPOSITION_WEIGHT: f64 = 0.4;

/// Threshold for no motif detection.
pub const NO_MOTIF_THRESHOLD: f64 = -0.5;

/// Penalty for negative scores.
pub const NEGATIVE_SCORE_PENALTY: f64 = 0.5;

/// Minimum gene size in codons.
///
/// Signed (`i32`), as is `MAX_GENE_SIZE_CODONS`.
pub const MIN_GENE_SIZE_CODONS: i32 = 80;

/// Maximum gene size in codons.
pub const MAX_GENE_SIZE_CODONS: i32 = 1000;

/// Scaling factor for gene size calculations.
///
/// Equals `MAX_GENE_SIZE_CODONS - MIN_GENE_SIZE_CODONS` (1000 - 80);
/// NOTE(review): presumably intentional — confirm before changing either bound.
pub const GENE_SIZE_SCALING_FACTOR: f64 = 920.0;

// =============================================================================
// Training
// =============================================================================

/// Maximum training iterations for Shine-Dalgarno analysis.
pub const MAX_TRAINING_ITERATIONS_SD: usize = 10;

/// Maximum training iterations for non-Shine-Dalgarno analysis.
pub const MAX_TRAINING_ITERATIONS_NONSD: usize = 20;

/// Initial score threshold for training.
pub const INITIAL_SCORE_THRESHOLD: f64 = 35.0;

/// Threshold divisor for training iterations.
pub const THRESHOLD_DIVISOR: f64 = 2.0;

/// Gene ratio threshold for training validation.
pub const GENE_RATIO_THRESHOLD: f64 = 2000.0;

/// Coverage threshold for upstream motif analysis.
pub const UPSTREAM_MOTIF_COVERAGE_THRESHOLD: f64 = 0.2;

/// Minimum GC content for analysis.
pub const MIN_GC_CONTENT: f64 = 0.1;

/// Maximum GC content for analysis.
pub const MAX_GC_CONTENT: f64 = 0.9;

/// Low GC frequency threshold.
///
/// NOTE(review): this "low" value (0.45) is greater than `HIGH_GC_FREQ`
/// (0.05); confirm against the call sites that names and values are not swapped.
pub const LOW_GC_FREQ: f64 = 0.45;

/// High GC frequency threshold.
pub const HIGH_GC_FREQ: f64 = 0.05;

/// Minimum weight clamping value.
pub const WEIGHT_CLAMP_MIN: f64 = -4.0;

/// Maximum weight clamping value.
pub const WEIGHT_CLAMP_MAX: f64 = 4.0;

/// High threshold for RBS weight evaluation.
pub const RBS_WEIGHT_THRESHOLD_HIGH: f64 = 1.0;

/// Low threshold for RBS weight evaluation.
pub const RBS_WEIGHT_THRESHOLD_LOW: f64 = -0.5;

/// Strong threshold for RBS weight evaluation.
pub const RBS_WEIGHT_STRONG_THRESHOLD: f64 = 2.0;

/// Number of codon types (ATG, GTG, TTG).
pub const NUM_CODON_TYPES: usize = 3;

/// Number of RBS weight patterns.
pub const NUM_RBS_WEIGHTS: usize = 28;

/// Number of nucleotide bases (A, T, G, C).
pub const NUM_BASES: usize = 4;

/// Number of upstream positions for analysis.
pub const UPSTREAM_POSITIONS: usize = 32;

/// Number of motif size categories.
pub const NUM_MOTIF_SIZES: usize = 4;

/// Maximum motif index value.
///
/// NOTE(review): 4096 is 4^6; presumably an exclusive upper bound on a
/// 6-base motif index rather than the largest valid index — confirm at the
/// indexing sites.
pub const MAX_MOTIF_INDEX: usize = 4096;

/// Maximum confidence score for gene predictions.
pub const MAX_CONFIDENCE_SCORE: f64 = 99.99;

/// Expected no-stop probability for genetic code analysis.
pub const EXPECTED_NO_STOP_PROB: f64 = 0.953;

/// Test sequence repeat factor for creating longer sequences.
pub const TEST_SEQUENCE_REPEAT_FACTOR: usize = 300;

/// Starting position for upstream analysis.
pub const UPSTREAM_START_POS: usize = 1;

/// Ending position for upstream analysis.
pub const UPSTREAM_END_POS: usize = 45;

/// GC content parameters for low AT frequency.
///
/// NOTE(review): the four `GC_*_FREQ` constants pair up as 0.90/0.10 and
/// 0.10/0.90; the "low"/"high" prefix presumably names the GC regime, not
/// the magnitude of the individual value — confirm against usage.
pub const GC_LOW_AT_FREQ: f64 = 0.90;

/// GC content parameters for low GC frequency.
pub const GC_LOW_GC_FREQ: f64 = 0.10;

/// GC content parameters for high AT frequency.
pub const GC_HIGH_AT_FREQ: f64 = 0.10;

/// GC content parameters for high GC frequency.
pub const GC_HIGH_GC_FREQ: f64 = 0.90;

// =============================================================================
// Bit masks and nucleotide lookup tables
// =============================================================================

/// Mask for lower bits in hamming distance calculations.
///
/// The pattern is binary `0101…`, i.e. the low bit of every 2-bit group.
pub const LOWER_BITS: u64 = 0x5555_5555_5555_5555;

/// Mask for upper bits in hamming distance calculations.
///
/// The pattern is binary `1010…`, the bitwise complement of `LOWER_BITS`.
pub const UPPER_BITS: u64 = 0xAAAA_AAAA_AAAA_AAAA;

/// Nucleotide lookup table for unpacking (bytes, ordered A, C, G, T).
///
/// NOTE(review): `NUCLEOTIDE_LETTERS` orders the same bases A, G, C, T;
/// confirm each table is only indexed with the encoding it was built for.
pub const NUCLEOTIDE_LOOKUP: [u8; 4] = [b'A', b'C', b'G', b'T'];

/// Character lookup for sequence display (chars, ordered A, G, C, T).
pub const NUCLEOTIDE_LETTERS: [char; 4] = ['A', 'G', 'C', 'T'];

// =============================================================================
// RBS motif annotation
// =============================================================================

/// Shine-Dalgarno motif descriptions for annotation.
///
/// Each entry is a `(motif, spacer)` pair of display strings. The table has
/// 28 entries, the same count as `NUM_RBS_WEIGHTS`; keep the two in sync.
pub const RBS_DESCRIPTIONS: [(&str, &str); 28] = [
    ("None", "None"),
    ("GGA/GAG/AGG", "3-4bp"),
    ("3Base/5BMM", "13-15bp"),
    ("4Base/6BMM", "13-15bp"),
    ("AGxAG", "11-12bp"),
    ("AGxAG", "3-4bp"),
    ("GGA/GAG/AGG", "11-12bp"),
    ("GGxGG", "11-12bp"),
    ("GGxGG", "3-4bp"),
    ("AGxAG", "5-10bp"),
    ("AGGAG(G)/GGAGG", "13-15bp"),
    ("AGGA/GGAG/GAGG", "3-4bp"),
    ("AGGA/GGAG/GAGG", "11-12bp"),
    ("GGA/GAG/AGG", "5-10bp"),
    ("GGxGG", "5-10bp"),
    ("AGGA", "5-10bp"),
    ("GGAG/GAGG", "5-10bp"),
    ("AGxAGG/AGGxGG", "11-12bp"),
    ("AGxAGG/AGGxGG", "3-4bp"),
    ("AGxAGG/AGGxGG", "5-10bp"),
    ("AGGAG/GGAGG", "11-12bp"),
    ("AGGAG", "3-4bp"),
    ("AGGAG", "5-10bp"),
    ("GGAGG", "3-4bp"),
    ("GGAGG", "5-10bp"),
    ("AGGAGG", "11-12bp"),
    ("AGGAGG", "3-4bp"),
    ("AGGAGG", "5-10bp"),
];

/// Default start weight factor used in training and scoring.
pub const DEFAULT_START_WEIGHT_FACTOR: f64 = 4.35;