grangers 0.5.0

A rust library for working with genomic ranges and annotations.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
use crate::grangers_utils::FIELDCOLUMNS;
use anyhow::bail;
use polars::prelude::DataFrame;
use std::collections::HashSet;
use tracing::warn;

/// Represents an inclusive genomic interval.
///
/// This structure is used to define a range on a genome, such as a gene, a regulatory element,
/// or any other genomic feature, where both the start and end positions are considered part of the interval.
///
/// # Fields
///
/// * `start`: The starting position of the interval (1-based index). This is the first position
///   included in the interval.
/// * `end`: The ending position of the interval (1-based index). This position is also included
///   in the interval.
///
/// # Notes
///
/// In bioinformatics, it's crucial to clarify whether intervals are 0-based or 1-based, as well
/// as whether they are inclusive or exclusive. This structure uses 1-based indexing and includes
/// both start and end positions, which is common in formats like GTF or GFF.
///
/// The struct itself performs no validation: nothing in this definition enforces
/// `start <= end` or `start >= 1`.
///
/// # Examples
///
/// Creating an inclusive interval representing the first 100 bases of a chromosome:
///
/// ```rust
/// let interval = InclusiveInterval { start: 1, end: 100 };
/// assert_eq!(interval.start, 1);
/// assert_eq!(interval.end, 100);
/// ```
pub struct InclusiveInterval {
    /// First position that belongs to the interval (1-based, inclusive).
    pub start: u64,
    /// Last position that belongs to the interval (1-based, inclusive).
    pub end: u64,
}

/// Orientation of a genomic feature.
///
/// The `Display` implementation of this type renders `Positive` as `+`
/// and `Negative` as `-`.
pub enum Strand {
    /// The forward strand, displayed as `+`.
    Positive,
    /// The reverse strand, displayed as `-`.
    Negative,
}

impl std::fmt::Display for Strand {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Strand::Positive => write!(f, "+"),
            Strand::Negative => write!(f, "-"),
        }
    }
}

#[derive(Copy, Clone)]
/// Configuration options for generating flanking regions around genomic intervals.
///
/// This structure is used to specify how flanking regions should be constructed relative to a given
/// genomic interval. Flanking regions can be important for various genomic analyses, including promoter
/// studies, regulatory element identification, and more.
///
/// # Fields
///
/// * `start`: If `true`, generate a flanking region at the start (5' end) of the genomic interval.
/// * `both`: If `true`, generate flanking regions at both ends of the genomic interval. If set to `true`,
///   this option overrides `start` to apply flanking regions to both ends.
/// * `ignore_strand`: If `true`, flanking regions are generated without considering the strand orientation
///   of the genomic interval. This can be useful when the strand information is irrelevant or unavailable.
///
/// # Notes
///
/// Flanking regions are additional sequences that lie adjacent to the main interval of interest. Depending
/// on the options set, these can be added to just one side of the interval, or both. Strand orientation can
/// affect the direction in which flanks are added (5' or 3' ends) unless ignored.
///
/// The struct only stores the three flags; how they are interpreted is decided by the code
/// that consumes it.
///
/// # Examples
///
/// Creating flank options to get regions only at the start of the interval, considering strand orientation:
///
/// ```rust
/// let flank_options = FlankOptions {
///     start: true,
///     both: false,
///     ignore_strand: false,
/// };
/// ```
///
/// Creating options to get flanking regions on both sides, ignoring strand:
///
/// ```rust
/// let flank_options = FlankOptions {
///     start: false, // Ignored due to `both` being `true`
///     both: true,
///     ignore_strand: true,
/// };
/// ```
pub struct FlankOptions {
    /// Whether to flank the start of the interval.
    pub start: bool,
    /// Whether to flank both ends of the interval.
    pub both: bool,
    /// Whether strand orientation is disregarded when flanking.
    pub ignore_strand: bool,
}

impl Default for FlankOptions {
    fn default() -> FlankOptions {
        FlankOptions {
            start: true,
            both: false,
            ignore_strand: false,
        }
    }
}

impl FlankOptions {
    /// Constructs a new `FlankOptions` instance with custom settings.
    ///
    /// Allows the user to specify whether to create flanks at the start, at both ends,
    /// and whether to consider the genomic strand orientation.
    ///
    /// # Arguments
    ///
    /// * `start`: If `true`, a flank will be generated at the start of the interval.
    /// * `both`: If `true`, flanks will be generated on both sides of the interval.
    /// * `ignore_strand`: If `true`, the flanking region will be generated without considering strand orientation.
    ///
    /// # Examples
    ///
    /// Creating a `FlankOptions` instance to generate flanking regions at both ends without strand consideration:
    ///
    /// ```rust
    /// let custom_options = FlankOptions::new(false, true, true);
    /// assert_eq!(custom_options.both, true);
    /// assert_eq!(custom_options.ignore_strand, true);
    /// ```
    ///
    /// This constructor allows for the dynamic creation of flank generation settings, facilitating tailored genomic data manipulation.
    pub fn new(start: bool, both: bool, ignore_strand: bool) -> FlankOptions {
        FlankOptions {
            start,
            both,
            ignore_strand,
        }
    }
}

/// Configuration options for merging genomic intervals.
///
/// This structure is used to define how genomic intervals should be merged based on specific criteria such as sequence name and strand,
/// while considering or ignoring certain attributes like strand orientation. Additionally, it allows setting a slack for merging intervals
/// that are not exactly adjacent but close enough to be considered part of the same feature.
///
/// # Fields
///
/// * `by`: A vector of strings representing the columns by which intervals should be merged.
/// * `slack`: The maximum distance between intervals that can still be considered for merging.
/// * `ignore_strand`: If `true`, intervals are merged without considering their strand orientation.
///
/// # Default
///
/// Implements the `Default` trait, providing a default set of options:
///
/// * `by`: Default to merging by "seqname" and "strand", which is common for genomic intervals.
/// * `slack`: Default slack of 1, allowing adjacent intervals to be merged.
/// * `ignore_strand`: Defaults to `false`, considering strand in merging process.
///
/// # Examples
///
/// Creating default merge options:
///
/// ```rust
/// let default_options = MergeOptions::default();
/// assert_eq!(default_options.by, vec!["seqname", "strand"]);
/// assert_eq!(default_options.slack, 1);
/// assert_eq!(default_options.ignore_strand, false);
/// ```
///
/// Creating custom merge options:
///
/// ```rust
/// let custom_options = MergeOptions::new(&["seqname"], true, 10)?;
/// assert!(custom_options.ignore_strand);
/// assert_eq!(custom_options.slack, 10);
/// ```
pub struct MergeOptions {
    /// Names of the columns that group intervals for merging.
    pub by: Vec<String>,
    /// Maximum distance between two intervals that are still merged.
    /// Signed, so non-positive values can be represented.
    pub slack: i64,
    /// Whether strand orientation is disregarded when merging.
    pub ignore_strand: bool,
}

impl Default for MergeOptions {
    fn default() -> MergeOptions {
        MergeOptions {
            by: vec![String::from("seqname"), String::from("strand")],
            slack: 1,
            ignore_strand: false,
        }
    }
}

impl MergeOptions {
    /// Creates a new `MergeOptions` instance with specified settings.
    ///
    /// The `by` columns are deduplicated while keeping the order in which they
    /// were first given, so the resulting `by` vector is deterministic.
    /// The mandatory grouping columns are then appended when missing:
    /// `strand` (unless `ignore_strand` is set) followed by `seqname`.
    ///
    /// # Arguments
    ///
    /// * `by`: Column names used for grouping intervals. Must not include
    ///   "start" or "end". Duplicates are dropped.
    /// * `ignore_strand`: If `true`, `strand` is removed from `by` (with a warning)
    ///   when present; if `false`, `strand` is added to `by` when absent.
    /// * `slack`: The distance within which intervals are considered adjacent and
    ///   thus mergeable. A value below 1 is accepted but triggers a warning.
    ///
    /// # Errors
    ///
    /// Returns an error if `by` contains "start" or "end", as these columns
    /// cannot be used for merging.
    ///
    /// # Examples
    ///
    /// ```rust
    /// let options = MergeOptions::new(&["gene_id", "transcript_id"], false, 5)?;
    /// // "strand" and "seqname" are appended because they were not provided.
    /// assert_eq!(
    ///     options.by,
    ///     vec!["gene_id", "transcript_id", "strand", "seqname"]
    /// );
    /// assert_eq!(options.slack, 5);
    /// assert!(!options.ignore_strand);
    /// ```
    pub fn new<T: AsRef<str>>(
        by: &[T],
        ignore_strand: bool,
        slack: i64,
    ) -> anyhow::Result<MergeOptions> {
        if slack < 1 {
            warn!("It usually doesn't make sense to set a non-positive slack.");
        }

        // `seen` tracks membership; `columns` keeps first-seen order so the
        // output does not depend on hash iteration order.
        let mut seen: HashSet<&str> = HashSet::with_capacity(by.len() + 2);
        let mut columns: Vec<String> = Vec::with_capacity(by.len() + 2);

        for item in by {
            let name: &str = item.as_ref();
            if name == "start" || name == "end" {
                bail!("The provided `by` vector cannot contain the start or end column");
            }
            if seen.insert(name) {
                columns.push(name.to_string());
            }
        }

        if ignore_strand {
            // Grouping by strand contradicts ignoring it, so drop the column.
            if seen.remove("strand") {
                columns.retain(|column| column != "strand");
                warn!("Remove `strand` from the provided `by` vector as the ignore_strand flag is set.");
            }
        } else if seen.insert("strand") {
            columns.push(String::from("strand"));
        }

        // The chromosome name is always required for grouping.
        if seen.insert("seqname") {
            columns.push(String::from("seqname"));
            warn!("Added `seqname` to the `by` vector as it is required.");
        }

        Ok(MergeOptions {
            by: columns,
            slack,
            ignore_strand,
        })
    }
}

#[derive(Clone, Copy, PartialEq, Eq)]
/// Options for extending a genomic feature.
///
/// This enum is used to specify how a genomic interval should be extended. It can be
/// extended towards the start, the end, or in both directions. This is commonly used
/// in genomic data processing and analysis where extending genomic features like genes
/// or regulatory elements is necessary for various computational tasks.
///
/// # Variants
///
/// * `Start`: Extend the genomic feature towards the start (5' direction).
/// * `End`: Extend the genomic feature towards the end (3' direction).
/// * `Both`: Extend the genomic feature on both sides, towards both start and end.
///
/// # Examples
///
/// Specifying extension towards the start of the feature:
///
/// ```rust
/// let extension_option = ExtendOption::Start;
/// ```
///
/// Specifying extension in both directions:
///
/// ```rust
/// let extension_option = ExtendOption::Both;
/// ```
///
/// These extension options can be used in various genomic data manipulation tasks, such as
/// extending regulatory regions, adjusting gene coordinates, or creating buffers around features for
/// downstream analysis.
pub enum ExtendOption {
    /// Extend the feature to the start
    Start,
    /// Extend the feature to the end
    End,
    /// Extend the feature to both sides
    Both,
}

/// Empty options type with no fields.
///
/// NOTE(review): presumably a placeholder for future sequence-extraction
/// settings; confirm against callers before relying on it.
pub struct GetSequenceOptions {}

#[derive(Clone, Copy, PartialEq, Eq)]
/// Options for handling out-of-bounds (OOB) genomic coordinates, i.e. out-of-boundary features.
///
/// This enumeration defines strategies for dealing with genomic features or intervals
/// that extend beyond the boundaries of a reference sequence, such as a chromosome or contig.
/// It's commonly used in genomic data processing to determine how to handle intervals
/// when extracting sequences or extending features beyond reference sequence limits.
///
/// # Variants
///
/// * `Truncate`: Adjust out-of-bounds intervals to fit within the available sequence range.
///   This can result in shorter sequences than originally specified but ensures that all
///   returned sequences are valid within the context of the reference.
/// * `Skip`: Ignore any intervals that extend beyond the boundaries of the reference sequence.
///   This can result in some data being omitted from the results but maintains the original
///   size and integrity of the remaining sequences.
///
/// # Examples
///
/// Choosing to truncate sequences that extend beyond the reference:
///
/// ```rust
/// let oob_option = OOBOption::Truncate;
/// ```
///
/// Choosing to skip any features that extend beyond the reference boundaries:
///
/// ```rust
/// let oob_option = OOBOption::Skip;
/// ```
///
/// These handling strategies are vital for ensuring that genomic data analyses remain robust
/// and adaptable to varying data qualities and reference sequence constraints.
pub enum OOBOption {
    /// Clip out-of-bounds intervals so they fit within the reference sequence.
    Truncate,
    /// Drop intervals that extend beyond the reference sequence.
    Skip,
}

/// Structure representing the columns of genomic feature data, commonly found in GTF/GFF files.
///
/// This structure contains fields that correspond to standard column names in GTF or GFF format files,
/// facilitating the mapping between the file's columns and the expected fields used in genomic data processing.
/// The fields in this struct are utilized by various Grangers methods to interpret the genomic feature information correctly.
///
/// It is designed according to the GTF/GFF format:
/// <https://useast.ensembl.org/info/website/upload/gff.html>.
/// The default values are the column names used in GTF/GFF files.
/// This will be used in almost all Grangers methods.
///
/// # Fields
///
/// - `seqname`: Corresponds to the reference sequence name column in GTF/GFF (required).
/// - `source`: Source of the feature, such as database or algorithm used (optional).
/// - `feature_type`: Type of genomic feature (e.g., gene, transcript, exon) (optional).
/// - `start`: Start position of the feature in the reference sequence (required).
/// - `end`: End position of the feature in the reference sequence (required).
/// - `score`: Score or value associated with the feature (optional).
/// - `strand`: Strand of the feature ('+' or '-') (required).
/// - `phase`: Phase or frame for coding features (optional).
/// - `gene_id`: Identifier for the gene associated with the feature (optional).
/// - `gene_name`: Name of the gene associated with the feature (optional).
/// - `transcript_id`: Identifier for the transcript associated with the feature (optional).
/// - `exon_number`: Order or number of the exon within its transcript (optional).
///
/// Each field holds the *name of a column*, not the column's data. Required fields are
/// plain `String`s; optional fields are `Option<String>`, where `None` means no column
/// is configured for that role.
///
/// # Examples
///
/// Creating default `FieldColumns` for standard GTF/GFF processing:
///
/// ```rust
/// let field_columns = FieldColumns::default();
/// ```
///
/// Customizing `FieldColumns` for specific data processing needs:
///
/// ```rust
/// let mut custom_fields = FieldColumns::default();
/// custom_fields.gene_id = Some("gene_ident".to_string());
/// custom_fields.exon_number = Some("exon_order".to_string());
/// ```
///
/// The default implementations adhere to typical conventions, but they can be adjusted as needed
/// to fit the specific layout and requirements of the input data files.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct FieldColumns {
    /// the name of the reference sequence column, usually it is called "seqname".\
    /// This corresponds to the first column in a GTF/GFF file.\
    /// This field is required, and the corresponding column should not contain missing values (nulls).\
    /// You can drop all rows with missing values in this column by calling `df.drop_nulls(Some(["seqname"]))`.
    pub seqname: String,
    /// the name of the source column, usually it is called "source".
    /// This is the second column in a GTF/GFF file. It records the source (HAVANA, ENSEMBL, etc) of the feature.
    pub source: Option<String>,
    /// The name of the feature type column whose values should be "gene", "transcript" and "exon", etc.
    /// This should be the third column in a GTF/GFF file.
    /// This field is required for transcriptome related methods, like `introns()`, `get_transcript_sequences()`, etc.
    pub feature_type: Option<String>,
    /// The column name of the start position column, usually it is called "start".
    /// This is the fourth column in a GTF/GFF file.
    /// This field is required, and the corresponding column should not contain missing values (nulls).
    pub start: String,
    /// The column name of the end position column, usually it is called "end".
    /// This is the fifth column in a GTF/GFF file.
    /// This field is required, and the corresponding column should not contain missing values (nulls).
    pub end: String,
    /// The column name of the score column, usually it is called "score".
    /// This is the sixth column in a GTF/GFF file.
    pub score: Option<String>,
    /// The column name of the strand column, usually it is called "strand".
    /// This is the seventh column in a GTF/GFF file.\
    /// This field is required, and the corresponding column should not contain missing values (nulls) for calling most Grangers methods.\
    /// If this field is missing, you can add one by calling `df.update_column("strand", vec!['.'; df.height()])`.\
    /// If it contains missing values, you can fill them by calling `df.fill_none("strand", '.')`.
    pub strand: String,
    /// The column name of the phase column, usually it is called "frame" or "phase".
    /// This is the eighth column in a GTF/GFF file.
    pub phase: Option<String>,
    /// The column name of the gene_id column, usually it is called "gene_id".
    pub gene_id: Option<String>,
    /// The column name of the gene_name column, usually it is called "gene_name".
    pub gene_name: Option<String>,
    /// The column name of the transcript ID column, usually it is called "transcript_id".
    pub transcript_id: Option<String>,
    // The column name of the exon ID column, usually it is called "exon_id".
    // pub exon_id: Option<String>,
    /// The column name of the exon number (order) column, usually it is called "exon_number".
    /// This column is used to sort the exons of a transcript.
    /// If this column is missing, the exons will be sorted by their start positions.
    pub exon_number: Option<String>,
}

/// Borrowing accessors for the configured column names.
///
/// Required columns are returned as `&str`; optional columns are returned as
/// `Option<&str>`, which is `None` when no column is configured for that role.
impl FieldColumns {
    /// Name of the reference sequence (chromosome/contig) column.
    ///
    /// In a GTF/GFF file this is the first column.
    ///
    /// # Returns
    /// The configured `seqname` column name.
    pub fn seqname(&self) -> &str {
        &self.seqname
    }

    /// Name of the column recording where a feature came from.
    ///
    /// In a GTF/GFF file this is the second column.
    ///
    /// # Returns
    /// The configured `source` column name, or `None` if unset.
    pub fn source(&self) -> Option<&str> {
        self.source.as_deref()
    }

    /// Name of the column holding the feature type (gene, transcript, exon, ...).
    ///
    /// In a GTF/GFF file this is the third column.
    ///
    /// # Returns
    /// The configured `feature_type` column name, or `None` if unset.
    pub fn feature_type(&self) -> Option<&str> {
        self.feature_type.as_deref()
    }

    /// Name of the column holding the start position of a feature.
    ///
    /// In a GTF/GFF file this is the fourth column.
    ///
    /// # Returns
    /// The configured `start` column name.
    pub fn start(&self) -> &str {
        &self.start
    }

    /// Name of the column holding the end position of a feature.
    ///
    /// In a GTF/GFF file this is the fifth column.
    ///
    /// # Returns
    /// The configured `end` column name.
    pub fn end(&self) -> &str {
        &self.end
    }

    /// Name of the column holding a score associated with a feature.
    ///
    /// # Returns
    /// The configured `score` column name, or `None` if unset.
    pub fn score(&self) -> Option<&str> {
        self.score.as_deref()
    }

    /// Name of the column holding the strand of a feature.
    ///
    /// In a GTF/GFF file this is the seventh column.
    ///
    /// # Returns
    /// The configured `strand` column name.
    pub fn strand(&self) -> &str {
        &self.strand
    }

    /// Name of the column holding the phase (also called frame) of a feature.
    ///
    /// In a GTF/GFF file this is the eighth column.
    ///
    /// # Returns
    /// The configured `phase` column name, or `None` if unset.
    pub fn phase(&self) -> Option<&str> {
        self.phase.as_deref()
    }

    /// Name of the column holding gene identifiers.
    ///
    /// # Returns
    /// The configured `gene_id` column name, or `None` if unset.
    pub fn gene_id(&self) -> Option<&str> {
        self.gene_id.as_deref()
    }

    /// Name of the column holding gene names.
    ///
    /// # Returns
    /// The configured `gene_name` column name, or `None` if unset.
    pub fn gene_name(&self) -> Option<&str> {
        self.gene_name.as_deref()
    }

    /// Name of the column holding transcript identifiers.
    ///
    /// # Returns
    /// The configured `transcript_id` column name, or `None` if unset.
    pub fn transcript_id(&self) -> Option<&str> {
        self.transcript_id.as_deref()
    }

    // get a reference to the exon_id field
    // pub fn exon_id(&self) -> Option<&str> {
    //     self.exon_id.as_deref()
    // }

    /// Name of the column holding the order of an exon within its transcript.
    ///
    /// # Examples
    ///
    /// ```
    /// // The default configuration names this column "exon_number".
    /// let mut field_columns = FieldColumns::default();
    /// assert_eq!(field_columns.exon_number(), Some("exon_number"));
    ///
    /// // The field is public and can be pointed at another column or cleared.
    /// field_columns.exon_number = Some("my_exon_number".to_string());
    /// assert_eq!(field_columns.exon_number(), Some("my_exon_number"));
    ///
    /// field_columns.exon_number = None;
    /// assert!(field_columns.exon_number().is_none());
    /// ```
    ///
    /// # Returns
    ///
    /// The configured `exon_number` column name, or `None` if unset.
    pub fn exon_number(&self) -> Option<&str> {
        self.exon_number.as_deref()
    }
}

impl Default for FieldColumns {
    /// Default column configuration.
    ///
    /// Every field, required or optional, is set to a column named exactly
    /// like the field itself (for example `gene_id` is `Some("gene_id")`).
    fn default() -> Self {
        // All optional columns are configured by default.
        let optional = |name: &str| Some(String::from(name));

        Self {
            seqname: String::from("seqname"),
            source: optional("source"),
            feature_type: optional("feature_type"),
            start: String::from("start"),
            end: String::from("end"),
            score: optional("score"),
            strand: String::from("strand"),
            phase: optional("phase"),
            gene_id: optional("gene_id"),
            gene_name: optional("gene_name"),
            transcript_id: optional("transcript_id"),
            exon_number: optional("exon_number"),
        }
    }
}

impl FieldColumns {
    /// Returns the configured column names of all optional fields.
    ///
    /// The array is ordered as `source`, `feature_type`, `score`, `phase`,
    /// `gene_id`, `gene_name`, `transcript_id`, `exon_number`.
    ///
    /// # Returns
    ///
    /// An array of `Option<&str>`; an entry is `None` when the corresponding
    /// field is unset.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// let optional_fields = field_columns.optional_fields();
    /// println!("{:?}", optional_fields);
    /// ```
    pub fn optional_fields(&self) -> [Option<&str>; 8] {
        // The trailing four entries are exactly the GTF attribute fields.
        let [gene_id, gene_name, transcript_id, exon_number] = self.gtf_attributes();
        [
            self.source(),
            self.feature_type(),
            self.score(),
            self.phase(),
            gene_id,
            gene_name,
            transcript_id,
            exon_number,
        ]
    }
    /// Returns the configured column names for the eight fixed GTF fields.
    ///
    /// The array is ordered as `seqname`, `source`, `feature_type`, `start`,
    /// `end`, `score`, `strand`, `phase`. An optional field that is unset is
    /// reported as the empty string `""`.
    ///
    /// # Returns
    ///
    /// An array of `&str`, one entry per GTF field.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// let gtf_fields = field_columns.gtf_fields();
    /// println!("{:?}", gtf_fields);
    /// ```
    pub fn gtf_fields(&self) -> [&str; 8] {
        // The four required columns come straight from `essential_fields`.
        let [seqname, start, end, strand] = self.essential_fields();
        [
            seqname,
            // `&str::default()` is "", matching the empty-string placeholder.
            self.source().unwrap_or_default(),
            self.feature_type().unwrap_or_default(),
            start,
            end,
            self.score().unwrap_or_default(),
            strand,
            self.phase().unwrap_or_default(),
        ]
    }

    /// Returns the configured column names of the GTF attribute fields.
    ///
    /// The array is ordered as `gene_id`, `gene_name`, `transcript_id`,
    /// `exon_number`.
    ///
    /// # Returns
    ///
    /// An array of `Option<&str>`; an entry is `None` when the corresponding
    /// attribute field is unset.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// let gtf_attributes = field_columns.gtf_attributes();
    /// println!("{:?}", gtf_attributes);
    /// ```
    pub fn gtf_attributes(&self) -> [Option<&str>; 4] {
        let gene_id = self.gene_id();
        let gene_name = self.gene_name();
        let transcript_id = self.transcript_id();
        let exon_number = self.exon_number();
        [gene_id, gene_name, transcript_id, exon_number]
    }
    /// Returns the configured column names of the four required fields.
    ///
    /// The array is ordered as `seqname`, `start`, `end`, `strand`.
    ///
    /// # Returns
    ///
    /// An array of `&str`, one entry per required field.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// let essential_fields = field_columns.essential_fields();
    /// println!("{:?}", essential_fields);
    /// ```
    pub fn essential_fields(&self) -> [&str; 4] {
        let seqname = self.seqname();
        let start = self.start();
        let end = self.end();
        let strand = self.strand();
        [seqname, start, end, strand]
    }

    /// Validates the `FieldColumns` against a given DataFrame.
    ///
    /// Every configured column name is looked up in `df`: the four required
    /// fields (`seqname`, `start`, `end`, `strand`) and every optional field
    /// that is currently set. A single missing column, required or optional,
    /// makes the instance invalid.
    ///
    /// # Arguments
    ///
    /// * `df` - The DataFrame against which to validate the `FieldColumns`.
    /// * `is_warn` - If `true`, log a warning for each missing column and a
    ///   final summary warning when the instance is invalid.
    /// * `is_bail` - If `true`, return an error instead of `Ok(false)` when
    ///   the instance is invalid.
    ///
    /// # Returns
    ///
    /// `Ok(true)` when every configured column exists in `df`, `Ok(false)`
    /// when at least one is missing and `is_bail` is `false`.
    ///
    /// # Errors
    ///
    /// Returns an error when at least one configured column is missing and
    /// `is_bail` is `true`.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// let df = DataFrame::new(vec![])?;
    /// let is_valid = field_columns.is_valid(&df, true, false)?;
    /// println!("Is valid: {}", is_valid);
    /// ```
    pub fn is_valid(&self, df: &DataFrame, is_warn: bool, is_bail: bool) -> anyhow::Result<bool> {
        let mut is_valid = true;

        // Required fields: (field name, configured column name).
        let required = [
            ("seqname", self.seqname()),
            ("start", self.start()),
            ("end", self.end()),
            ("strand", self.strand()),
        ];
        for (name, column) in required {
            if df.column(column).is_err() {
                is_valid = false;
                if is_warn {
                    warn!(
                        "The dataframe does not contain the specified {} column {}; Cannot proceed. You can add one by calling `df.update_column(\"{}\", vec!['.'; df.height()])`",
                        name, column, name
                    );
                }
            }
        }

        // Optional fields are only checked when a column name is configured.
        let optional = [
            ("source", self.source()),
            ("feature_type", self.feature_type()),
            ("score", self.score()),
            ("phase", self.phase()),
            ("gene_id", self.gene_id()),
            ("gene_name", self.gene_name()),
            ("transcript_id", self.transcript_id()),
            ("exon_number", self.exon_number()),
        ];
        for (name, column) in optional {
            if let Some(column) = column {
                if df.column(column).is_err() {
                    is_valid = false;
                    if is_warn {
                        warn!(
                            "The provided {} column {} is not found in the dataframe; It will be ignored",
                            name, column
                        );
                    }
                }
            }
        }

        // Bailing takes precedence over the summary warning.
        if !is_valid && is_bail {
            bail!(
                "The FieldColumns is not valid; Please try fix it by calling FieldColumns::fix()."
            );
        }

        if !is_valid && is_warn {
            warn!(
                "The FieldColumns is not valid; Please try fix it by calling FieldColumns::fix()."
            );
        }

        Ok(is_valid)
    }

    /// Attempts to correct any missing or incorrect field mappings based on a provided DataFrame.
    ///
    /// This method attempts to fix the FieldColumns instance by ensuring that all required and optional
    /// fields have corresponding columns in the given DataFrame. If a necessary field is missing or
    /// incorrect in FieldColumns but exists in the DataFrame under a standard name (like "seqname" for
    /// the sequence name), this method updates the FieldColumns instance to use the correct column name.
    /// If a standard column name does not exist in the DataFrame, an error is returned.
    ///
    /// # Arguments
    ///
    /// * `df` - The DataFrame against which to validate and fix the FieldColumns.
    /// * `is_warn` - If true, the method will log a warning message for each issue it encounters and attempts to fix.
    ///
    /// # Returns
    ///
    /// `Ok(())` if the FieldColumns instance was successfully fixed or if no fixes were needed.
    /// `Err(anyhow::Error)` if a required field could not be fixed because the corresponding column
    /// does not exist in the DataFrame.
    ///
    /// # Examples
    ///
    /// ```
    /// let mut field_columns = FieldColumns::default();
    /// let df = DataFrame::new(vec![])?; // Assume this is a populated DataFrame
    /// field_columns.fix(&df, true)?;
    /// ```
    pub fn fix(&mut self, df: &DataFrame, is_warn: bool) -> anyhow::Result<()> {
        // try fix required fields
        if df.column(self.seqname()).is_err() {
            if is_warn {
                warn!(
                    "cannot find the specified seqname column {} in the dataframe; try to fix",
                    self.seqname()
                );
            }
            if df.column("seqname").is_ok() {
                self.seqname = "seqname".to_string();
            } else {
                bail!("The dataframe does not contain the specified seqname column {} or a column named \"seqname\"; Cannot fix.", self.seqname());
            }
        }
        if df.column(self.start()).is_err() {
            if is_warn {
                warn!(
                    "cannot find the specified start column {} in the dataframe; try to fix",
                    self.start()
                );
            }
            if df.column("start").is_ok() {
                self.start = "start".to_string();
            } else {
                bail!("The dataframe does not contain the specified start column {} or a column named \"start\"; Cannot fix.", self.start());
            }
        }
        if df.column(self.end()).is_err() {
            if is_warn {
                warn!(
                    "cannot find the specified end column {} in the dataframe; try to fix",
                    self.end()
                );
            }
            if df.column("end").is_ok() {
                self.end = "end".to_string();
            } else {
                bail!("The dataframe does not contain the specified end column {} or a column named \"end\"; Cannot fix.", self.end());
            }
        }
        if df.column(self.strand()).is_err() {
            if is_warn {
                warn!(
                    "cannot find the specified strand column {} in the dataframe; try to fix",
                    self.strand()
                );
            }
            if df.column("strand").is_ok() {
                self.strand = "strand".to_string();
            } else {
                bail!("The dataframe does not contain the specified strand column {} or a column named \"strand\"; Cannot fix. If this is desired, you can add a dummy strand column by calling `df.update_column(\"strand\", vec!['.'; df.height()])`", self.strand());
            }
        }

        // try fix optional fields
        if let Some(s) = self.source() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!(
                        "cannot find the specified source column {} in the dataframe; try to fix",
                        s
                    );
                }
                self.source = if df.column("source").is_ok() {
                    Some("source".to_string())
                } else {
                    None
                }
            }
        }
        if let Some(s) = self.feature_type() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!("cannot find the specified feature_type column {} in the dataframe; try to fix", s);
                }
                self.feature_type = if df.column("feature_type").is_ok() {
                    Some("feature_type".to_string())
                } else {
                    None
                }
            }
        }
        if let Some(s) = self.score() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!(
                        "cannot find the specified score column {} in the dataframe; try to fix",
                        s
                    );
                }
                self.score = if df.column("score").is_ok() {
                    Some("score".to_string())
                } else {
                    None
                }
            }
        }
        if let Some(s) = self.phase() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!(
                        "cannot find the specified phase column {} in the dataframe; try to fix",
                        s
                    );
                }
                self.phase = if df.column("phase").is_ok() {
                    Some("phase".to_string())
                } else {
                    None
                }
            }
        }
        if let Some(s) = self.gene_id() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!(
                        "cannot find the specified gene_id column {} in the dataframe; try to fix",
                        s
                    );
                }
                self.gene_id = if df.column("gene_id").is_ok() {
                    Some("gene_id".to_string())
                } else {
                    None
                }
            }
        }

        if let Some(s) = self.gene_name() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!(
                        "cannot find the specified gene_name column {} in the dataframe; try to fix",
                        s
                    );
                }
                self.gene_name = if df.column("gene_name").is_ok() {
                    Some("gene_name".to_string())
                } else {
                    None
                }
            }
        }

        if let Some(s) = self.transcript_id() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!("cannot find the specified transcript_id column {} in the dataframe; try to fix", s);
                }
                self.transcript_id = if df.column("transcript_id").is_ok() {
                    Some("transcript_id".to_string())
                } else {
                    None
                }
            }
        }

        if let Some(s) = self.exon_number() {
            if df.column(s).is_err() {
                if is_warn {
                    warn!("cannot find the specified exon_number column {} in the dataframe; try to fix", s);
                }
                self.exon_number = if df.column("exon_number").is_ok() {
                    Some("exon_number".to_string())
                } else {
                    None
                }
            }
        }

        Ok(())
    }

    /// Sets the column name mapped to one field of the `FieldColumns`.
    ///
    /// Required fields (`seqname`, `start`, `end`, `strand`) are overwritten
    /// with `value`; optional fields are set to `Some(value)`.
    ///
    /// # Arguments
    ///
    /// * `field` - The name of the field to update (e.g., "seqname", "start").
    /// * `value` - The new column name for the field. It must have the same
    ///   type `T` as `field`.
    ///
    /// # Returns
    ///
    /// * `Ok(())` if the field was successfully updated.
    /// * `Err(anyhow::Error)` if `field` is not one of the twelve known field names.
    ///
    /// # Examples
    ///
    /// ```
    /// let mut field_columns = FieldColumns::default();
    /// field_columns.update("seqname", "chromosome")?;
    /// assert_eq!(field_columns.seqname(), "chromosome");
    /// ```
    pub fn update<T: AsRef<str>>(&mut self, field: T, value: T) -> anyhow::Result<()> {
        let key = field.as_ref();
        let new_column = value.as_ref().to_owned();

        // Required fields store a plain `String`.
        let required_slot = match key {
            "seqname" => Some(&mut self.seqname),
            "start" => Some(&mut self.start),
            "end" => Some(&mut self.end),
            "strand" => Some(&mut self.strand),
            _ => None,
        };
        if let Some(slot) = required_slot {
            *slot = new_column;
            return Ok(());
        }

        // Everything else must be an optional field, stored as `Option<String>`.
        let optional_slot = match key {
            "source" => &mut self.source,
            "feature_type" => &mut self.feature_type,
            "score" => &mut self.score,
            "phase" => &mut self.phase,
            "gene_id" => &mut self.gene_id,
            "gene_name" => &mut self.gene_name,
            "transcript_id" => &mut self.transcript_id,
            "exon_number" => &mut self.exon_number,
            _ => bail!("invalid field name: {}", key),
        };
        *optional_slot = Some(new_column);

        Ok(())
    }

    /// Looks up the column name mapped to a field, by field name.
    ///
    /// # Arguments
    ///
    /// * `field` - The name of the field to retrieve.
    ///
    /// # Returns
    ///
    /// `Some(&str)` with the configured column name. `None` is returned both
    /// when `field` is not a known field name and when it names an optional
    /// field that is currently unset.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// assert_eq!(field_columns.field("seqname"), Some("seqname"));
    /// assert_eq!(field_columns.field("nonexistent"), None);
    /// ```
    pub fn field<T: AsRef<str>>(&self, field: T) -> Option<&str> {
        let key = field.as_ref();

        // Required fields always have a value.
        let required = match key {
            "seqname" => Some(&self.seqname),
            "start" => Some(&self.start),
            "end" => Some(&self.end),
            "strand" => Some(&self.strand),
            _ => None,
        };
        if let Some(column) = required {
            return Some(column.as_str());
        }

        // Optional fields defer to their getters; unknown names yield `None`.
        match key {
            "source" => self.source(),
            "feature_type" => self.feature_type(),
            "score" => self.score(),
            "phase" => self.phase(),
            "gene_id" => self.gene_id(),
            "gene_name" => self.gene_name(),
            "transcript_id" => self.transcript_id(),
            "exon_number" => self.exon_number(),
            _ => None,
        }
    }

    /// Looks up the column name mapped to a field, reporting unknown field names.
    ///
    /// Behaves like `field` for known names. For an unknown name it either
    /// returns an error or logs a warning, depending on `is_bail`.
    ///
    /// # Arguments
    ///
    /// * `field` - The name of the field to retrieve.
    /// * `is_bail` - Whether to return an error (instead of warning) when
    ///   `field` is not a known field name.
    ///
    /// # Returns
    ///
    /// * `Ok(Some(&str))` if the field is known and has a column configured.
    /// * `Ok(None)` if the field is a known optional field that is unset, or
    ///   if the field is unknown and `is_bail` is `false` (a warning is logged).
    /// * `Err(anyhow::Error)` if the field is unknown and `is_bail` is `true`.
    ///
    /// # Examples
    ///
    /// ```
    /// let field_columns = FieldColumns::default();
    /// assert_eq!(field_columns.field_checked("seqname", false).unwrap(), Some("seqname"));
    /// assert!(field_columns.field_checked("nonexistent", true).is_err());
    /// ```
    pub fn field_checked<T: AsRef<str>>(
        &self,
        field: T,
        is_bail: bool,
    ) -> anyhow::Result<Option<&str>> {
        let key = field.as_ref();
        let column = match key {
            "seqname" => Some(self.seqname.as_str()),
            "source" => self.source(),
            "feature_type" => self.feature_type(),
            "start" => Some(self.start.as_str()),
            "end" => Some(self.end.as_str()),
            "score" => self.score(),
            "strand" => Some(self.strand.as_str()),
            "phase" => self.phase(),
            "gene_id" => self.gene_id(),
            "gene_name" => self.gene_name(),
            "transcript_id" => self.transcript_id(),
            "exon_number" => self.exon_number(),
            // Unknown name: fail hard when asked to, otherwise warn and ignore.
            _ if is_bail => bail!(
                "The provided field {} is not a valid field name; Cannot proceed",
                key
            ),
            _ => {
                warn!(
                    "The provided field {} is not a valid field name; It will be ignored",
                    key
                );
                None
            }
        };
        Ok(column)
    }

    /// Returns the `FIELDCOLUMNS` array, a fixed list of twelve `&'static str` entries.
    ///
    /// This is an associated function and does not depend on any instance state.
    ///
    /// NOTE(review): `FIELDCOLUMNS` is defined outside this section; presumably it
    /// lists the twelve field names accepted by `update`/`field` — confirm against
    /// its definition.
    pub fn all_fields() -> [&'static str; 12] {
        FIELDCOLUMNS
    }
}