// Pattern index built on `LtFmIndex` from the `lt_fm_index` crate.
use thiserror::Error;

use crate::utils::get_unique_characters_of_sequence;
use sigalign_core::reference::PatternIndex;
use lt_fm_index::{
    LtFmIndex, Block, blocks,
};

// Concrete `Lfi32` aliases, one per `lt_fm_index` block type.
// Naming: `B<n>` is the `blocks::Block<n>` type used, `V64` is its `u64` type parameter.

/// [`Lfi32`] backed by `blocks::Block2<u64>`.
pub type Lfi32B2V64 = Lfi32<blocks::Block2<u64>>;
/// [`Lfi32`] backed by `blocks::Block3<u64>`.
pub type Lfi32B3V64 = Lfi32<blocks::Block3<u64>>;
/// [`Lfi32`] backed by `blocks::Block4<u64>`.
pub type Lfi32B4V64 = Lfi32<blocks::Block4<u64>>;
/// [`Lfi32`] backed by `blocks::Block5<u64>`.
pub type Lfi32B5V64 = Lfi32<blocks::Block5<u64>>;

/// Pattern index that wraps an `LtFmIndex` whose first type parameter is `u32`.
///
/// `B` is the block type of the wrapped index; it must implement `Block<u32>`.
pub struct Lfi32<B: Block<u32>> {
    /// The wrapped `lt_fm_index` index; the trait impls in this file delegate to it.
    inner: LtFmIndex<u32, B>,
}

/// Build options consumed when constructing an [`Lfi32`] index.
#[derive(Debug, Clone)]
pub struct LfiOption {
    /// Sampling ratio of the suffix array handed to the `LtFmIndex` builder.
    pub suffix_array_sampling_ratio: u64,
    /// Byte budget used to choose the k-mer size of the lookup table.
    pub lookup_table_max_bytes_size: u64,
    /// When `false`, the last of the sequence's unique characters is left out
    /// of the indexed character set.
    pub use_safe_guard: bool,
}

impl LfiOption {
    /// Creates an option set; the arguments map one-to-one onto the fields of
    /// the same name.
    pub fn new(
        suffix_array_sampling_ratio: u64,
        lookup_table_max_bytes_size: u64,
        use_safe_guard: bool,
    ) -> Self {
        LfiOption {
            suffix_array_sampling_ratio,
            lookup_table_max_bytes_size,
            use_safe_guard,
        }
    }
}

impl <B: Block<u32>> PatternIndex for Lfi32<B> {
    type Option = LfiOption;
    type BuildError = LfiBuildError;
    
    fn new(concatenated_sequence : Vec<u8>, option: Self::Option) -> Result<Self, Self::BuildError> {
        let unique_sequence = get_unique_characters_of_sequence(&concatenated_sequence);
        let mut valid_characters: Vec<Vec<u8>> = unique_sequence.into_iter().map(|v| vec![v]).collect();
        if !option.use_safe_guard {
            valid_characters.pop(); // Remove last character
        }
        let characters_by_index: Vec<&[u8]> = valid_characters.iter()
            .map(|v| v.as_slice())
            .collect();
        if characters_by_index.len() as u32 > B::MAX_CHR {
            let err: LfiBuildError = Self::BuildError::OverMaximumCharacters {
                max: B::MAX_CHR,
                input: characters_by_index.len() as u32,
            };
            return Err(err);
        }

        let sequence_length = concatenated_sequence.len();
        if sequence_length >= u32::MAX as usize {
            return Err(Self::BuildError::SequenceLengthOver(u32::MAX as u64));
        }
        let lookup_table_kmer_size = calculate_lookup_table_kmer_size(
            characters_by_index.len(),
            option.lookup_table_max_bytes_size as usize,
        );

        match LtFmIndex::build(
            concatenated_sequence,
            &characters_by_index,
            option.suffix_array_sampling_ratio as u32,
            lookup_table_kmer_size,
        ) {
            Ok(v) => Ok(Self { inner: v }),
            Err(err) => Err(Self::BuildError::InvalidOption(format!("{}", err))),
        }
    }
    fn get_sorted_positions(&self, pattern: &[u8]) -> Vec<u32> {
        let mut positions = self.inner.locate(pattern);
        positions.sort_unstable();
        positions
    }
}

/// Picks the lookup-table k-mer size for a given byte budget.
///
/// The table size for k-mer size `k` is estimated as `(chr_count + 1)^k`.
/// The result is one less than the first `k` (starting from 1) whose estimate
/// reaches `maximum_bytes_size`, so it is `0` when even `k = 1` reaches the
/// budget. If no `k` up to 50 reaches the budget, 50 is returned.
///
/// An estimate that does not fit in `usize` is treated as reaching the budget,
/// so large budgets neither panic on overflow nor wrap around.
fn calculate_lookup_table_kmer_size(
    chr_count: usize,
    maximum_bytes_size: usize,
) -> u32 {
    const MAX_KMER_SIZE: u32 = 50;

    let base = chr_count.saturating_add(1);
    (1..=MAX_KMER_SIZE)
        .find(|&kmer_size| match base.checked_pow(kmer_size) {
            Some(estimated_byte_size_of_lt) => estimated_byte_size_of_lt >= maximum_bytes_size,
            // Overflow: the real value is above `usize::MAX`, hence above any budget.
            None => true,
        })
        .map_or(MAX_KMER_SIZE, |kmer_size| kmer_size - 1)
}

/// Errors that building an `Lfi32` index can return.
#[derive(Debug, Error)]
pub enum LfiBuildError {
    /// Triggered when sequence length exceeds the maximum allowable capacity.
    /// The payload is the capacity shown in the message.
    #[error("Sequence length is over the maximum capacity {0}")]
    SequenceLengthOver(u64),
    /// Triggered when input characters exceed the maximum limit that the `PatternIndex` can index.
    #[error("Pattern index can make index of {max} characters, input is {input}")]
    OverMaximumCharacters{
        max: u32,    // The maximum number of characters that the `PatternIndex` can index
        input: u32,  // The number of characters in the input
    },
    /// Triggered when an invalid option is passed.
    /// The payload is the error message text.
    #[error("Error in option: {0}")]
    InvalidOption(String), // Error message
}

// Impl Extensions
use sigalign_core::reference::extensions::{
    Serialize,
    EstimateSize,
};
//  - Serialize
/// Serialization of `Lfi32`, delegated entirely to the wrapped `LtFmIndex`.
impl<B: Block<u32>> Serialize for Lfi32<B> {
    /// Writes the wrapped index to `writer` through its own `save_to`.
    ///
    /// Any error from that call is propagated.
    fn save_to<W>(&self, mut writer: W) -> Result<(), std::io::Error> where
        W: std::io::Write
    {
        self.inner.save_to(&mut writer)?;
        Ok(())
    }
    /// Reads an `LtFmIndex` from `reader` with `LtFmIndex::load_from` and wraps it.
    ///
    /// Any error from that call is propagated.
    fn load_from<R>(mut reader: R) -> Result<Self, std::io::Error> where
        R: std::io::Read,
        Self: Sized
    {
        let inner = LtFmIndex::load_from(&mut reader)?;
        Ok(Self { inner })
    }
}
//  - EstimateSize
/// Size estimation of `Lfi32`, delegated to the wrapped `LtFmIndex`.
impl<B: Block<u32>> EstimateSize for Lfi32<B> {
    /// Returns the wrapped index's `to_be_saved_size()`.
    // NOTE(review): presumably the number of bytes `save_to` writes; confirm against `lt_fm_index`.
    fn serialized_size(&self) -> usize {
        self.inner.to_be_saved_size()
    }
}