lumosai_vector_fastembed/
models.rs

//! FastEmbed model definitions and metadata.
2
3use serde::{Deserialize, Serialize};
4
5/// Available FastEmbed models with their configurations
6#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
7pub enum FastEmbedModel {
8    /// BGE Small English v1.5 (384 dimensions)
9    /// Best for: General purpose, fast inference
10    BGESmallENV15,
11    
12    /// BGE Base English v1.5 (768 dimensions)
13    /// Best for: Balanced performance and quality
14    BGEBaseENV15,
15    
16    /// BGE Large English v1.5 (1024 dimensions)
17    /// Best for: High quality embeddings, slower inference
18    BGELargeENV15,
19    
20    /// All MiniLM L6 v2 (384 dimensions)
21    /// Best for: Fast inference, good for semantic search
22    AllMiniLML6V2,
23    
24    /// All MiniLM L12 v2 (384 dimensions)
25    /// Best for: Better quality than L6, still fast
26    AllMiniLML12V2,
27    
28    /// Multilingual E5 Small (384 dimensions)
29    /// Best for: Multilingual applications, 100+ languages
30    MultilingualE5Small,
31    
32    /// Multilingual E5 Base (768 dimensions)
33    /// Best for: High-quality multilingual embeddings
34    MultilingualE5Base,
35    
36    /// Multilingual E5 Large (1024 dimensions)
37    /// Best for: Best multilingual quality, slower inference
38    MultilingualE5Large,
39    
40    /// Custom model with name and dimensions
41    Custom {
42        name: String,
43        dimensions: usize,
44        max_sequence_length: Option<usize>,
45    },
46}
47
48impl FastEmbedModel {
49    /// Get the model name string used by FastEmbed
50    pub fn model_name(&self) -> &str {
51        match self {
52            FastEmbedModel::BGESmallENV15 => "BAAI/bge-small-en-v1.5",
53            FastEmbedModel::BGEBaseENV15 => "BAAI/bge-base-en-v1.5",
54            FastEmbedModel::BGELargeENV15 => "BAAI/bge-large-en-v1.5",
55            FastEmbedModel::AllMiniLML6V2 => "sentence-transformers/all-MiniLM-L6-v2",
56            FastEmbedModel::AllMiniLML12V2 => "sentence-transformers/all-MiniLM-L12-v2",
57            FastEmbedModel::MultilingualE5Small => "intfloat/multilingual-e5-small",
58            FastEmbedModel::MultilingualE5Base => "intfloat/multilingual-e5-base",
59            FastEmbedModel::MultilingualE5Large => "intfloat/multilingual-e5-large",
60            FastEmbedModel::Custom { name, .. } => name,
61        }
62    }
63    
64    /// Get the embedding dimensions
65    pub fn dimensions(&self) -> usize {
66        match self {
67            FastEmbedModel::BGESmallENV15 => 384,
68            FastEmbedModel::BGEBaseENV15 => 768,
69            FastEmbedModel::BGELargeENV15 => 1024,
70            FastEmbedModel::AllMiniLML6V2 => 384,
71            FastEmbedModel::AllMiniLML12V2 => 384,
72            FastEmbedModel::MultilingualE5Small => 384,
73            FastEmbedModel::MultilingualE5Base => 768,
74            FastEmbedModel::MultilingualE5Large => 1024,
75            FastEmbedModel::Custom { dimensions, .. } => *dimensions,
76        }
77    }
78    
79    /// Get the maximum sequence length supported by the model
80    pub fn max_sequence_length(&self) -> usize {
81        match self {
82            FastEmbedModel::BGESmallENV15 => 512,
83            FastEmbedModel::BGEBaseENV15 => 512,
84            FastEmbedModel::BGELargeENV15 => 512,
85            FastEmbedModel::AllMiniLML6V2 => 256,
86            FastEmbedModel::AllMiniLML12V2 => 256,
87            FastEmbedModel::MultilingualE5Small => 512,
88            FastEmbedModel::MultilingualE5Base => 512,
89            FastEmbedModel::MultilingualE5Large => 512,
90            FastEmbedModel::Custom { max_sequence_length, .. } => {
91                max_sequence_length.unwrap_or(512)
92            }
93        }
94    }
95    
96    /// Get a description of the model
97    pub fn description(&self) -> &str {
98        match self {
99            FastEmbedModel::BGESmallENV15 => {
100                "BGE Small English v1.5 - Fast and efficient for general purpose embedding tasks"
101            }
102            FastEmbedModel::BGEBaseENV15 => {
103                "BGE Base English v1.5 - Balanced performance and quality for English text"
104            }
105            FastEmbedModel::BGELargeENV15 => {
106                "BGE Large English v1.5 - High quality embeddings with larger model size"
107            }
108            FastEmbedModel::AllMiniLML6V2 => {
109                "All MiniLM L6 v2 - Lightweight model optimized for semantic search"
110            }
111            FastEmbedModel::AllMiniLML12V2 => {
112                "All MiniLM L12 v2 - Better quality than L6 while maintaining efficiency"
113            }
114            FastEmbedModel::MultilingualE5Small => {
115                "Multilingual E5 Small - Supports 100+ languages with good performance"
116            }
117            FastEmbedModel::MultilingualE5Base => {
118                "Multilingual E5 Base - High-quality multilingual embeddings"
119            }
120            FastEmbedModel::MultilingualE5Large => {
121                "Multilingual E5 Large - Best multilingual quality with larger model size"
122            }
123            FastEmbedModel::Custom { name, .. } => name,
124        }
125    }
126    
127    /// Get the languages supported by the model
128    pub fn language_support(&self) -> Vec<&str> {
129        match self {
130            FastEmbedModel::BGESmallENV15 
131            | FastEmbedModel::BGEBaseENV15 
132            | FastEmbedModel::BGELargeENV15 
133            | FastEmbedModel::AllMiniLML6V2 
134            | FastEmbedModel::AllMiniLML12V2 => {
135                vec!["en"] // English only
136            }
137            FastEmbedModel::MultilingualE5Small 
138            | FastEmbedModel::MultilingualE5Base 
139            | FastEmbedModel::MultilingualE5Large => {
140                vec![
141                    "en", "zh", "es", "fr", "de", "it", "pt", "ru", "ja", "ko",
142                    "ar", "hi", "th", "vi", "id", "ms", "tl", "nl", "sv", "da",
143                    "no", "fi", "pl", "cs", "sk", "hu", "ro", "bg", "hr", "sl",
144                    "et", "lv", "lt", "mt", "ga", "eu", "ca", "gl", "cy", "is",
145                    "mk", "sq", "sr", "bs", "me", "hr", "sl", "sk", "cs", "pl",
146                    // ... and many more (100+ languages total)
147                ]
148            }
149            FastEmbedModel::Custom { .. } => {
150                vec!["unknown"] // Custom models have unknown language support
151            }
152        }
153    }
154    
155    /// Check if the model supports a specific language
156    pub fn supports_language(&self, language: &str) -> bool {
157        self.language_support().contains(&language)
158    }
159    
160    /// Get the model family (BGE, MiniLM, E5, etc.)
161    pub fn model_family(&self) -> ModelFamily {
162        match self {
163            FastEmbedModel::BGESmallENV15 
164            | FastEmbedModel::BGEBaseENV15 
165            | FastEmbedModel::BGELargeENV15 => ModelFamily::BGE,
166            
167            FastEmbedModel::AllMiniLML6V2 
168            | FastEmbedModel::AllMiniLML12V2 => ModelFamily::MiniLM,
169            
170            FastEmbedModel::MultilingualE5Small 
171            | FastEmbedModel::MultilingualE5Base 
172            | FastEmbedModel::MultilingualE5Large => ModelFamily::E5,
173            
174            FastEmbedModel::Custom { .. } => ModelFamily::Custom,
175        }
176    }
177    
178    /// Convert to fastembed EmbeddingModel enum
179    pub fn to_fastembed_model(&self) -> fastembed::EmbeddingModel {
180        match self {
181            FastEmbedModel::BGESmallENV15 => fastembed::EmbeddingModel::BGESmallENV15,
182            FastEmbedModel::BGEBaseENV15 => fastembed::EmbeddingModel::BGEBaseENV15,
183            FastEmbedModel::BGELargeENV15 => fastembed::EmbeddingModel::BGELargeENV15,
184            FastEmbedModel::AllMiniLML6V2 => fastembed::EmbeddingModel::AllMiniLML6V2,
185            FastEmbedModel::AllMiniLML12V2 => fastembed::EmbeddingModel::AllMiniLML12V2,
186            FastEmbedModel::MultilingualE5Small => fastembed::EmbeddingModel::MultilingualE5Small,
187            FastEmbedModel::MultilingualE5Base => fastembed::EmbeddingModel::MultilingualE5Base,
188            FastEmbedModel::MultilingualE5Large => fastembed::EmbeddingModel::MultilingualE5Large,
189            FastEmbedModel::Custom { .. } => {
190                // For custom models, default to BGE Small
191                // In practice, custom models would need special handling
192                fastembed::EmbeddingModel::BGESmallENV15
193            }
194        }
195    }
196    
197    /// Get recommended use cases for the model
198    pub fn use_cases(&self) -> Vec<&str> {
199        match self {
200            FastEmbedModel::BGESmallENV15 => {
201                vec!["semantic search", "document similarity", "clustering", "classification"]
202            }
203            FastEmbedModel::BGEBaseENV15 => {
204                vec!["semantic search", "document similarity", "RAG systems", "question answering"]
205            }
206            FastEmbedModel::BGELargeENV15 => {
207                vec!["high-quality RAG", "research applications", "complex similarity tasks"]
208            }
209            FastEmbedModel::AllMiniLML6V2 => {
210                vec!["fast semantic search", "real-time applications", "mobile deployment"]
211            }
212            FastEmbedModel::AllMiniLML12V2 => {
213                vec!["semantic search", "document clustering", "content recommendation"]
214            }
215            FastEmbedModel::MultilingualE5Small => {
216                vec!["multilingual search", "cross-language retrieval", "international applications"]
217            }
218            FastEmbedModel::MultilingualE5Base => {
219                vec!["multilingual RAG", "cross-language similarity", "global content analysis"]
220            }
221            FastEmbedModel::MultilingualE5Large => {
222                vec!["high-quality multilingual RAG", "research", "enterprise multilingual systems"]
223            }
224            FastEmbedModel::Custom { .. } => {
225                vec!["custom applications"]
226            }
227        }
228    }
229}
230
231/// Model family classification
232#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
233pub enum ModelFamily {
234    /// BGE (Beijing Academy of Artificial Intelligence) models
235    BGE,
236    /// MiniLM models (Microsoft)
237    MiniLM,
238    /// E5 models (Microsoft)
239    E5,
240    /// Custom models
241    Custom,
242}
243
244/// Detailed information about a model
245#[derive(Debug, Clone, Serialize, Deserialize)]
246pub struct ModelInfo {
247    /// Model name
248    pub name: String,
249    /// Embedding dimensions
250    pub dimensions: usize,
251    /// Maximum sequence length
252    pub max_sequence_length: usize,
253    /// Model description
254    pub description: String,
255    /// Supported languages
256    pub language_support: Vec<String>,
257}
258
259impl ModelInfo {
260    /// Create model info from a FastEmbedModel
261    pub fn from_model(model: &FastEmbedModel) -> Self {
262        Self {
263            name: model.model_name().to_string(),
264            dimensions: model.dimensions(),
265            max_sequence_length: model.max_sequence_length(),
266            description: model.description().to_string(),
267            language_support: model.language_support().iter().map(|s| s.to_string()).collect(),
268        }
269    }
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// A built-in English model exposes its fixed metadata and only `"en"`.
    #[test]
    fn test_model_properties() {
        let bge_small = FastEmbedModel::BGESmallENV15;

        assert_eq!(bge_small.model_name(), "BAAI/bge-small-en-v1.5");
        assert_eq!(bge_small.dimensions(), 384);
        assert_eq!(bge_small.max_sequence_length(), 512);

        assert!(bge_small.supports_language("en"));
        assert!(!bge_small.supports_language("zh"));
    }

    /// A multilingual E5 model accepts several languages and is in the E5 family.
    #[test]
    fn test_multilingual_model() {
        let e5_small = FastEmbedModel::MultilingualE5Small;

        for code in ["en", "zh", "es"] {
            assert!(e5_small.supports_language(code), "expected support for {code}");
        }
        assert_eq!(e5_small.model_family(), ModelFamily::E5);
    }

    /// A custom model reports exactly the metadata it was constructed with.
    #[test]
    fn test_custom_model() {
        let custom = FastEmbedModel::Custom {
            name: String::from("custom-model"),
            dimensions: 512,
            max_sequence_length: Some(1024),
        };

        assert_eq!(custom.model_name(), "custom-model");
        assert_eq!(custom.dimensions(), 512);
        assert_eq!(custom.max_sequence_length(), 1024);
        assert_eq!(custom.model_family(), ModelFamily::Custom);
    }

    /// `ModelInfo::from_model` copies the model's metadata into owned fields.
    #[test]
    fn test_model_info() {
        let ModelInfo {
            name,
            dimensions,
            language_support,
            ..
        } = ModelInfo::from_model(&FastEmbedModel::BGEBaseENV15);

        assert_eq!(name, "BAAI/bge-base-en-v1.5");
        assert_eq!(dimensions, 768);
        assert!(language_support.contains(&String::from("en")));
    }
}