//! Multimodal model support: modality and capability definitions, rich
//! message content (images, video, audio, sensors, robot actions), and a
//! static registry of known VLM and VLA models.

#![allow(dead_code)]

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

use super::models::MessageRole;
/// A data modality that a model can consume or produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Modality {
    /// Plain text.
    Text,
    /// Still images.
    Image,
    /// Video clips or streams.
    Video,
    /// Audio clips or streams.
    Audio,
    /// 3D point clouds.
    PointCloud,
    /// Robot action commands.
    Action,
    /// Raw sensor readings.
    Sensor,
    /// Depth maps.
    Depth,
    /// Segmentation masks.
    Segmentation,
    /// 2D bounding boxes.
    BoundingBox,
    /// Poses (position plus orientation).
    Pose,
    /// Motion trajectories.
    Trajectory,
}

impl std::fmt::Display for Modality {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Modality::Text => write!(f, "text"),
            Modality::Image => write!(f, "image"),
            Modality::Video => write!(f, "video"),
            Modality::Audio => write!(f, "audio"),
            Modality::PointCloud => write!(f, "point_cloud"),
            Modality::Action => write!(f, "action"),
            Modality::Sensor => write!(f, "sensor"),
            Modality::Depth => write!(f, "depth"),
            Modality::Segmentation => write!(f, "segmentation"),
            Modality::BoundingBox => write!(f, "bounding_box"),
            Modality::Pose => write!(f, "pose"),
            Modality::Trajectory => write!(f, "trajectory"),
        }
    }
}

/// Broad model categories by supported modality combination.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum ModelCategory {
    /// Text-only large language model.
    LLM,
    /// Vision-language model (text and images in, text out).
    VLM,
    /// Vision-language-action model for robot control.
    VLA,
    /// Audio-language model.
    ALM,
    /// Vision-audio-language model.
    VALM,
    /// General multimodal model spanning several input/output modalities.
    Multimodal,
    /// Embodied agent model with sensor and pose awareness.
    Embodied,
}

impl std::fmt::Display for ModelCategory {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ModelCategory::LLM => write!(f, "LLM"),
            ModelCategory::VLM => write!(f, "VLM"),
            ModelCategory::VLA => write!(f, "VLA"),
            ModelCategory::ALM => write!(f, "ALM"),
            ModelCategory::VALM => write!(f, "VALM"),
            ModelCategory::Multimodal => write!(f, "Multimodal"),
            ModelCategory::Embodied => write!(f, "Embodied"),
        }
    }
}

/// What a model can accept and produce, plus media limits.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModalityCapabilities {
    /// Broad category of the model.
    pub category: ModelCategory,
    /// Modalities accepted as input.
    pub input_modalities: Vec<Modality>,
    /// Modalities the model can produce.
    pub output_modalities: Vec<Modality>,
    /// Maximum accepted image resolution as (width, height), if limited.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_image_resolution: Option<(u32, u32)>,
    /// Maximum video duration in seconds, if limited.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_video_duration: Option<u32>,
    /// Maximum audio duration in seconds, if limited.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_audio_duration: Option<u32>,
    /// Maximum number of images per request, if limited.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_images_per_request: Option<u32>,
    /// Image formats the model accepts.
    #[serde(default)]
    pub supported_image_formats: Vec<ImageFormat>,
    /// Whether responses can be streamed.
    #[serde(default)]
    pub supports_streaming: bool,
    /// Whether text and media can be interleaved within one message.
    #[serde(default)]
    pub supports_interleaved: bool,
}

impl Default for ModalityCapabilities {
    fn default() -> Self {
        Self {
            category: ModelCategory::LLM,
            input_modalities: vec![Modality::Text],
            output_modalities: vec![Modality::Text],
            max_image_resolution: None,
            max_video_duration: None,
            max_audio_duration: None,
            max_images_per_request: None,
            supported_image_formats: vec![],
            supports_streaming: false,
            supports_interleaved: false,
        }
    }
}

impl ModalityCapabilities {
    /// Text-only LLM capabilities.
    pub fn llm() -> Self {
        Self {
            category: ModelCategory::LLM,
            input_modalities: vec![Modality::Text],
            output_modalities: vec![Modality::Text],
            supports_streaming: true,
            ..Default::default()
        }
    }

    /// Typical vision-language model capabilities.
    pub fn vlm() -> Self {
        Self {
            category: ModelCategory::VLM,
            input_modalities: vec![Modality::Text, Modality::Image],
            output_modalities: vec![Modality::Text],
            max_image_resolution: Some((4096, 4096)),
            max_images_per_request: Some(20),
            supported_image_formats: vec![
                ImageFormat::Png,
                ImageFormat::Jpeg,
                ImageFormat::Webp,
                ImageFormat::Gif,
            ],
            supports_streaming: true,
            supports_interleaved: true,
            ..Default::default()
        }
    }

    /// Typical vision-language-action model capabilities.
    pub fn vla() -> Self {
        Self {
            category: ModelCategory::VLA,
            input_modalities: vec![
                Modality::Text,
                Modality::Image,
                Modality::Sensor,
                Modality::Depth,
            ],
            output_modalities: vec![Modality::Text, Modality::Action, Modality::Trajectory],
            max_image_resolution: Some((1024, 1024)),
            max_images_per_request: Some(10),
            supported_image_formats: vec![ImageFormat::Png, ImageFormat::Jpeg],
            supports_streaming: true,
            supports_interleaved: true,
            ..Default::default()
        }
    }

    /// Broad multimodal (text, image, audio, video) capabilities.
    pub fn multimodal() -> Self {
        Self {
            category: ModelCategory::Multimodal,
            input_modalities: vec![
                Modality::Text,
                Modality::Image,
                Modality::Audio,
                Modality::Video,
            ],
            output_modalities: vec![Modality::Text, Modality::Image, Modality::Audio],
            max_image_resolution: Some((4096, 4096)),
            max_video_duration: Some(3600),
            max_audio_duration: Some(3600),
            max_images_per_request: Some(50),
            supported_image_formats: vec![
                ImageFormat::Png,
                ImageFormat::Jpeg,
                ImageFormat::Webp,
                ImageFormat::Gif,
            ],
            supports_streaming: true,
            supports_interleaved: true,
        }
    }

    /// Embodied agent capabilities with sensor and pose I/O.
    pub fn embodied() -> Self {
        Self {
            category: ModelCategory::Embodied,
            input_modalities: vec![
                Modality::Text,
                Modality::Image,
                Modality::Depth,
                Modality::PointCloud,
                Modality::Sensor,
                Modality::Pose,
            ],
            output_modalities: vec![
                Modality::Text,
                Modality::Action,
                Modality::Trajectory,
                Modality::Pose,
            ],
            max_image_resolution: Some((1280, 720)),
            max_images_per_request: Some(8),
            supported_image_formats: vec![ImageFormat::Png, ImageFormat::Jpeg],
            supports_streaming: true,
            supports_interleaved: true,
            ..Default::default()
        }
    }

    /// Returns true if `modality` is accepted as input.
    pub fn supports_input(&self, modality: Modality) -> bool {
        self.input_modalities.contains(&modality)
    }

    /// Returns true if `modality` can be produced as output.
    pub fn supports_output(&self, modality: Modality) -> bool {
        self.output_modalities.contains(&modality)
    }

    /// Returns true if the model accepts any visual input (image or video).
    pub fn supports_vision(&self) -> bool {
        self.supports_input(Modality::Image) || self.supports_input(Modality::Video)
    }

    /// Returns true if the model can emit actions or trajectories.
    pub fn supports_actions(&self) -> bool {
        self.supports_output(Modality::Action) || self.supports_output(Modality::Trajectory)
    }
}
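
// A minimal sketch of how callers might gate a request on a model's declared
// capabilities before dispatch. `check_request_modalities` is illustrative,
// not part of this module's public API; reporting errors as `String` is an
// assumption made for brevity.
fn check_request_modalities(
    caps: &ModalityCapabilities,
    inputs: &[Modality],
) -> Result<(), String> {
    for modality in inputs {
        if !caps.supports_input(*modality) {
            return Err(format!("model does not accept {modality} input"));
        }
    }
    Ok(())
}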

/// Supported image container formats.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ImageFormat {
    Png,
    Jpeg,
    Webp,
    Gif,
    Bmp,
    Tiff,
    Heic,
}

impl std::fmt::Display for ImageFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ImageFormat::Png => write!(f, "png"),
            ImageFormat::Jpeg => write!(f, "jpeg"),
            ImageFormat::Webp => write!(f, "webp"),
            ImageFormat::Gif => write!(f, "gif"),
            ImageFormat::Bmp => write!(f, "bmp"),
            ImageFormat::Tiff => write!(f, "tiff"),
            ImageFormat::Heic => write!(f, "heic"),
        }
    }
}
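
// Illustrative helper (not part of the original API surface): map an
// `ImageFormat` to the IANA media type string expected by the `media_type`
// field of `ImageData::Base64`. A minimal sketch; callers with other formats
// would need a richer mapping.
impl ImageFormat {
    pub fn media_type(self) -> &'static str {
        match self {
            ImageFormat::Png => "image/png",
            ImageFormat::Jpeg => "image/jpeg",
            ImageFormat::Webp => "image/webp",
            ImageFormat::Gif => "image/gif",
            ImageFormat::Bmp => "image/bmp",
            ImageFormat::Tiff => "image/tiff",
            ImageFormat::Heic => "image/heic",
        }
    }
}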

/// An image attached to a message.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageContent {
    /// The image payload (inline base64 or a URL).
    pub data: ImageData,
    /// Image format, if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub format: Option<ImageFormat>,
    /// Alternative text describing the image.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub alt_text: Option<String>,
    /// Labeled regions of interest within the image.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub regions: Vec<BoundingBoxRegion>,
}

/// Image payload: inline base64 data or a remote URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ImageData {
    /// Inline base64-encoded image bytes.
    Base64 {
        #[serde(rename = "base64")]
        data: String,
        media_type: String,
    },
    /// Remote image referenced by URL.
    Url {
        url: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        detail: Option<ImageDetail>,
    },
}

/// Requested level of detail for image processing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ImageDetail {
    Low,
    High,
    Auto,
}

/// A labeled rectangular region within an image.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BoundingBoxRegion {
    /// Region label (e.g. object class).
    pub label: String,
    pub x: f32,
    pub y: f32,
    pub width: f32,
    pub height: f32,
    /// Detection confidence, if available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f32>,
}

/// A video clip attached to a message.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoContent {
    /// The video payload.
    pub data: VideoData,
    /// Total duration in seconds, if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub duration: Option<f32>,
    /// Clip start time in seconds, if trimmed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub start_time: Option<f32>,
    /// Clip end time in seconds, if trimmed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub end_time: Option<f32>,
    /// Frame rate to sample at, if specified.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub fps: Option<f32>,
}

/// Video payload: a URL, pre-extracted frames, or inline base64 data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum VideoData {
    Url { url: String },
    Frames { frames: Vec<ImageContent> },
    Base64 { base64: String, media_type: String },
}

/// An audio clip attached to a message.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioContent {
    /// The audio payload.
    pub data: AudioData,
    /// Audio format (e.g. "wav", "mp3"), if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub format: Option<String>,
    /// Duration in seconds, if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub duration: Option<f32>,
    /// Sample rate in Hz, if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// Transcription of the audio, if available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub transcription: Option<String>,
}

/// Audio payload: a URL or inline base64 data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AudioData {
    Url { url: String },
    Base64 { base64: String, media_type: String },
}

/// A robot action emitted by a VLA or embodied model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ActionCommand {
    /// What kind of action to perform.
    pub action_type: ActionType,
    /// Parameters specific to the action type.
    pub parameters: ActionParameters,
    /// Model confidence in this action, if reported.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f32>,
    /// When the action was issued.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timestamp: Option<DateTime<Utc>>,
    /// Expected execution duration in milliseconds.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub duration_ms: Option<u64>,
}

/// Kinds of robot actions a model may emit.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ActionType {
    // Base motion
    Move,
    Rotate,
    Stop,

    // Manipulation
    Grasp,
    Release,
    Push,
    Pull,
    Place,
    Pick,

    // Opening and closing
    Open,
    Close,

    // Arm and joint control
    MoveArm,
    MoveJoint,

    // Gaze
    Look,
    Focus,

    // Control flow
    Custom,
    Wait,
    Sequence,
}

/// Parameters carried by an [`ActionCommand`], matched to its action type.
///
/// Note: this enum is `#[serde(untagged)]`, so deserialization tries the
/// variants in declaration order.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ActionParameters {
    /// Cartesian movement.
    Movement {
        #[serde(default)]
        x: f64,
        #[serde(default)]
        y: f64,
        #[serde(default)]
        z: f64,
        /// If true, (x, y, z) is a velocity rather than a position.
        #[serde(default)]
        is_velocity: bool,
        /// Reference frame for the coordinates, if not the default.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        frame: Option<String>,
    },
    /// Rotation as Euler angles and/or a quaternion.
    Rotation {
        #[serde(default, skip_serializing_if = "Option::is_none")]
        roll: Option<f64>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        pitch: Option<f64>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        yaw: Option<f64>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        quaternion: Option<[f64; 4]>,
    },
    /// Gripper control.
    Gripper {
        /// Target gripper aperture.
        aperture: f64,
        /// Grip force limit, if specified.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        force: Option<f64>,
    },
    /// Direct joint position targets.
    JointPositions {
        positions: Vec<f64>,
        /// Names matching `positions`, if the robot uses named joints.
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        joint_names: Vec<String>,
    },
    /// A target end-effector pose.
    TargetPose {
        position: [f64; 3],
        /// Orientation quaternion.
        orientation: [f64; 4],
    },
    /// A multi-waypoint trajectory.
    Trajectory {
        waypoints: Vec<Waypoint>,
        /// Interpolation scheme between waypoints.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        interpolation: Option<String>,
    },
    /// Arbitrary action-specific parameters.
    Custom(serde_json::Value),
}

/// A single point along a trajectory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Waypoint {
    /// Cartesian position.
    pub position: [f64; 3],
    /// Orientation quaternion, if constrained.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub orientation: Option<[f64; 4]>,
    /// Time offset in seconds, if the trajectory is timed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub time: Option<f64>,
    /// Gripper aperture at this waypoint, if it changes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gripper: Option<f64>,
}
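
// Sketch: a two-waypoint trajectory command. All numeric values are
// illustrative, and "linear" interpolation is an assumed convention, not one
// defined by this module.
fn example_trajectory_command() -> ActionCommand {
    ActionCommand {
        action_type: ActionType::Move,
        parameters: ActionParameters::Trajectory {
            waypoints: vec![
                Waypoint {
                    position: [0.0, 0.0, 0.10],
                    orientation: None,
                    time: Some(0.0),
                    gripper: None,
                },
                Waypoint {
                    position: [0.25, 0.0, 0.10],
                    orientation: None,
                    time: Some(1.5),
                    gripper: Some(0.5),
                },
            ],
            interpolation: Some("linear".to_string()),
        },
        confidence: Some(0.9),
        timestamp: None,
        duration_ms: Some(1500),
    }
}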

/// A timestamped sensor reading attached to a message.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SensorData {
    /// The kind of sensor that produced the reading.
    pub sensor_type: SensorType,
    /// The reading itself.
    pub values: SensorValues,
    /// When the reading was taken.
    pub timestamp: DateTime<Utc>,
    /// Reference frame of the reading, if applicable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub frame: Option<String>,
}

/// Kinds of sensors whose readings can be attached to a message.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SensorType {
    JointState,
    Imu,
    ForceTorque,
    Depth,
    Lidar,
    Localization,
    Tactile,
    Odometry,
    Custom,
}

/// Typed sensor payloads.
///
/// Note: this enum is `#[serde(untagged)]`, so deserialization tries the
/// variants in declaration order; `Numeric` and `Custom` act as fallbacks.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum SensorValues {
    /// Joint positions, with optional velocities and efforts.
    JointState {
        positions: Vec<f64>,
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        velocities: Vec<f64>,
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        efforts: Vec<f64>,
    },
    /// Inertial measurement unit reading.
    Imu {
        linear_acceleration: [f64; 3],
        angular_velocity: [f64; 3],
        #[serde(default, skip_serializing_if = "Option::is_none")]
        orientation: Option<[f64; 4]>,
    },
    /// Force/torque sensor reading.
    ForceTorque { force: [f64; 3], torque: [f64; 3] },
    /// Encoded depth image.
    Depth {
        data: String,
        width: u32,
        height: u32,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        encoding: Option<String>,
    },
    /// 3D point cloud, with optional per-point RGB colors.
    PointCloud {
        points: Vec<[f64; 3]>,
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        colors: Vec<[u8; 3]>,
    },
    /// A pose (position plus orientation quaternion).
    Pose {
        position: [f64; 3],
        orientation: [f64; 4],
    },
    /// A bare vector of readings.
    Numeric(Vec<f64>),
    /// Arbitrary sensor-specific payload.
    Custom(serde_json::Value),
}

/// One piece of a multimodal message body.
///
/// Serialized with an internal `type` tag, e.g. `{"type": "text", ...}`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ContentPart {
    Text { text: String },
    Image(ImageContent),
    Video(VideoContent),
    Audio(AudioContent),
    Sensor(SensorData),
    Action(ActionCommand),
    File {
        url: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        mime_type: Option<String>,
    },
}

/// A chat message whose body is a sequence of multimodal content parts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultimodalMessage {
    /// Who produced the message.
    pub role: MessageRole,
    /// Ordered content parts (text, images, sensors, actions, ...).
    pub content: Vec<ContentPart>,
    /// When the message was created.
    #[serde(default = "Utc::now")]
    pub timestamp: DateTime<Utc>,
    /// Arbitrary message metadata.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata: HashMap<String, serde_json::Value>,
}

impl MultimodalMessage {
    /// Creates a message containing a single text part.
    pub fn text(role: MessageRole, text: impl Into<String>) -> Self {
        Self {
            role,
            content: vec![ContentPart::Text { text: text.into() }],
            timestamp: Utc::now(),
            metadata: HashMap::new(),
        }
    }

    /// Creates a message containing a text part followed by an image.
    pub fn with_image(role: MessageRole, text: impl Into<String>, image: ImageContent) -> Self {
        Self {
            role,
            content: vec![
                ContentPart::Text { text: text.into() },
                ContentPart::Image(image),
            ],
            timestamp: Utc::now(),
            metadata: HashMap::new(),
        }
    }

    /// Appends an image part.
    pub fn add_image(&mut self, image: ImageContent) {
        self.content.push(ContentPart::Image(image));
    }

    /// Appends a sensor reading part.
    pub fn add_sensor(&mut self, sensor: SensorData) {
        self.content.push(ContentPart::Sensor(sensor));
    }

    /// Appends an action part.
    pub fn add_action(&mut self, action: ActionCommand) {
        self.content.push(ContentPart::Action(action));
    }

    /// Concatenates all text parts, joined with newlines.
    pub fn text_content(&self) -> String {
        self.content
            .iter()
            .filter_map(|part| match part {
                ContentPart::Text { text } => Some(text.as_str()),
                _ => None,
            })
            .collect::<Vec<_>>()
            .join("\n")
    }

    /// Returns all image parts.
    pub fn images(&self) -> Vec<&ImageContent> {
        self.content
            .iter()
            .filter_map(|part| match part {
                ContentPart::Image(img) => Some(img),
                _ => None,
            })
            .collect()
    }

    /// Returns all action parts.
    pub fn actions(&self) -> Vec<&ActionCommand> {
        self.content
            .iter()
            .filter_map(|part| match part {
                ContentPart::Action(action) => Some(action),
                _ => None,
            })
            .collect()
    }
}
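
// Sketch: assembling an interleaved user turn with text, an image, and an
// IMU reading. Illustrative only; the URL and frame name are placeholders.
fn example_embodied_turn() -> MultimodalMessage {
    let mut msg = MultimodalMessage::text(MessageRole::User, "Pick up the red block.");
    msg.add_image(ImageContent {
        data: ImageData::Url {
            url: "https://example.com/scene.png".to_string(),
            detail: Some(ImageDetail::High),
        },
        format: Some(ImageFormat::Png),
        alt_text: None,
        regions: vec![],
    });
    msg.add_sensor(SensorData {
        sensor_type: SensorType::Imu,
        values: SensorValues::Imu {
            linear_acceleration: [0.0, 0.0, 9.81],
            angular_velocity: [0.0, 0.0, 0.0],
            orientation: None,
        },
        timestamp: Utc::now(),
        frame: Some("base_link".to_string()),
    });
    msg
}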

/// An entry in the model registry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultimodalModel {
    /// Stable model identifier, as used in API calls.
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Provider or origin of the model.
    pub provider: String,
    /// Broad category.
    pub category: ModelCategory,
    /// Declared modality capabilities.
    pub capabilities: ModalityCapabilities,
    /// Maximum context window in tokens.
    pub max_context: u32,
    /// Model version string, if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Release date (YYYY-MM-DD), if known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub release_date: Option<String>,
    /// Short description.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Pricing, if the model is a paid hosted service.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pricing: Option<ModelPricing>,
    /// Whether the model is currently usable.
    #[serde(default)]
    pub available: bool,
    /// Whether the model can run locally.
    #[serde(default)]
    pub local: bool,
}

/// Pricing for a hosted model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelPricing {
    /// Cost per million input tokens.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_per_million: Option<f64>,
    /// Cost per million output tokens.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_per_million: Option<f64>,
    /// Flat cost per image, if billed separately.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_image: Option<f64>,
    /// Cost per minute of video input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_video_minute: Option<f64>,
    /// Cost per minute of audio input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_audio_minute: Option<f64>,
    /// Currency code; defaults to "USD".
    #[serde(default = "default_currency")]
    pub currency: String,
}

fn default_currency() -> String {
    "USD".to_string()
}
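
// Sketch: estimating the token cost of a call from a `ModelPricing` entry.
// This module does no cost accounting itself; the helper is illustrative and
// ignores per-image/video/audio charges.
fn estimate_token_cost(pricing: &ModelPricing, input_tokens: u64, output_tokens: u64) -> f64 {
    let input = input_tokens as f64 / 1_000_000.0 * pricing.input_per_million.unwrap_or(0.0);
    let output = output_tokens as f64 / 1_000_000.0 * pricing.output_per_million.unwrap_or(0.0);
    input + output
}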

/// Static registry of well-known vision-language and multimodal chat models.
pub fn vlm_models() -> Vec<MultimodalModel> {
    vec![
        // OpenAI
        MultimodalModel {
            id: "gpt-4o".to_string(),
            name: "GPT-4o".to_string(),
            provider: "OpenAI".to_string(),
            category: ModelCategory::Multimodal,
            capabilities: ModalityCapabilities::multimodal(),
            max_context: 128000,
            version: Some("2024-11-20".to_string()),
            release_date: Some("2024-05-13".to_string()),
            description: Some("Most capable GPT-4 with vision, audio, and text".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(2.50),
                output_per_million: Some(10.00),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        MultimodalModel {
            id: "gpt-4o-mini".to_string(),
            name: "GPT-4o Mini".to_string(),
            provider: "OpenAI".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 128000,
            version: Some("2024-07-18".to_string()),
            release_date: Some("2024-07-18".to_string()),
            description: Some("Affordable small model with vision capabilities".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(0.15),
                output_per_million: Some(0.60),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        // Google
        MultimodalModel {
            id: "gemini-2.0-flash".to_string(),
            name: "Gemini 2.0 Flash".to_string(),
            provider: "Google".to_string(),
            category: ModelCategory::Multimodal,
            capabilities: ModalityCapabilities::multimodal(),
            max_context: 1000000,
            version: Some("2.0".to_string()),
            release_date: Some("2024-12-11".to_string()),
            description: Some("Fastest Gemini with native multimodal generation".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(0.075),
                output_per_million: Some(0.30),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        MultimodalModel {
            id: "gemini-1.5-pro".to_string(),
            name: "Gemini 1.5 Pro".to_string(),
            provider: "Google".to_string(),
            category: ModelCategory::Multimodal,
            capabilities: ModalityCapabilities::multimodal(),
            max_context: 2000000,
            version: Some("1.5".to_string()),
            release_date: Some("2024-02-15".to_string()),
            description: Some("2M context window with video understanding".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(1.25),
                output_per_million: Some(5.00),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        // Anthropic
        MultimodalModel {
            id: "claude-3-5-sonnet".to_string(),
            name: "Claude 3.5 Sonnet".to_string(),
            provider: "Anthropic".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 200000,
            version: Some("20241022".to_string()),
            release_date: Some("2024-10-22".to_string()),
            description: Some("Best overall Claude with strong vision".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(3.00),
                output_per_million: Some(15.00),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        // Open-source / locally runnable
        MultimodalModel {
            id: "llava-1.6".to_string(),
            name: "LLaVA 1.6".to_string(),
            provider: "Open Source".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 4096,
            version: Some("1.6".to_string()),
            release_date: Some("2024-01-30".to_string()),
            description: Some("Open-source vision-language model".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "qwen2-vl".to_string(),
            name: "Qwen2-VL".to_string(),
            provider: "Alibaba".to_string(),
            category: ModelCategory::VLM,
            capabilities: {
                let mut caps = ModalityCapabilities::vlm();
                caps.input_modalities.push(Modality::Video);
                caps
            },
            max_context: 32768,
            version: Some("2.0".to_string()),
            release_date: Some("2024-08-29".to_string()),
            description: Some("Strong open VLM with video understanding".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "pixtral-12b".to_string(),
            name: "Pixtral 12B".to_string(),
            provider: "Mistral".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 128000,
            version: Some("1.0".to_string()),
            release_date: Some("2024-09-11".to_string()),
            description: Some("Mistral's vision model, runs locally".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
    ]
}

/// Static registry of well-known vision-language-action and embodied models.
pub fn vla_models() -> Vec<MultimodalModel> {
    vec![
        MultimodalModel {
            id: "rt-2".to_string(),
            name: "RT-2".to_string(),
            provider: "Google DeepMind".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("2.0".to_string()),
            release_date: Some("2023-07-28".to_string()),
            description: Some("Robotics Transformer 2 - vision-language-action model".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
        MultimodalModel {
            id: "rt-x".to_string(),
            name: "RT-X".to_string(),
            provider: "Open X-Embodiment".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("1.0".to_string()),
            release_date: Some("2023-10-05".to_string()),
            description: Some("Cross-embodiment robotics foundation model".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "octo".to_string(),
            name: "Octo".to_string(),
            provider: "Berkeley AI Research".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 2048,
            version: Some("1.0".to_string()),
            release_date: Some("2024-05-10".to_string()),
            description: Some("Generalist robot policy from Open X-Embodiment".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "openvla".to_string(),
            name: "OpenVLA".to_string(),
            provider: "Stanford/Berkeley".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("7B".to_string()),
            release_date: Some("2024-06-13".to_string()),
            description: Some("Open-source 7B parameter VLA model".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "palm-e".to_string(),
            name: "PaLM-E".to_string(),
            provider: "Google".to_string(),
            category: ModelCategory::Embodied,
            capabilities: ModalityCapabilities::embodied(),
            max_context: 8192,
            version: Some("562B".to_string()),
            release_date: Some("2023-03-06".to_string()),
            description: Some("Embodied multimodal language model".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
        MultimodalModel {
            id: "gr-1".to_string(),
            name: "GR-1".to_string(),
            provider: "Fourier Intelligence".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 2048,
            version: Some("1.0".to_string()),
            release_date: Some("2024-03-18".to_string()),
            description: Some("VLA for humanoid robot manipulation".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
        MultimodalModel {
            id: "pi0".to_string(),
            name: "Pi-Zero".to_string(),
            provider: "Physical Intelligence".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("1.0".to_string()),
            release_date: Some("2024-10-31".to_string()),
            description: Some("General-purpose robot foundation model".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
    ]
}
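
// Sketch: a combined lookup across both registries. Illustrative; a real
// caller would likely build the registry once rather than re-allocating the
// vectors on every call.
pub fn find_model(id: &str) -> Option<MultimodalModel> {
    vlm_models()
        .into_iter()
        .chain(vla_models())
        .find(|m| m.id == id)
}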

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_modality_display() {
        assert_eq!(format!("{}", Modality::Text), "text");
        assert_eq!(format!("{}", Modality::Image), "image");
        assert_eq!(format!("{}", Modality::Action), "action");
    }

    #[test]
    fn test_model_category_display() {
        assert_eq!(format!("{}", ModelCategory::LLM), "LLM");
        assert_eq!(format!("{}", ModelCategory::VLM), "VLM");
        assert_eq!(format!("{}", ModelCategory::VLA), "VLA");
    }

    #[test]
    fn test_vlm_capabilities() {
        let caps = ModalityCapabilities::vlm();
        assert!(caps.supports_input(Modality::Text));
        assert!(caps.supports_input(Modality::Image));
        assert!(!caps.supports_input(Modality::Action));
        assert!(caps.supports_vision());
        assert!(!caps.supports_actions());
    }

    #[test]
    fn test_vla_capabilities() {
        let caps = ModalityCapabilities::vla();
        assert!(caps.supports_input(Modality::Text));
        assert!(caps.supports_input(Modality::Image));
        assert!(caps.supports_input(Modality::Sensor));
        assert!(caps.supports_output(Modality::Action));
        assert!(caps.supports_output(Modality::Trajectory));
        assert!(caps.supports_vision());
        assert!(caps.supports_actions());
    }

    #[test]
    fn test_multimodal_message() {
        let mut msg = MultimodalMessage::text(MessageRole::User, "What's in this image?");
        msg.add_image(ImageContent {
            data: ImageData::Url {
                url: "https://example.com/image.jpg".to_string(),
                detail: Some(ImageDetail::Auto),
            },
            format: Some(ImageFormat::Jpeg),
            alt_text: Some("Test image".to_string()),
            regions: vec![],
        });

        assert_eq!(msg.images().len(), 1);
        assert_eq!(msg.text_content(), "What's in this image?");
    }
1166
1167 #[test]
1168 fn test_action_command() {
1169 let action = ActionCommand {
1170 action_type: ActionType::Grasp,
1171 parameters: ActionParameters::Gripper {
1172 aperture: 0.5,
1173 force: Some(10.0),
1174 },
1175 confidence: Some(0.95),
1176 timestamp: None,
1177 duration_ms: Some(500),
1178 };
1179
1180 assert_eq!(action.action_type, ActionType::Grasp);
1181 }
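
    #[test]
    fn test_sensor_values_untagged() {
        // `SensorValues` is untagged, so a bare JSON array of numbers should
        // fall through the struct-like variants and land in `Numeric`.
        let values: SensorValues = serde_json::from_str("[0.1, 0.2]").unwrap();
        assert!(matches!(values, SensorValues::Numeric(v) if v.len() == 2));
    }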

    #[test]
    fn test_vlm_models_registry() {
        let models = vlm_models();
        assert!(!models.is_empty());

        let gpt4o = models.iter().find(|m| m.id == "gpt-4o").unwrap();
        assert_eq!(gpt4o.category, ModelCategory::Multimodal);
        assert!(gpt4o.available);
    }

    #[test]
    fn test_vla_models_registry() {
        let models = vla_models();
        assert!(!models.is_empty());

        let openvla = models.iter().find(|m| m.id == "openvla").unwrap();
        assert_eq!(openvla.category, ModelCategory::VLA);
        assert!(openvla.local);
    }
}