chasm_cli/agency/modality.rs

// Copyright (c) 2024-2026 Nervosys LLC
// SPDX-License-Identifier: Apache-2.0
//! Model Modality Support
//!
//! Defines modalities for different types of AI models:
//! - LLM (Language Models) - Text-only
//! - VLM (Vision-Language Models) - Text + Images
//! - VLA (Vision-Language-Action Models) - Text + Images + Actions/Robotics
//! - ALM (Audio-Language Models) - Text + Audio
//! - VALM (Video-Audio-Language Models) - Text + Video + Audio

#![allow(dead_code)]

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

use super::models::MessageRole;

// =============================================================================
// Core Modality Types
// =============================================================================

/// Input/Output modality types
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Modality {
    /// Text input/output
    Text,
    /// Image input/output
    Image,
    /// Video input/output
    Video,
    /// Audio input/output
    Audio,
    /// 3D point cloud or mesh
    PointCloud,
    /// Action commands (for robotics/VLA)
    Action,
    /// Sensor data (proprioception, IMU, etc.)
    Sensor,
    /// Depth map
    Depth,
    /// Semantic segmentation
    Segmentation,
    /// Bounding boxes
    BoundingBox,
    /// Pose estimation
    Pose,
    /// Trajectory/path data
    Trajectory,
}

impl std::fmt::Display for Modality {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Modality::Text => write!(f, "text"),
            Modality::Image => write!(f, "image"),
            Modality::Video => write!(f, "video"),
            Modality::Audio => write!(f, "audio"),
            Modality::PointCloud => write!(f, "point_cloud"),
            Modality::Action => write!(f, "action"),
            Modality::Sensor => write!(f, "sensor"),
            Modality::Depth => write!(f, "depth"),
            Modality::Segmentation => write!(f, "segmentation"),
            Modality::BoundingBox => write!(f, "bounding_box"),
            Modality::Pose => write!(f, "pose"),
            Modality::Trajectory => write!(f, "trajectory"),
        }
    }
}

/// Model category based on modality support
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum ModelCategory {
    /// Language Model - Text only
    LLM,
    /// Vision-Language Model - Text + Images
    VLM,
    /// Vision-Language-Action Model - Text + Images + Actions
    VLA,
    /// Audio-Language Model - Text + Audio
    ALM,
    /// Video-Audio-Language Model - Text + Video + Audio
    VALM,
    /// Multimodal - supports multiple modalities
    Multimodal,
    /// Embodied AI - for robotics with full sensor/action support
    Embodied,
}

impl std::fmt::Display for ModelCategory {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ModelCategory::LLM => write!(f, "LLM"),
            ModelCategory::VLM => write!(f, "VLM"),
            ModelCategory::VLA => write!(f, "VLA"),
            ModelCategory::ALM => write!(f, "ALM"),
            ModelCategory::VALM => write!(f, "VALM"),
            ModelCategory::Multimodal => write!(f, "Multimodal"),
            ModelCategory::Embodied => write!(f, "Embodied"),
        }
    }
}

// =============================================================================
// Modality Capabilities
// =============================================================================

/// Describes what modalities a model can accept and produce
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModalityCapabilities {
    /// Model category
    pub category: ModelCategory,
    /// Supported input modalities
    pub input_modalities: Vec<Modality>,
    /// Supported output modalities
    pub output_modalities: Vec<Modality>,
    /// Maximum image resolution (width, height)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_image_resolution: Option<(u32, u32)>,
    /// Maximum video duration in seconds
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_video_duration: Option<u32>,
    /// Maximum audio duration in seconds
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_audio_duration: Option<u32>,
    /// Maximum images per request
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_images_per_request: Option<u32>,
    /// Supported image formats
    #[serde(default)]
    pub supported_image_formats: Vec<ImageFormat>,
    /// Supports real-time streaming
    #[serde(default)]
    pub supports_streaming: bool,
    /// Supports interleaved multi-turn with images
    #[serde(default)]
    pub supports_interleaved: bool,
}

impl Default for ModalityCapabilities {
    fn default() -> Self {
        Self {
            category: ModelCategory::LLM,
            input_modalities: vec![Modality::Text],
            output_modalities: vec![Modality::Text],
            max_image_resolution: None,
            max_video_duration: None,
            max_audio_duration: None,
            max_images_per_request: None,
            supported_image_formats: vec![],
            supports_streaming: false,
            supports_interleaved: false,
        }
    }
}

impl ModalityCapabilities {
    /// Create capabilities for a text-only LLM
    pub fn llm() -> Self {
        Self {
            category: ModelCategory::LLM,
            input_modalities: vec![Modality::Text],
            output_modalities: vec![Modality::Text],
            supports_streaming: true,
            ..Default::default()
        }
    }

    /// Create capabilities for a Vision-Language Model
    pub fn vlm() -> Self {
        Self {
            category: ModelCategory::VLM,
            input_modalities: vec![Modality::Text, Modality::Image],
            output_modalities: vec![Modality::Text],
            max_image_resolution: Some((4096, 4096)),
            max_images_per_request: Some(20),
            supported_image_formats: vec![
                ImageFormat::Png,
                ImageFormat::Jpeg,
                ImageFormat::Webp,
                ImageFormat::Gif,
            ],
            supports_streaming: true,
            supports_interleaved: true,
            ..Default::default()
        }
    }

    /// Create capabilities for a Vision-Language-Action Model
    pub fn vla() -> Self {
        Self {
            category: ModelCategory::VLA,
            input_modalities: vec![
                Modality::Text,
                Modality::Image,
                Modality::Sensor,
                Modality::Depth,
            ],
            output_modalities: vec![Modality::Text, Modality::Action, Modality::Trajectory],
            max_image_resolution: Some((1024, 1024)),
            max_images_per_request: Some(10),
            supported_image_formats: vec![ImageFormat::Png, ImageFormat::Jpeg],
            supports_streaming: true,
            supports_interleaved: true,
            ..Default::default()
        }
    }

    /// Create capabilities for a multimodal model (like GPT-4o or Gemini)
    pub fn multimodal() -> Self {
        Self {
            category: ModelCategory::Multimodal,
            input_modalities: vec![
                Modality::Text,
                Modality::Image,
                Modality::Audio,
                Modality::Video,
            ],
            output_modalities: vec![Modality::Text, Modality::Image, Modality::Audio],
            max_image_resolution: Some((4096, 4096)),
            max_video_duration: Some(3600),
            max_audio_duration: Some(3600),
            max_images_per_request: Some(50),
            supported_image_formats: vec![
                ImageFormat::Png,
                ImageFormat::Jpeg,
                ImageFormat::Webp,
                ImageFormat::Gif,
            ],
            supports_streaming: true,
            supports_interleaved: true,
        }
    }

    /// Create capabilities for an embodied AI model
    pub fn embodied() -> Self {
        Self {
            category: ModelCategory::Embodied,
            input_modalities: vec![
                Modality::Text,
                Modality::Image,
                Modality::Depth,
                Modality::PointCloud,
                Modality::Sensor,
                Modality::Pose,
            ],
            output_modalities: vec![
                Modality::Text,
                Modality::Action,
                Modality::Trajectory,
                Modality::Pose,
            ],
            max_image_resolution: Some((1280, 720)),
            max_images_per_request: Some(8),
            supported_image_formats: vec![ImageFormat::Png, ImageFormat::Jpeg],
            supports_streaming: true,
            supports_interleaved: true,
            ..Default::default()
        }
    }

    /// Check if this model supports a given input modality
    pub fn supports_input(&self, modality: Modality) -> bool {
        self.input_modalities.contains(&modality)
    }

    /// Check if this model supports a given output modality
    pub fn supports_output(&self, modality: Modality) -> bool {
        self.output_modalities.contains(&modality)
    }

    /// Check if this model supports vision input
    pub fn supports_vision(&self) -> bool {
        self.supports_input(Modality::Image) || self.supports_input(Modality::Video)
    }

    /// Check if this model supports action output
    pub fn supports_actions(&self) -> bool {
        self.supports_output(Modality::Action) || self.supports_output(Modality::Trajectory)
    }
}
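
// Example: gating a request against a model's declared capabilities. This is
// a minimal sketch; `request_fits` and the requested modality lists are
// illustrative only and not part of the chasm_cli API.
#[cfg(test)]
mod capability_gating_example {
    use super::*;

    /// Returns true only if every requested modality is supported.
    fn request_fits(
        caps: &ModalityCapabilities,
        requested_inputs: &[Modality],
        requested_outputs: &[Modality],
    ) -> bool {
        requested_inputs.iter().all(|m| caps.supports_input(*m))
            && requested_outputs.iter().all(|m| caps.supports_output(*m))
    }

    #[test]
    fn vla_accepts_image_in_action_out_but_llm_does_not() {
        let inputs = [Modality::Text, Modality::Image];
        let outputs = [Modality::Action];
        assert!(request_fits(&ModalityCapabilities::vla(), &inputs, &outputs));
        assert!(!request_fits(&ModalityCapabilities::llm(), &inputs, &outputs));
    }
}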

/// Supported image formats
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ImageFormat {
    Png,
    Jpeg,
    Webp,
    Gif,
    Bmp,
    Tiff,
    Heic,
}

impl std::fmt::Display for ImageFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ImageFormat::Png => write!(f, "png"),
            ImageFormat::Jpeg => write!(f, "jpeg"),
            ImageFormat::Webp => write!(f, "webp"),
            ImageFormat::Gif => write!(f, "gif"),
            ImageFormat::Bmp => write!(f, "bmp"),
            ImageFormat::Tiff => write!(f, "tiff"),
            ImageFormat::Heic => write!(f, "heic"),
        }
    }
}

// =============================================================================
// Multimodal Content Types
// =============================================================================

/// Image content for vision models
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageContent {
    /// Image data (base64 encoded or URL)
    pub data: ImageData,
    /// Image format
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub format: Option<ImageFormat>,
    /// Image description/alt text
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub alt_text: Option<String>,
    /// Bounding box regions of interest
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub regions: Vec<BoundingBoxRegion>,
}

/// Image data - either base64 encoded or URL reference
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ImageData {
    /// Base64 encoded image data
    Base64 {
        #[serde(rename = "base64")]
        data: String,
        media_type: String,
    },
    /// URL reference to image
    Url {
        url: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        detail: Option<ImageDetail>,
    },
}
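
// Example: because `ImageData` is `#[serde(untagged)]`, both variants serialize
// to plain JSON objects distinguished only by their fields (`url` vs.
// `base64` + `media_type`). The values below are placeholders.
#[cfg(test)]
mod image_data_serde_example {
    use super::*;

    #[test]
    fn url_and_base64_shapes() {
        let by_url = ImageData::Url {
            url: "https://example.com/cat.png".to_string(),
            detail: None,
        };
        let v = serde_json::to_value(&by_url).unwrap();
        assert_eq!(v["url"], "https://example.com/cat.png");
        assert!(v.get("detail").is_none()); // skipped when None

        let inline = ImageData::Base64 {
            data: "aGVsbG8=".to_string(),
            media_type: "image/png".to_string(),
        };
        let v = serde_json::to_value(&inline).unwrap();
        assert_eq!(v["base64"], "aGVsbG8="); // field renamed to "base64"
        assert_eq!(v["media_type"], "image/png");
    }
}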

/// Image detail level for vision models
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ImageDetail {
    /// Low resolution processing
    Low,
    /// High resolution processing
    High,
    /// Auto-select based on image size
    Auto,
}

/// Bounding box region in an image
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BoundingBoxRegion {
    /// Region label
    pub label: String,
    /// Normalized coordinates (0.0 - 1.0)
    pub x: f32,
    pub y: f32,
    pub width: f32,
    pub height: f32,
    /// Confidence score (0.0 - 1.0)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f32>,
}

/// Video content for video models
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoContent {
    /// Video URL or base64 data
    pub data: VideoData,
    /// Video duration in seconds
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub duration: Option<f32>,
    /// Start time for clip (seconds)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub start_time: Option<f32>,
    /// End time for clip (seconds)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub end_time: Option<f32>,
    /// Frame rate
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub fps: Option<f32>,
}

/// Video data - URL or uploaded frames
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum VideoData {
    /// URL reference to video
    Url { url: String },
    /// Sequence of frames as images
    Frames { frames: Vec<ImageContent> },
    /// Base64 encoded video
    Base64 { base64: String, media_type: String },
}

/// Audio content for audio models
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioContent {
    /// Audio data
    pub data: AudioData,
    /// Audio format (mp3, wav, etc.)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub format: Option<String>,
    /// Duration in seconds
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub duration: Option<f32>,
    /// Sample rate
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// Transcription (if available)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub transcription: Option<String>,
}

/// Audio data
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AudioData {
    /// URL reference to audio
    Url { url: String },
    /// Base64 encoded audio
    Base64 { base64: String, media_type: String },
}

// =============================================================================
// VLA (Vision-Language-Action) Types
// =============================================================================

/// Action command for VLA models
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ActionCommand {
    /// Action type
    pub action_type: ActionType,
    /// Action parameters
    pub parameters: ActionParameters,
    /// Confidence score (0.0 - 1.0)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f32>,
    /// Timestamp for this action
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timestamp: Option<DateTime<Utc>>,
    /// Duration of action in milliseconds
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub duration_ms: Option<u64>,
}

/// Types of robot actions
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ActionType {
    // Navigation actions
    Move,
    Rotate,
    Stop,

    // Manipulation actions
    Grasp,
    Release,
    Push,
    Pull,
    Place,
    Pick,

    // End effector actions
    Open,
    Close,

    // Arm actions
    MoveArm,
    MoveJoint,

    // Camera actions
    Look,
    Focus,

    // Generic
    Custom,
    Wait,
    Sequence,
}

/// Parameters for different action types
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ActionParameters {
    /// Movement parameters (x, y, z displacement or velocity)
    Movement {
        #[serde(default)]
        x: f64,
        #[serde(default)]
        y: f64,
        #[serde(default)]
        z: f64,
        /// Whether values are velocities or positions
        #[serde(default)]
        is_velocity: bool,
        /// Coordinate frame (world, robot, end_effector)
        #[serde(default, skip_serializing_if = "Option::is_none")]
        frame: Option<String>,
    },
    /// Rotation parameters (roll, pitch, yaw or quaternion)
    Rotation {
        #[serde(default, skip_serializing_if = "Option::is_none")]
        roll: Option<f64>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        pitch: Option<f64>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        yaw: Option<f64>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        quaternion: Option<[f64; 4]>,
    },
    /// Gripper parameters
    Gripper {
        /// Aperture (0.0 = closed, 1.0 = fully open)
        aperture: f64,
        /// Force limit
        #[serde(default, skip_serializing_if = "Option::is_none")]
        force: Option<f64>,
    },
    /// Joint positions
    JointPositions {
        /// Joint angles in radians
        positions: Vec<f64>,
        /// Joint names (if applicable)
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        joint_names: Vec<String>,
    },
    /// Target pose (position + orientation)
    TargetPose {
        position: [f64; 3],
        /// Quaternion [w, x, y, z]
        orientation: [f64; 4],
    },
    /// Trajectory of waypoints
    Trajectory {
        waypoints: Vec<Waypoint>,
        /// Interpolation method
        #[serde(default, skip_serializing_if = "Option::is_none")]
        interpolation: Option<String>,
    },
    /// Custom parameters as JSON
    Custom(serde_json::Value),
}
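
// Example: an `ActionCommand` serializes with a snake_case `action_type`, and
// because `ActionParameters` is untagged the parameters appear as a bare
// object. A sketch with placeholder values:
#[cfg(test)]
mod action_command_serde_example {
    use super::*;

    #[test]
    fn grasp_command_json_shape() {
        let cmd = ActionCommand {
            action_type: ActionType::Grasp,
            parameters: ActionParameters::Gripper {
                aperture: 0.2,
                force: Some(15.0),
            },
            confidence: Some(0.9),
            timestamp: None,
            duration_ms: Some(250),
        };
        let v = serde_json::to_value(&cmd).unwrap();
        assert_eq!(v["action_type"], "grasp");
        assert_eq!(v["parameters"]["aperture"], 0.2);
        assert_eq!(v["parameters"]["force"], 15.0);
        assert!(v.get("timestamp").is_none()); // skipped when None
    }
}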

/// Waypoint in a trajectory
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Waypoint {
    /// Position [x, y, z]
    pub position: [f64; 3],
    /// Orientation quaternion [w, x, y, z]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub orientation: Option<[f64; 4]>,
    /// Timestamp offset in seconds
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub time: Option<f64>,
    /// Gripper state at this waypoint
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gripper: Option<f64>,
}

/// Sensor data input for VLA models
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SensorData {
    /// Sensor type
    pub sensor_type: SensorType,
    /// Sensor values
    pub values: SensorValues,
    /// Timestamp
    pub timestamp: DateTime<Utc>,
    /// Sensor frame/reference
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub frame: Option<String>,
}

/// Types of sensors
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SensorType {
    /// Joint positions/velocities
    JointState,
    /// Inertial measurement unit
    Imu,
    /// Force/torque sensor
    ForceTorque,
    /// Depth camera
    Depth,
    /// LIDAR
    Lidar,
    /// GPS/Localization
    Localization,
    /// Touch/pressure sensor
    Tactile,
    /// Odometry
    Odometry,
    /// Custom sensor
    Custom,
}

/// Sensor values for different sensor types
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum SensorValues {
    /// Joint state (positions, velocities, efforts)
    JointState {
        positions: Vec<f64>,
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        velocities: Vec<f64>,
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        efforts: Vec<f64>,
    },
    /// IMU (acceleration, angular velocity)
    Imu {
        linear_acceleration: [f64; 3],
        angular_velocity: [f64; 3],
        #[serde(default, skip_serializing_if = "Option::is_none")]
        orientation: Option<[f64; 4]>,
    },
    /// Force/torque (6D wrench)
    ForceTorque { force: [f64; 3], torque: [f64; 3] },
    /// Depth map (as base64 or URL)
    Depth {
        data: String,
        width: u32,
        height: u32,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        encoding: Option<String>,
    },
    /// Point cloud
    PointCloud {
        points: Vec<[f64; 3]>,
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        colors: Vec<[u8; 3]>,
    },
    /// Pose (position + orientation)
    Pose {
        position: [f64; 3],
        orientation: [f64; 4],
    },
    /// Generic numeric values
    Numeric(Vec<f64>),
    /// Custom values as JSON
    Custom(serde_json::Value),
}
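
// Example: a proprioceptive reading packaged as `SensorData`. The untagged
// `SensorValues::Imu` variant serializes to a plain object with
// `linear_acceleration` / `angular_velocity` arrays. Values are placeholders.
#[cfg(test)]
mod sensor_data_example {
    use super::*;

    #[test]
    fn imu_reading_json_shape() {
        let reading = SensorData {
            sensor_type: SensorType::Imu,
            values: SensorValues::Imu {
                linear_acceleration: [0.0, 0.0, 9.81],
                angular_velocity: [0.0, 0.1, 0.0],
                orientation: None,
            },
            timestamp: Utc::now(),
            frame: Some("base_link".to_string()),
        };
        let v = serde_json::to_value(&reading).unwrap();
        assert_eq!(v["sensor_type"], "imu");
        assert_eq!(v["values"]["linear_acceleration"][2], 9.81);
    }
}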

// =============================================================================
// Multimodal Message Content
// =============================================================================

/// Content part of a multimodal message
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ContentPart {
    /// Text content
    Text { text: String },
    /// Image content
    Image(ImageContent),
    /// Video content
    Video(VideoContent),
    /// Audio content
    Audio(AudioContent),
    /// Sensor data (for VLA)
    Sensor(SensorData),
    /// Action command (for VLA output)
    Action(ActionCommand),
    /// File reference
    File {
        url: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        mime_type: Option<String>,
    },
}
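
// Example: `ContentPart` is internally tagged, so every part carries a
// `"type"` discriminator alongside its payload fields. URLs below are
// placeholders.
#[cfg(test)]
mod content_part_serde_example {
    use super::*;

    #[test]
    fn parts_carry_a_type_tag() {
        let text = ContentPart::Text {
            text: "hello".to_string(),
        };
        let v = serde_json::to_value(&text).unwrap();
        assert_eq!(v["type"], "text");
        assert_eq!(v["text"], "hello");

        let file = ContentPart::File {
            url: "https://example.com/notes.pdf".to_string(),
            mime_type: Some("application/pdf".to_string()),
        };
        let v = serde_json::to_value(&file).unwrap();
        assert_eq!(v["type"], "file");
        assert_eq!(v["mime_type"], "application/pdf");
    }
}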

/// A multimodal message that can contain mixed content types
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultimodalMessage {
    /// Message role
    pub role: MessageRole,
    /// Content parts
    pub content: Vec<ContentPart>,
    /// Timestamp
    #[serde(default = "Utc::now")]
    pub timestamp: DateTime<Utc>,
    /// Additional metadata
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata: HashMap<String, serde_json::Value>,
}

impl MultimodalMessage {
    /// Create a new text-only message
    pub fn text(role: MessageRole, text: impl Into<String>) -> Self {
        Self {
            role,
            content: vec![ContentPart::Text { text: text.into() }],
            timestamp: Utc::now(),
            metadata: HashMap::new(),
        }
    }

    /// Create a message with text and image
    pub fn with_image(role: MessageRole, text: impl Into<String>, image: ImageContent) -> Self {
        Self {
            role,
            content: vec![
                ContentPart::Text { text: text.into() },
                ContentPart::Image(image),
            ],
            timestamp: Utc::now(),
            metadata: HashMap::new(),
        }
    }

    /// Add an image to this message
    pub fn add_image(&mut self, image: ImageContent) {
        self.content.push(ContentPart::Image(image));
    }

    /// Add sensor data to this message
    pub fn add_sensor(&mut self, sensor: SensorData) {
        self.content.push(ContentPart::Sensor(sensor));
    }

    /// Add an action to this message
    pub fn add_action(&mut self, action: ActionCommand) {
        self.content.push(ContentPart::Action(action));
    }

    /// Get all text content concatenated
    pub fn text_content(&self) -> String {
        self.content
            .iter()
            .filter_map(|part| match part {
                ContentPart::Text { text } => Some(text.as_str()),
                _ => None,
            })
            .collect::<Vec<_>>()
            .join("\n")
    }

    /// Get all images in this message
    pub fn images(&self) -> Vec<&ImageContent> {
        self.content
            .iter()
            .filter_map(|part| match part {
                ContentPart::Image(img) => Some(img),
                _ => None,
            })
            .collect()
    }

    /// Get all actions in this message
    pub fn actions(&self) -> Vec<&ActionCommand> {
        self.content
            .iter()
            .filter_map(|part| match part {
                ContentPart::Action(action) => Some(action),
                _ => None,
            })
            .collect()
    }
}
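
// Example: assembling a single VLA-style observation turn (instruction,
// camera frame, and a joint-state reading) and reading the pieces back out.
// The URL and joint values are placeholders.
#[cfg(test)]
mod multimodal_message_example {
    use super::*;

    #[test]
    fn observation_turn() {
        let mut msg = MultimodalMessage::text(MessageRole::User, "Pick up the red block.");
        msg.add_image(ImageContent {
            data: ImageData::Url {
                url: "https://example.com/frame_000.png".to_string(),
                detail: Some(ImageDetail::Low),
            },
            format: Some(ImageFormat::Png),
            alt_text: None,
            regions: vec![],
        });
        msg.add_sensor(SensorData {
            sensor_type: SensorType::JointState,
            values: SensorValues::Numeric(vec![0.0, 0.5, -0.5]),
            timestamp: Utc::now(),
            frame: None,
        });

        assert_eq!(msg.text_content(), "Pick up the red block.");
        assert_eq!(msg.images().len(), 1);
        assert!(msg.actions().is_empty());
    }
}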

// =============================================================================
// VLM/VLA Model Registry
// =============================================================================

/// Known VLM/VLA model with capabilities
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultimodalModel {
    /// Model identifier
    pub id: String,
    /// Display name
    pub name: String,
    /// Provider name
    pub provider: String,
    /// Model category
    pub category: ModelCategory,
    /// Modality capabilities
    pub capabilities: ModalityCapabilities,
    /// Maximum context length (tokens)
    pub max_context: u32,
    /// Model version
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Release date
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub release_date: Option<String>,
    /// Model description
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Pricing info
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pricing: Option<ModelPricing>,
    /// Whether model is available via API
    #[serde(default)]
    pub available: bool,
    /// Whether model can run locally
    #[serde(default)]
    pub local: bool,
}

/// Model pricing information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelPricing {
    /// Cost per million input tokens
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_per_million: Option<f64>,
    /// Cost per million output tokens
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_per_million: Option<f64>,
    /// Cost per image
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_image: Option<f64>,
    /// Cost per minute of video
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_video_minute: Option<f64>,
    /// Cost per minute of audio
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_audio_minute: Option<f64>,
    /// Currency
    #[serde(default = "default_currency")]
    pub currency: String,
}

fn default_currency() -> String {
    "USD".to_string()
}
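
// Example: a rough per-request cost estimate built from `ModelPricing`.
// `estimate_cost` is an illustrative helper, not an exported API, and the
// token/image counts are made-up inputs; real usage figures would come from
// the provider's response metadata.
#[cfg(test)]
mod pricing_example {
    use super::*;

    fn estimate_cost(pricing: &ModelPricing, input_tokens: u64, output_tokens: u64, images: u32) -> f64 {
        let input = pricing.input_per_million.unwrap_or(0.0) * input_tokens as f64 / 1_000_000.0;
        let output = pricing.output_per_million.unwrap_or(0.0) * output_tokens as f64 / 1_000_000.0;
        let image = pricing.per_image.unwrap_or(0.0) * images as f64;
        input + output + image
    }

    #[test]
    fn token_based_pricing_estimate() {
        let pricing = ModelPricing {
            input_per_million: Some(2.50),
            output_per_million: Some(10.00),
            per_image: None,
            per_video_minute: None,
            per_audio_minute: None,
            currency: default_currency(),
        };
        // 10k input + 1k output tokens => 0.025 + 0.010 = 0.035
        let cost = estimate_cost(&pricing, 10_000, 1_000, 0);
        assert!((cost - 0.035).abs() < 1e-9);
    }
}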

/// Get built-in VLM models
pub fn vlm_models() -> Vec<MultimodalModel> {
    vec![
        // OpenAI VLMs
        MultimodalModel {
            id: "gpt-4o".to_string(),
            name: "GPT-4o".to_string(),
            provider: "OpenAI".to_string(),
            category: ModelCategory::Multimodal,
            capabilities: ModalityCapabilities::multimodal(),
            max_context: 128000,
            version: Some("2024-11-20".to_string()),
            release_date: Some("2024-05-13".to_string()),
            description: Some("Most capable GPT-4 with vision, audio, and text".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(2.50),
                output_per_million: Some(10.00),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        MultimodalModel {
            id: "gpt-4o-mini".to_string(),
            name: "GPT-4o Mini".to_string(),
            provider: "OpenAI".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 128000,
            version: Some("2024-07-18".to_string()),
            release_date: Some("2024-07-18".to_string()),
            description: Some("Affordable small model with vision capabilities".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(0.15),
                output_per_million: Some(0.60),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        // Google VLMs
        MultimodalModel {
            id: "gemini-2.0-flash".to_string(),
            name: "Gemini 2.0 Flash".to_string(),
            provider: "Google".to_string(),
            category: ModelCategory::Multimodal,
            capabilities: ModalityCapabilities::multimodal(),
            max_context: 1000000,
            version: Some("2.0".to_string()),
            release_date: Some("2024-12-11".to_string()),
            description: Some("Fastest Gemini with native multimodal generation".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(0.075),
                output_per_million: Some(0.30),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        MultimodalModel {
            id: "gemini-1.5-pro".to_string(),
            name: "Gemini 1.5 Pro".to_string(),
            provider: "Google".to_string(),
            category: ModelCategory::Multimodal,
            capabilities: ModalityCapabilities::multimodal(),
            max_context: 2000000,
            version: Some("1.5".to_string()),
            release_date: Some("2024-02-15".to_string()),
            description: Some("2M context window with video understanding".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(1.25),
                output_per_million: Some(5.00),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        // Anthropic VLMs
        MultimodalModel {
            id: "claude-3-5-sonnet".to_string(),
            name: "Claude 3.5 Sonnet".to_string(),
            provider: "Anthropic".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 200000,
            version: Some("20241022".to_string()),
            release_date: Some("2024-10-22".to_string()),
            description: Some("Best overall Claude with strong vision".to_string()),
            pricing: Some(ModelPricing {
                input_per_million: Some(3.00),
                output_per_million: Some(15.00),
                per_image: None,
                per_video_minute: None,
                per_audio_minute: None,
                currency: "USD".to_string(),
            }),
            available: true,
            local: false,
        },
        // Local/Open VLMs
        MultimodalModel {
            id: "llava-1.6".to_string(),
            name: "LLaVA 1.6".to_string(),
            provider: "Open Source".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 4096,
            version: Some("1.6".to_string()),
            release_date: Some("2024-01-30".to_string()),
            description: Some("Open-source vision-language model".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "qwen2-vl".to_string(),
            name: "Qwen2-VL".to_string(),
            provider: "Alibaba".to_string(),
            category: ModelCategory::VLM,
            capabilities: {
                let mut caps = ModalityCapabilities::vlm();
                caps.input_modalities.push(Modality::Video);
                caps
            },
            max_context: 32768,
            version: Some("2.0".to_string()),
            release_date: Some("2024-08-29".to_string()),
            description: Some("Strong open VLM with video understanding".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "pixtral-12b".to_string(),
            name: "Pixtral 12B".to_string(),
            provider: "Mistral".to_string(),
            category: ModelCategory::VLM,
            capabilities: ModalityCapabilities::vlm(),
            max_context: 128000,
            version: Some("1.0".to_string()),
            release_date: Some("2024-09-11".to_string()),
            description: Some("Mistral's vision model, runs locally".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
    ]
}

/// Get built-in VLA models
pub fn vla_models() -> Vec<MultimodalModel> {
    vec![
        MultimodalModel {
            id: "rt-2".to_string(),
            name: "RT-2".to_string(),
            provider: "Google DeepMind".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("2.0".to_string()),
            release_date: Some("2023-07-28".to_string()),
            description: Some("Robotics Transformer 2 - vision-language-action model".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
        MultimodalModel {
            id: "rt-x".to_string(),
            name: "RT-X".to_string(),
            provider: "Open X-Embodiment".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("1.0".to_string()),
            release_date: Some("2023-10-05".to_string()),
            description: Some("Cross-embodiment robotics foundation model".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "octo".to_string(),
            name: "Octo".to_string(),
            provider: "Berkeley AI Research".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 2048,
            version: Some("1.0".to_string()),
            release_date: Some("2024-05-10".to_string()),
            description: Some("Generalist robot policy from Open X-Embodiment".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "openvla".to_string(),
            name: "OpenVLA".to_string(),
            provider: "Stanford/Berkeley".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("7B".to_string()),
            release_date: Some("2024-06-13".to_string()),
            description: Some("Open-source 7B parameter VLA model".to_string()),
            pricing: None,
            available: true,
            local: true,
        },
        MultimodalModel {
            id: "palm-e".to_string(),
            name: "PaLM-E".to_string(),
            provider: "Google".to_string(),
            category: ModelCategory::Embodied,
            capabilities: ModalityCapabilities::embodied(),
            max_context: 8192,
            version: Some("562B".to_string()),
            release_date: Some("2023-03-06".to_string()),
            description: Some("Embodied multimodal language model".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
        MultimodalModel {
            id: "gr-1".to_string(),
            name: "GR-1".to_string(),
            provider: "Fourier Intelligence".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 2048,
            version: Some("1.0".to_string()),
            release_date: Some("2024-03-18".to_string()),
            description: Some("VLA for humanoid robot manipulation".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
        MultimodalModel {
            id: "pi0".to_string(),
            name: "Pi-Zero".to_string(),
            provider: "Physical Intelligence".to_string(),
            category: ModelCategory::VLA,
            capabilities: ModalityCapabilities::vla(),
            max_context: 4096,
            version: Some("1.0".to_string()),
            release_date: Some("2024-10-31".to_string()),
            description: Some("General-purpose robot foundation model".to_string()),
            pricing: None,
            available: false,
            local: false,
        },
    ]
}
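
// Example: querying the built-in registries, e.g. to list locally runnable
// vision models or VLAs that can actually be used for action output.
#[cfg(test)]
mod registry_query_example {
    use super::*;

    #[test]
    fn filter_local_vision_and_available_vla() {
        let local_vlms: Vec<_> = vlm_models()
            .into_iter()
            .filter(|m| m.local && m.capabilities.supports_vision())
            .collect();
        assert!(local_vlms.iter().any(|m| m.id == "llava-1.6"));

        let runnable_vlas: Vec<_> = vla_models()
            .into_iter()
            .filter(|m| m.available && m.capabilities.supports_actions())
            .collect();
        assert!(runnable_vlas.iter().any(|m| m.id == "openvla"));
    }
}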

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_modality_display() {
        assert_eq!(format!("{}", Modality::Text), "text");
        assert_eq!(format!("{}", Modality::Image), "image");
        assert_eq!(format!("{}", Modality::Action), "action");
    }

    #[test]
    fn test_model_category_display() {
        assert_eq!(format!("{}", ModelCategory::LLM), "LLM");
        assert_eq!(format!("{}", ModelCategory::VLM), "VLM");
        assert_eq!(format!("{}", ModelCategory::VLA), "VLA");
    }

    #[test]
    fn test_vlm_capabilities() {
        let caps = ModalityCapabilities::vlm();
        assert!(caps.supports_input(Modality::Text));
        assert!(caps.supports_input(Modality::Image));
        assert!(!caps.supports_input(Modality::Action));
        assert!(caps.supports_vision());
        assert!(!caps.supports_actions());
    }

    #[test]
    fn test_vla_capabilities() {
        let caps = ModalityCapabilities::vla();
        assert!(caps.supports_input(Modality::Text));
        assert!(caps.supports_input(Modality::Image));
        assert!(caps.supports_input(Modality::Sensor));
        assert!(caps.supports_output(Modality::Action));
        assert!(caps.supports_output(Modality::Trajectory));
        assert!(caps.supports_vision());
        assert!(caps.supports_actions());
    }

    #[test]
    fn test_multimodal_message() {
        let mut msg = MultimodalMessage::text(MessageRole::User, "What's in this image?");
        msg.add_image(ImageContent {
            data: ImageData::Url {
                url: "https://example.com/image.jpg".to_string(),
                detail: Some(ImageDetail::Auto),
            },
            format: Some(ImageFormat::Jpeg),
            alt_text: Some("Test image".to_string()),
            regions: vec![],
        });

        assert_eq!(msg.images().len(), 1);
        assert_eq!(msg.text_content(), "What's in this image?");
    }

    #[test]
    fn test_action_command() {
        let action = ActionCommand {
            action_type: ActionType::Grasp,
            parameters: ActionParameters::Gripper {
                aperture: 0.5,
                force: Some(10.0),
            },
            confidence: Some(0.95),
            timestamp: None,
            duration_ms: Some(500),
        };

        assert_eq!(action.action_type, ActionType::Grasp);
    }

    #[test]
    fn test_vlm_models_registry() {
        let models = vlm_models();
        assert!(!models.is_empty());

        let gpt4o = models.iter().find(|m| m.id == "gpt-4o").unwrap();
        assert_eq!(gpt4o.category, ModelCategory::Multimodal);
        assert!(gpt4o.available);
    }

    #[test]
    fn test_vla_models_registry() {
        let models = vla_models();
        assert!(!models.is_empty());

        let openvla = models.iter().find(|m| m.id == "openvla").unwrap();
        assert_eq!(openvla.category, ModelCategory::VLA);
        assert!(openvla.local);
    }
}