Struct ClipVisionEncoder

Source

pub struct ClipVisionEncoder {
    pub config: ClipVisionConfig,
    pub patch_embed: PatchEmbed,
    pub pos_embed: LearnablePosEmbed,
    pub encoder: ViTEncoder,
    pub cls_token: Vec<f32>,
}

Expand description

CLIP vision encoder: a ViT backbone that produces a single CLS-token embedding of size embed_dim per image.

Pipeline:

image [C × H × W]
  → patch_embed    → [n_patches, embed_dim]
  → prepend_cls    → [n_patches + 1, embed_dim]
  → add_pos_embed  → [n_patches + 1, embed_dim]
  → encoder        → [n_patches + 1, embed_dim]
  → tokens[0]      → [embed_dim]   (CLS token output)

Fields§

§config: ClipVisionConfig

Full configuration.

§patch_embed: PatchEmbed

Strided Conv2D patch embedder.

§pos_embed: LearnablePosEmbed

Learnable positional embeddings: n_patches + 1 positions (incl. CLS).

§encoder: ViTEncoder

Stack of ViT transformer blocks with final layer-norm.

§cls_token: Vec<f32>

CLS token: flat [embed_dim], Gaussian-initialised with scale 0.02.

Implementations§

Source §

impl ClipVisionEncoder

Source

pub fn new(cfg: ClipVisionConfig, rng: &mut LcgRng) -> VisionResult<Self>

Construct a new CLIP vision encoder.

Initialises:

Patch embedder (Conv2D kernel, bias).
Learnable positional embedding table with n_patches + 1 rows.
ViT encoder stack.
CLS token vector (N(0, 0.02²)).

§Errors

Propagates any errors from the sub-component constructors.

Source

pub fn forward_single(&self, image: &[f32]) -> VisionResult<Vec<f32>>

Run the encoder on a single image and return the CLS embedding.

§Parameters

image: flat [in_chans × img_size × img_size] CHW buffer.

§Returns

[embed_dim] CLS-token embedding.

§Errors

Returns VisionError::DimensionMismatch if the image size does not match the configured dimensions.

Source

pub fn forward_batch( &self, images: &[f32], batch_size: usize, ) -> VisionResult<Vec<Vec<f32>>>

Run the encoder on a batch of images.

§Parameters

images: flat [batch × in_chans × img_size × img_size] buffer.
batch_size: number of images.

§Returns

Vec<Vec<f32>> of length batch_size, each element is [embed_dim].

§Errors

Returns VisionError::DimensionMismatch if the flat buffer length does not match batch_size × in_chans × img_size × img_size, or if any individual forward pass fails.

Auto Trait Implementations§

§

impl UnwindSafe for ClipVisionEncoder

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

ClipVisionEncoder

Struct ClipVisionEncoder

Fields§

Implementations§

impl ClipVisionEncoder

pub fn new(cfg: ClipVisionConfig, rng: &mut LcgRng) -> VisionResult<Self>

§Errors

pub fn forward_single(&self, image: &[f32]) -> VisionResult<Vec<f32>>

§Parameters

§Returns

§Errors

pub fn forward_batch( &self, images: &[f32], batch_size: usize, ) -> VisionResult<Vec<Vec<f32>>>

§Parameters

§Returns

§Errors

Auto Trait Implementations§

impl Freeze for ClipVisionEncoder

impl RefUnwindSafe for ClipVisionEncoder

impl Send for ClipVisionEncoder

impl Sync for ClipVisionEncoder

impl Unpin for ClipVisionEncoder

impl UnsafeUnpin for ClipVisionEncoder

impl UnwindSafe for ClipVisionEncoder

Blanket Implementations§

impl<T> Any for T
where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T
where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T
where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T
where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T
where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct ClipVisionEncoder Copy item path

Fields§

Implementations§

impl ClipVisionEncoder

pub fn new(cfg: ClipVisionConfig, rng: &mut LcgRng) -> VisionResult<Self>

§Errors

pub fn forward_single(&self, image: &[f32]) -> VisionResult<Vec<f32>>

§Parameters

§Returns

§Errors

pub fn forward_batch( &self, images: &[f32], batch_size: usize, ) -> VisionResult<Vec<Vec<f32>>>

§Parameters

§Returns

§Errors

Auto Trait Implementations§

impl Freeze for ClipVisionEncoder

impl RefUnwindSafe for ClipVisionEncoder

impl Send for ClipVisionEncoder

impl Sync for ClipVisionEncoder

impl Unpin for ClipVisionEncoder

impl UnsafeUnpin for ClipVisionEncoder

impl UnwindSafe for ClipVisionEncoder

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct ClipVisionEncoder

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,