aleph_alpha_client/
prompt.rs

1use std::{
2    borrow::{Borrow, Cow},
3    path::Path,
4};
5
6use base64::{prelude::BASE64_STANDARD, Engine};
7use image::DynamicImage;
8use itertools::Itertools;
9use serde::Serialize;
10
11use crate::image_preprocessing::{self, LoadImageError};
12
/// A prompt which is passed to the model for inference. Usually it is one text item, but it could
/// also be a combination of several modalities like images and text.
///
/// Wraps a list of [`Modality`] items; as a serde newtype struct it serializes as that list
/// directly. The field is private — construct prompts via [`Prompt::from_text`] or
/// [`Prompt::from_vec`].
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct Prompt<'a>(Vec<Modality<'a>>);
17
18impl<'a> Prompt<'a> {
19    /// Create a prompt from a single text item.
20    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
21        Self(vec![Modality::from_text(text)])
22    }
23
24    /// Create a multimodal prompt from a list of individual items with any modality.
25    pub fn from_vec(items: Vec<Modality<'a>>) -> Self {
26        Self(items)
27    }
28
29    /// Allows you to borrow the contents of the prompt without allocating a new one.
30    pub fn borrow(&'a self) -> Prompt<'a> {
31        Self(self.0.iter().map(|item| item.borrow()).collect())
32    }
33
34    /// When constructing prompts programatically, it can be beneficial to append several
35    /// text items in a prompt. For example, if doing a fewshot prompt as the first item,
36    /// and user input as a second item.
37    ///
38    /// However, because of how tokenization works, having each item tokenized separately
39    /// can sometimes have strange side effects (tokenizing two partial strings does not
40    /// necessarily produce the same tokens as tokenizing the strings joined together).
41    ///
42    /// This method will take an existing prompt and merge any consecutive prompt items
43    /// by a given separator. You can use an empty string for the separator if you want
44    /// to just concatenate them.
45    pub fn join_consecutive_text_items(&mut self, separator: &str) {
46        self.0 = self
47            .0
48            .drain(..)
49            .coalesce(|a, b| match (a, b) {
50                (Modality::Text { mut data }, Modality::Text { data: other }) => {
51                    data.to_mut().push_str(separator);
52                    data.to_mut().push_str(&other);
53                    Ok(Modality::Text { data })
54                }
55                (a, b) => Err((a, b)),
56            })
57            .collect::<Vec<_>>();
58    }
59}
60
/// The prompt for models can be a combination of different modalities (Text and Image). The type of
/// modalities which are supported depend on the Model in question.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Modality<'a> {
    /// The only type of prompt which can be used with pure language models
    Text { data: Cow<'a, str> },
    /// An image input into the model, stored as a base64-encoded string.
    /// See [`Modality::from_image_path`].
    Image { data: Cow<'a, str> },
}
71
72impl<'a> Modality<'a> {
73    /// Instantiates a text prompt
74    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
75        Modality::Text { data: text.into() }
76    }
77
78    /// Image input for model, from file path.
79    ///
80    /// The model can only see squared pictures. Images are centercropped.
81    ///
82    /// ```no_run
83    /// use aleph_alpha_client::{Client, How, Modality, Prompt, Sampling, Stopping, TaskCompletion,
84    ///     Task, Logprobs};
85    /// use dotenvy::dotenv;
86    /// use std::path::PathBuf;
87    ///
88    /// #[tokio::main(flavor = "current_thread")]
89    /// async fn main() {
90    ///     // Create client
91    ///     let client = Client::from_env().unwrap();
92    ///     // Define task
93    ///     let task = TaskCompletion {
94    ///         prompt: Prompt::from_vec(vec![
95    ///             Modality::from_image_path("cat.png").unwrap(),
96    ///             Modality::from_text("A picture of "),
97    ///         ]),
98    ///         stopping: Stopping::from_maximum_tokens(10),
99    ///         sampling: Sampling::MOST_LIKELY,
100    ///         special_tokens: false,
101    ///         logprobs: Logprobs::No,
102    ///         echo: false,
103    ///     };
104    ///     // Execute
105    ///     let model = "luminous-base";
106    ///     let job = task.with_model(model);
107    ///     let response = client.output_of(&job, &How::default()).await.unwrap();
108    ///     // Show result
109    ///     println!("{}", response.completion);
110    /// }
111    /// ```
112    pub fn from_image_path(path: impl AsRef<Path>) -> Result<Self, LoadImageError> {
113        let bytes = image_preprocessing::from_image_path(path.as_ref())?;
114        Ok(Self::from_image_bytes(&bytes))
115    }
116
117    /// Image input for model
118    ///
119    /// The model can only see squared pictures. Images are centercropped. You may want to use this
120    /// method instead of [`Self::from_image_path`] in case you have the image in memory already
121    /// and do not want to load it from a file again.
122    pub fn from_image(image: &DynamicImage) -> Result<Self, LoadImageError> {
123        let bytes = image_preprocessing::preprocess_image(image);
124        Ok(Self::from_image_bytes(&bytes))
125    }
126
127    /// Generates an image input from the binary representation of the image.
128    ///
129    /// Using this constructor you must use a binary representation compatible with the API. Png is
130    /// guaranteed to be supported, and all others formats are converted into it. Furthermore, the
131    /// model can only look at square shaped pictures. If the picture is not square shaped it will
132    /// be center cropped.
133    fn from_image_bytes(image: &[u8]) -> Self {
134        Modality::Image {
135            data: BASE64_STANDARD.encode(image).into(),
136        }
137    }
138
139    /// Create a semantically idetical entry of modality which borrows the contents of this one.
140    ///
141    /// It is very practical to allow Modality of e.g. Text to take both ownership of the string it
142    /// contains as well as borrow a slice. However then we are creating a body from the user input
143    /// we want to avoid copying everything and needing to allocate for that modality again. This is
144    /// there this borrow function really shines.
145    pub fn borrow(&self) -> Modality<'_> {
146        match self {
147            Modality::Text { data } => Modality::Text {
148                data: Cow::Borrowed(data.borrow()),
149            },
150            Modality::Image { data } => Modality::Image {
151                data: Cow::Borrowed(data.borrow()),
152            },
153        }
154    }
155}
156
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_concatenate_prompt_items() {
        // Given a prompt consisting of two adjacent text items
        let mut prompt = Prompt::from_vec(vec![
            Modality::from_text("foo"),
            Modality::from_text("bar"),
        ]);

        // When joining with the empty separator
        prompt.join_consecutive_text_items("");

        // Then both collapse into one concatenated text item
        assert_eq!(prompt.0, vec![Modality::from_text("foobar")]);
    }

    #[test]
    fn can_concatenate_prompt_items_with_custom_separator() {
        // Given a prompt consisting of two adjacent text items
        let mut prompt = Prompt::from_vec(vec![
            Modality::from_text("foo"),
            Modality::from_text("bar"),
        ]);

        // When joining with a newline separator
        prompt.join_consecutive_text_items("\n");

        // Then the separator appears between the merged items
        assert_eq!(prompt.0, vec![Modality::from_text("foo\nbar")]);
    }
}