aleph_alpha_client/prompt.rs

use std::{
    borrow::{Borrow, Cow},
    path::Path,
};

use base64::{prelude::BASE64_STANDARD, Engine};
use image::DynamicImage;
use itertools::Itertools;
use serde::Serialize;

use crate::image_preprocessing::{self, LoadImageError};

/// A prompt which is passed to the model for inference. Usually it is one text item, but it could
/// also be a combination of several modalities like images and text.
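///
/// A minimal usage sketch (the prompt text is illustrative):
///
/// ```
/// use aleph_alpha_client::Prompt;
///
/// // A prompt consisting of a single text item.
/// let prompt = Prompt::from_text("An apple a day");
/// ```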
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct Prompt<'a>(Vec<Modality<'a>>);

impl<'a> Prompt<'a> {
    /// Create a prompt from a single text item.
    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
        Self(vec![Modality::from_text(text)])
    }

    /// Create a multimodal prompt from a list of individual items with any modality.
    pub fn from_vec(items: Vec<Modality<'a>>) -> Self {
        Self(items)
    }

    /// Allows you to borrow the contents of the prompt without allocating a new one.
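    ///
    /// A minimal sketch (the owned string is illustrative):
    ///
    /// ```
    /// use aleph_alpha_client::Prompt;
    ///
    /// let owned = Prompt::from_text(String::from("An apple a day"));
    /// // Borrowing reuses the contained string instead of cloning it.
    /// let borrowed = owned.borrow();
    /// ```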
    pub fn borrow(&'a self) -> Prompt<'a> {
        Self(self.0.iter().map(|item| item.borrow()).collect())
    }

    /// When constructing prompts programmatically, it can be beneficial to append several
    /// text items to a prompt. For example, a few-shot prompt could be the first item and
    /// the user input the second item.
    ///
    /// However, because of how tokenization works, having each item tokenized separately
    /// can sometimes have strange side effects (tokenizing two partial strings does not
    /// necessarily produce the same tokens as tokenizing the strings joined together).
    ///
    /// This method takes an existing prompt and merges any consecutive text items,
    /// joining them with a given separator. Use an empty string as the separator if you
    /// want to just concatenate them.
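    ///
    /// A minimal usage sketch (the prompt texts are illustrative):
    ///
    /// ```
    /// use aleph_alpha_client::{Modality, Prompt};
    ///
    /// let mut prompt = Prompt::from_vec(vec![
    ///     Modality::from_text("Translate to French:"),
    ///     Modality::from_text("Hello, world!"),
    /// ]);
    /// // Merge both text items into a single item, separated by a newline.
    /// prompt.join_consecutive_text_items("\n");
    /// ```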
    pub fn join_consecutive_text_items(&mut self, separator: &str) {
        self.0 = self
            .0
            .drain(..)
            // `coalesce` merges two adjacent text items into one (`Ok`) and keeps any
            // other pair of items as is (`Err`).
            .coalesce(|a, b| match (a, b) {
                (Modality::Text { mut data }, Modality::Text { data: other }) => {
                    data.to_mut().push_str(separator);
                    data.to_mut().push_str(&other);
                    Ok(Modality::Text { data })
                }
                (a, b) => Err((a, b)),
            })
            .collect::<Vec<_>>();
    }
}

/// The prompt for a model can be a combination of different modalities (Text and Image). Which
/// modalities are supported depends on the model in question.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Modality<'a> {
    /// The only type of prompt which can be used with pure language models
    Text { data: Cow<'a, str> },
    /// An image input into the model. See [`Modality::from_image_path`].
    Image { data: Cow<'a, str> },
}

impl<'a> Modality<'a> {
    /// Instantiates a text prompt
    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
        Modality::Text { data: text.into() }
    }

    /// Image input for the model, from a file path.
    ///
    /// The model can only see square pictures. Images are center-cropped.
    ///
    /// ```no_run
    /// use aleph_alpha_client::{Client, How, Modality, Prompt, Sampling, Stopping, TaskCompletion,
    ///     Task, Logprobs};
    /// use dotenvy::dotenv;
    /// use std::path::PathBuf;
    ///
    /// #[tokio::main(flavor = "current_thread")]
    /// async fn main() {
    ///     // Create client
    ///     let client = Client::from_env().unwrap();
    ///     // Define task
    ///     let task = TaskCompletion {
    ///         prompt: Prompt::from_vec(vec![
    ///             Modality::from_image_path("cat.png").unwrap(),
    ///             Modality::from_text("A picture of "),
    ///         ]),
    ///         stopping: Stopping::from_maximum_tokens(10),
    ///         sampling: Sampling::MOST_LIKELY,
    ///         special_tokens: false,
    ///         logprobs: Logprobs::No,
    ///     };
    ///     // Execute
    ///     let model = "luminous-base";
    ///     let job = task.with_model(model);
    ///     let response = client.output_of(&job, &How::default()).await.unwrap();
    ///     // Show result
    ///     println!("{}", response.completion);
    /// }
    /// ```
    pub fn from_image_path(path: impl AsRef<Path>) -> Result<Self, LoadImageError> {
        let bytes = image_preprocessing::from_image_path(path.as_ref())?;
        Ok(Self::from_image_bytes(&bytes))
    }

    /// Image input for the model, from an image already loaded into memory.
    ///
    /// The model can only see square pictures. Images are center-cropped. You may want to use this
    /// method instead of [`Self::from_image_path`] in case you already have the image in memory
    /// and do not want to load it from a file again.
    pub fn from_image(image: &DynamicImage) -> Result<Self, LoadImageError> {
        let bytes = image_preprocessing::preprocess_image(image);
        Ok(Self::from_image_bytes(&bytes))
    }

    /// Generates an image input from the binary representation of the image.
    ///
    /// Using this constructor, you must provide a binary representation compatible with the API.
    /// PNG is guaranteed to be supported, and all other formats are converted into it. Furthermore,
    /// the model can only look at square-shaped pictures. If the picture is not square-shaped, it
    /// will be center-cropped.
    fn from_image_bytes(image: &[u8]) -> Self {
        Modality::Image {
            data: BASE64_STANDARD.encode(image).into(),
        }
    }

    /// Create a semantically identical modality which borrows the contents of this one.
    ///
    /// It is very practical to allow a Modality of e.g. Text to either take ownership of the string
    /// it contains or borrow a slice. However, when we are creating a body from the user input, we
    /// want to avoid copying everything and allocating for that modality again. This is where this
    /// borrow function really shines.
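    ///
    /// A minimal sketch (the owned text is illustrative):
    ///
    /// ```
    /// use aleph_alpha_client::Modality;
    ///
    /// // An owned text item ...
    /// let owned = Modality::from_text(String::from("Hello"));
    /// // ... can be borrowed without cloning the underlying string.
    /// let borrowed = owned.borrow();
    /// ```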
    pub fn borrow(&self) -> Modality<'_> {
        match self {
            Modality::Text { data } => Modality::Text {
                data: Cow::Borrowed(data.borrow()),
            },
            Modality::Image { data } => Modality::Image {
                data: Cow::Borrowed(data.borrow()),
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_concatenate_prompt_items() {
        let mut prompt =
            Prompt::from_vec(vec![Modality::from_text("foo"), Modality::from_text("bar")]);
        prompt.join_consecutive_text_items("");

        assert_eq!(prompt.0, vec![Modality::from_text("foobar")]);
    }

    #[test]
    fn can_concatenate_prompt_items_with_custom_separator() {
        let mut prompt =
            Prompt::from_vec(vec![Modality::from_text("foo"), Modality::from_text("bar")]);
        prompt.join_consecutive_text_items("\n");

        assert_eq!(prompt.0, vec![Modality::from_text("foo\nbar")]);
    }
}