aleph_alpha_client/prompt.rs
use std::{
    borrow::{Borrow, Cow},
    path::Path,
};

use base64::{prelude::BASE64_STANDARD, Engine};
use image::DynamicImage;
use itertools::Itertools;
use serde::Serialize;

use crate::image_preprocessing::{self, LoadImageError};

/// A prompt which is passed to the model for inference. Usually it is one text item, but it
/// could also be a combination of several modalities like images and text.
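///
/// A minimal, text-only construction sketch (a multimodal prompt is shown in
/// [`Modality::from_image_path`]):
///
/// ```
/// use aleph_alpha_client::Prompt;
///
/// let prompt = Prompt::from_text("An apple a day");
/// ```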
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct Prompt<'a>(Vec<Modality<'a>>);

impl<'a> Prompt<'a> {
    /// Create a prompt from a single text item.
    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
        Self(vec![Modality::from_text(text)])
    }

    /// Create a multimodal prompt from a list of individual items with any modality.
    pub fn from_vec(items: Vec<Modality<'a>>) -> Self {
        Self(items)
    }

    /// Allows you to borrow the contents of the prompt without allocating a new one.
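    ///
    /// A small sketch: the borrowed prompt reuses the original contents, so no new string
    /// allocation is needed.
    ///
    /// ```
    /// use aleph_alpha_client::Prompt;
    ///
    /// let owned = Prompt::from_text(String::from("An apple a day"));
    /// // `borrowed` is valid for as long as `owned` and references its contents.
    /// let borrowed = owned.borrow();
    /// ```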
    pub fn borrow(&'a self) -> Prompt<'a> {
        Self(self.0.iter().map(|item| item.borrow()).collect())
    }

    /// When constructing prompts programmatically, it can be beneficial to append several
    /// text items to a prompt, e.g. a few-shot example as the first item and user input as
    /// the second.
    ///
    /// However, because of how tokenization works, having each item tokenized separately
    /// can have surprising side effects: tokenizing two partial strings does not
    /// necessarily produce the same tokens as tokenizing the strings joined together.
    ///
    /// This method takes an existing prompt and merges any consecutive text items,
    /// inserting the given separator between them. Use an empty string as the separator
    /// if you just want to concatenate them.
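    ///
    /// A short sketch of the merge:
    ///
    /// ```
    /// use aleph_alpha_client::{Modality, Prompt};
    ///
    /// let mut prompt = Prompt::from_vec(vec![
    ///     Modality::from_text("Few-shot examples"),
    ///     Modality::from_text("User input"),
    /// ]);
    /// // Merges both items into the single text item "Few-shot examples\nUser input",
    /// // so they are tokenized together.
    /// prompt.join_consecutive_text_items("\n");
    /// ```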
    pub fn join_consecutive_text_items(&mut self, separator: &str) {
        self.0 = self
            .0
            .drain(..)
            .coalesce(|a, b| match (a, b) {
                (Modality::Text { mut data }, Modality::Text { data: other }) => {
                    data.to_mut().push_str(separator);
                    data.to_mut().push_str(&other);
                    Ok(Modality::Text { data })
                }
                (a, b) => Err((a, b)),
            })
            .collect::<Vec<_>>();
    }
}

/// The prompt for models can be a combination of different modalities (Text and Image). The
/// types of modalities which are supported depend on the model in question.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Modality<'a> {
    /// The only type of prompt which can be used with pure language models
    Text { data: Cow<'a, str> },
    /// An image input into the model. See [`Modality::from_image_path`].
    Image { data: Cow<'a, str> },
}

impl<'a> Modality<'a> {
    /// Instantiates a text prompt
    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
        Modality::Text { data: text.into() }
    }

    /// Image input for the model, from a file path.
    ///
    /// The model can only see square pictures. Images are center-cropped.
    ///
    /// ```no_run
    /// use aleph_alpha_client::{
    ///     Client, How, Logprobs, Modality, Prompt, Sampling, Stopping, Task, TaskCompletion,
    /// };
    ///
    /// #[tokio::main(flavor = "current_thread")]
    /// async fn main() {
    ///     // Create client
    ///     let client = Client::from_env().unwrap();
    ///     // Define task
    ///     let task = TaskCompletion {
    ///         prompt: Prompt::from_vec(vec![
    ///             Modality::from_image_path("cat.png").unwrap(),
    ///             Modality::from_text("A picture of "),
    ///         ]),
    ///         stopping: Stopping::from_maximum_tokens(10),
    ///         sampling: Sampling::MOST_LIKELY,
    ///         special_tokens: false,
    ///         logprobs: Logprobs::No,
    ///     };
    ///     // Execute
    ///     let model = "luminous-base";
    ///     let job = task.with_model(model);
    ///     let response = client.output_of(&job, &How::default()).await.unwrap();
    ///     // Show result
    ///     println!("{}", response.completion);
    /// }
    /// ```
    pub fn from_image_path(path: impl AsRef<Path>) -> Result<Self, LoadImageError> {
        let bytes = image_preprocessing::from_image_path(path.as_ref())?;
        Ok(Self::from_image_bytes(&bytes))
    }

    /// Image input for the model.
    ///
    /// The model can only see square pictures. Images are center-cropped. You may want to
    /// use this method instead of [`Self::from_image_path`] in case you already have the
    /// image in memory and do not want to load it from a file again.
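    ///
    /// A minimal sketch, using a blank in-memory image as a stand-in for one you already
    /// have:
    ///
    /// ```no_run
    /// use aleph_alpha_client::Modality;
    /// use image::DynamicImage;
    ///
    /// // In a real application this would come from a decoder, a camera buffer, etc.
    /// let image = DynamicImage::new_rgb8(512, 512);
    /// let modality = Modality::from_image(&image).unwrap();
    /// ```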
    pub fn from_image(image: &DynamicImage) -> Result<Self, LoadImageError> {
        let bytes = image_preprocessing::preprocess_image(image);
        Ok(Self::from_image_bytes(&bytes))
    }

    /// Generates an image input from the binary representation of the image.
    ///
    /// Using this constructor you must provide a binary representation compatible with the
    /// API. PNG is guaranteed to be supported, and all other formats are converted into it.
    /// Furthermore, the model can only look at square pictures. If the picture is not
    /// square it will be center-cropped.
    fn from_image_bytes(image: &[u8]) -> Self {
        Modality::Image {
            data: BASE64_STANDARD.encode(image).into(),
        }
    }

    /// Create a semantically identical entry of modality which borrows the contents of
    /// this one.
    ///
    /// It is very practical to allow a Modality, e.g. Text, to either take ownership of
    /// the string it contains or borrow a slice. However, when we are creating a body from
    /// the user input, we want to avoid copying everything and allocating for that
    /// modality again. This is where this borrow function really shines.
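    ///
    /// A small sketch of the intended use:
    ///
    /// ```
    /// use aleph_alpha_client::Modality;
    ///
    /// let owned = Modality::from_text(String::from("Hello"));
    /// // Borrows the same string contents; no new allocation.
    /// let borrowed = owned.borrow();
    /// ```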
    pub fn borrow(&self) -> Modality<'_> {
        match self {
            Modality::Text { data } => Modality::Text {
                data: Cow::Borrowed(data.borrow()),
            },
            Modality::Image { data } => Modality::Image {
                data: Cow::Borrowed(data.borrow()),
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_concatenate_prompt_items() {
        let mut prompt =
            Prompt::from_vec(vec![Modality::from_text("foo"), Modality::from_text("bar")]);
        prompt.join_consecutive_text_items("");

        assert_eq!(prompt.0, vec![Modality::from_text("foobar")]);
    }

    #[test]
    fn can_concatenate_prompt_items_with_custom_separator() {
        let mut prompt =
            Prompt::from_vec(vec![Modality::from_text("foo"), Modality::from_text("bar")]);
        prompt.join_consecutive_text_items("\n");

        assert_eq!(prompt.0, vec![Modality::from_text("foo\nbar")]);
    }
}
177}