aleph_alpha_client/prompt.rs
use std::{
    borrow::{Borrow, Cow},
    path::Path,
};

use base64::{prelude::BASE64_STANDARD, Engine};
use image::DynamicImage;
use itertools::Itertools;
use serde::Serialize;

use crate::image_preprocessing::{self, LoadImageError};

/// A prompt which is passed to the model for inference. Usually it is one text item, but it could
/// also be a combination of several modalities like images and text.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct Prompt<'a>(Vec<Modality<'a>>);

impl<'a> Prompt<'a> {
    /// Create a prompt from a single text item.
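    ///
    /// For example:
    ///
    /// ```
    /// use aleph_alpha_client::Prompt;
    ///
    /// let prompt = Prompt::from_text("An apple a day");
    /// ```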
    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
        Self(vec![Modality::from_text(text)])
    }

    /// Create a multimodal prompt from a list of individual items with any modality.
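    ///
    /// A small example combining two text items (an image item would work the same way):
    ///
    /// ```
    /// use aleph_alpha_client::{Modality, Prompt};
    ///
    /// let prompt = Prompt::from_vec(vec![
    ///     Modality::from_text("Instruction: "),
    ///     Modality::from_text("Translate to English."),
    /// ]);
    /// ```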
    pub fn from_vec(items: Vec<Modality<'a>>) -> Self {
        Self(items)
    }

    /// Allows you to borrow the contents of the prompt without allocating a new one.
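    ///
    /// For example:
    ///
    /// ```
    /// use aleph_alpha_client::Prompt;
    ///
    /// let owned = Prompt::from_text(String::from("Hello"));
    /// // Same contents, but borrowed from `owned` rather than copied.
    /// let borrowed = owned.borrow();
    /// ```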
    pub fn borrow(&'a self) -> Prompt<'a> {
        Self(self.0.iter().map(|item| item.borrow()).collect())
    }

    /// When constructing prompts programmatically, it can be beneficial to append several
    /// text items in a prompt. For example, if doing a few-shot prompt as the first item,
    /// and user input as a second item.
    ///
    /// However, because of how tokenization works, having each item tokenized separately
    /// can sometimes have strange side effects (tokenizing two partial strings does not
    /// necessarily produce the same tokens as tokenizing the strings joined together).
    ///
    /// This method takes an existing prompt and merges any consecutive text items, joining
    /// them with the given separator. Use an empty string as the separator if you just want
    /// to concatenate them.
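    ///
    /// For example, two consecutive text items can be merged into one:
    ///
    /// ```
    /// use aleph_alpha_client::{Modality, Prompt};
    ///
    /// let mut prompt = Prompt::from_vec(vec![
    ///     Modality::from_text("First item."),
    ///     Modality::from_text("Second item."),
    /// ]);
    /// prompt.join_consecutive_text_items("\n");
    /// assert_eq!(prompt, Prompt::from_text("First item.\nSecond item."));
    /// ```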
    pub fn join_consecutive_text_items(&mut self, separator: &str) {
        self.0 = self
            .0
            .drain(..)
            .coalesce(|a, b| match (a, b) {
                (Modality::Text { mut data }, Modality::Text { data: other }) => {
                    data.to_mut().push_str(separator);
                    data.to_mut().push_str(&other);
                    Ok(Modality::Text { data })
                }
                (a, b) => Err((a, b)),
            })
            .collect::<Vec<_>>();
    }
}

/// The prompt for models can be a combination of different modalities (Text and Image). Which
/// modalities are supported depends on the model in question.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Modality<'a> {
    /// The only type of prompt which can be used with pure language models
    Text { data: Cow<'a, str> },
    /// An image input into the model. See [`Modality::from_image_path`].
    Image { data: Cow<'a, str> },
}

impl<'a> Modality<'a> {
    /// Instantiates a text prompt
    pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self {
        Modality::Text { data: text.into() }
    }

    /// Image input for the model, from a file path.
    ///
    /// The model can only see square pictures. Images are center-cropped.
    ///
    /// ```no_run
    /// use aleph_alpha_client::{Client, How, Modality, Prompt, Sampling, Stopping, TaskCompletion,
    ///     Task, Logprobs};
    ///
    /// #[tokio::main(flavor = "current_thread")]
    /// async fn main() {
    ///     // Create client
    ///     let client = Client::from_env().unwrap();
    ///     // Define task
    ///     let task = TaskCompletion {
    ///         prompt: Prompt::from_vec(vec![
    ///             Modality::from_image_path("cat.png").unwrap(),
    ///             Modality::from_text("A picture of "),
    ///         ]),
    ///         stopping: Stopping::from_maximum_tokens(10),
    ///         sampling: Sampling::MOST_LIKELY,
    ///         special_tokens: false,
    ///         logprobs: Logprobs::No,
    ///         echo: false,
    ///     };
    ///     // Execute
    ///     let model = "luminous-base";
    ///     let job = task.with_model(model);
    ///     let response = client.output_of(&job, &How::default()).await.unwrap();
    ///     // Show result
    ///     println!("{}", response.completion);
    /// }
    /// ```
    pub fn from_image_path(path: impl AsRef<Path>) -> Result<Self, LoadImageError> {
        let bytes = image_preprocessing::from_image_path(path.as_ref())?;
        Ok(Self::from_image_bytes(&bytes))
    }

    /// Image input for the model.
    ///
    /// The model can only see square pictures. Images are center-cropped. You may want to use
    /// this method instead of [`Self::from_image_path`] in case you already have the image in
    /// memory and do not want to load it from a file again.
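    ///
    /// A sketch, assuming a `cat.png` exists in the working directory:
    ///
    /// ```no_run
    /// use aleph_alpha_client::Modality;
    ///
    /// // Decode the image once; afterwards it can be reused without touching the file system.
    /// let image = image::open("cat.png").unwrap();
    /// let modality = Modality::from_image(&image).unwrap();
    /// ```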
    pub fn from_image(image: &DynamicImage) -> Result<Self, LoadImageError> {
        let bytes = image_preprocessing::preprocess_image(image);
        Ok(Self::from_image_bytes(&bytes))
    }

    /// Generates an image input from the binary representation of the image.
    ///
    /// Using this constructor you must provide a binary representation compatible with the API.
    /// PNG is guaranteed to be supported, and all other formats are converted into it.
    /// Furthermore, the model can only look at square-shaped pictures. If the picture is not
    /// square-shaped it will be center-cropped.
    fn from_image_bytes(image: &[u8]) -> Self {
        Modality::Image {
            data: BASE64_STANDARD.encode(image).into(),
        }
    }

    /// Create a semantically identical entry of modality which borrows the contents of this one.
    ///
    /// It is very practical to allow a Modality of e.g. Text to either take ownership of the
    /// string it contains or borrow a slice. However, when we are creating a body from the user
    /// input we want to avoid copying everything and allocating for that modality again. This is
    /// where this borrow function really shines.
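    ///
    /// For example:
    ///
    /// ```
    /// use aleph_alpha_client::Modality;
    ///
    /// let owned = Modality::from_text(String::from("Hello")); // owns its data
    /// let borrowed = owned.borrow(); // borrows the same data, no new allocation
    /// ```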
    pub fn borrow(&self) -> Modality<'_> {
        match self {
            Modality::Text { data } => Modality::Text {
                data: Cow::Borrowed(data.borrow()),
            },
            Modality::Image { data } => Modality::Image {
                data: Cow::Borrowed(data.borrow()),
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_concatenate_prompt_items() {
        let mut prompt =
            Prompt::from_vec(vec![Modality::from_text("foo"), Modality::from_text("bar")]);
        prompt.join_consecutive_text_items("");

        assert_eq!(prompt.0, vec![Modality::from_text("foobar")]);
    }

    #[test]
    fn can_concatenate_prompt_items_with_custom_separator() {
        let mut prompt =
            Prompt::from_vec(vec![Modality::from_text("foo"), Modality::from_text("bar")]);
        prompt.join_consecutive_text_items("\n");

        assert_eq!(prompt.0, vec![Modality::from_text("foo\nbar")]);
    }
}
178}