1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/// A vision or vision-language task descriptor.
///
/// Variants carrying data (`String`, `u8`, or bbox coordinates) encode the
/// task's extra input: a text query, a detail level, or a region.
#[derive(Debug, Clone, Ord, Eq, PartialOrd, PartialEq)]
pub enum Task {
/// Placeholder for an unspecified task.
Untitled,
/// Image classification task.
/// Input: image
/// Output: a label representing the class of the image
ImageClassification,
/// Multi-label image tagging task.
/// Input: image
/// Output: multiple labels representing different categories in the image
ImageTagging,
/// Image captioning task, generating descriptions with different levels of detail.
/// Input: image
/// Output: a text description; the `u8` is the level of detail:
/// 0 for brief, 1 for detailed, 2 for more detailed
Caption(u8),
/// Region proposal task, proposing candidate object regions in the image.
/// Input: image
/// Output: bounding boxes (bboxes)
RegionProposal,
/// Object detection task, detecting all objects in the image.
/// Input: image
/// Output: bounding boxes (bboxes), class labels, and optional scores for the detected objects
ObjectDetection,
/// Open-set detection task, detecting objects described by a free-form text query,
/// including categories unseen at training time.
/// Input: image and a `String` query naming what to locate
/// Output: bounding boxes, class labels (possibly "unknown" for unfamiliar objects), and detection scores
OpenSetDetection(String),
/// Task for generating brief descriptions of dense regions in the image.
/// Input: image
/// Output: bounding boxes (bboxes), brief phrase labels, and optional scores for detected regions
DenseRegionCaption,
/// Keypoint detection task, detecting keypoints in an image.
/// This can include human body parts (e.g., hands, feet, joints) or other objects.
/// Input: image
/// Output: coordinates of detected keypoints
KeypointsDetection,
/// Semantic segmentation task, segmenting the image into different semantic regions.
/// Input: image
/// Output: per-pixel class labels indicating object or background
SemanticSegmentation,
/// Instance segmentation task, detecting and segmenting individual object instances.
/// Input: image
/// Output: pixel masks for each object instance
InstanceSegmentation,
/// Depth estimation task, predicting the distance of each pixel from the camera.
/// Input: image
/// Output: a depth map where each pixel has a depth value
DepthEstimation,
/// Surface normal prediction task, predicting the surface normal vector for each pixel.
/// Input: image
/// Output: a normal map where each pixel has a surface normal vector
SurfaceNormalPrediction,
/// Image-to-image generation task, transforming one image into another.
/// Input: image
/// Output: a generated image
ImageToImageGeneration,
/// Text-to-image generation task, generating an image based on a text description.
/// Input: text
/// Output: a generated image
TextToImageGeneration,
/// Inpainting task, filling in missing or corrupted parts of an image.
/// Input: image with missing or corrupted regions
/// Output: a complete image with the missing parts filled in
Inpainting,
/// Super-resolution task, enhancing the resolution of an image.
/// Input: low-resolution image
/// Output: high-resolution image
SuperResolution,
/// Image denoising task, removing noise from an image.
/// Input: noisy image
/// Output: denoised image
Denoising,
/// Phrase grounding task, locating the image regions corresponding to phrases
/// in the given caption.
/// Input: image and a caption `String`
/// Output: image regions and the corresponding phrases
CaptionToPhraseGrounding(String),
/// Referring expression segmentation task, segmenting the object referred to by
/// the given text description.
/// Input: image and a referring expression `String`
/// Output: a segmentation mask for the object referred to by the text
ReferringExpressionSegmentation(String),
/// Region-to-segmentation task, producing a mask for a given region
/// (similar to combining detection with segmentation, e.g., YOLO + SAM).
/// Input: image and a region given as bbox coordinates (x0, y0, x1, y1):
/// top-left then bottom-right
/// Output: segmentation masks for the region
RegionToSegmentation(usize, usize, usize, usize),
/// Region-to-category classification task, classifying the object in a given region of the image.
/// Input: image and a region given as bbox coordinates (x0, y0, x1, y1):
/// top-left then bottom-right
/// Output: class label for the region
RegionToCategory(usize, usize, usize, usize),
/// Region-to-description task, generating a detailed description for a given region in the image.
/// Input: image and a region given as bbox coordinates (x0, y0, x1, y1):
/// top-left then bottom-right
/// Output: a detailed textual description for the region
RegionToDescription(usize, usize, usize, usize),
/// Visual question answering (VQA) task, answering questions related to an image.
/// Input: image and the question text
/// Output: the answer to the question
Vqa(String),
/// Optical character recognition (OCR) task, recognizing text in an image.
/// Input: image
/// Output: recognized text
Ocr,
/// OCR task with region information, recognizing text and returning its location in the image.
/// Input: image
/// Output: recognized text and its bounding box in the image
OcrWithRegion,
}
impl Task {
    /// Builds the Florence-2 text prompt corresponding to this task.
    ///
    /// Only the subset of tasks Florence-2 supports maps to a prompt; the
    /// region-based variants interpolate their bbox coordinates into
    /// `<loc_…>` tokens as expected by the model.
    ///
    /// # Errors
    ///
    /// Returns an error if the task is [`Task::Untitled`], if the caption
    /// level is not 0, 1, or 2, or if the task is not supported by Florence-2.
    pub fn prompt_for_florence2(&self) -> anyhow::Result<String> {
        let prompt = match self {
            Self::Untitled => anyhow::bail!("No task specified."),
            Self::Caption(0) => "What does the image describe?".to_string(),
            Self::Caption(1) => "Describe in detail what is shown in the image.".to_string(),
            Self::Caption(2) => "Describe with a paragraph what is shown in the image.".to_string(),
            // Levels above 2 have no corresponding Florence-2 prompt; report
            // the invalid level instead of the generic "Unsupported task.".
            Self::Caption(level) => {
                anyhow::bail!("Unsupported caption level: {}. Expected 0, 1, or 2.", level)
            }
            Self::Ocr => "What is the text in the image?".to_string(),
            Self::OcrWithRegion => "What is the text in the image, with regions?".to_string(),
            Self::ObjectDetection => {
                "Locate the objects with category name in the image.".to_string()
            }
            Self::DenseRegionCaption => {
                "Locate the objects in the image, with their descriptions.".to_string()
            }
            Self::RegionProposal => "Locate the region proposals in the image.".to_string(),
            Self::OpenSetDetection(text) => {
                format!("Locate {} in the image.", text)
            }
            Self::CaptionToPhraseGrounding(text) => {
                format!("Locate the phrases in the caption: {}", text)
            }
            Self::ReferringExpressionSegmentation(text) => {
                format!("Locate {} in the image with mask", text)
            }
            Self::RegionToSegmentation(x0, y0, x1, y1) => {
                format!(
                    "What is the polygon mask of region <loc_{}><loc_{}><loc_{}><loc_{}>",
                    x0, y0, x1, y1
                )
            }
            Self::RegionToCategory(x0, y0, x1, y1) => {
                format!(
                    "What is the region <loc_{}><loc_{}><loc_{}><loc_{}>?",
                    x0, y0, x1, y1
                )
            }
            Self::RegionToDescription(x0, y0, x1, y1) => {
                format!(
                    "What does the region <loc_{}><loc_{}><loc_{}><loc_{}> describe?",
                    x0, y0, x1, y1
                )
            }
            // All remaining variants (segmentation, generation, etc.) are not
            // Florence-2 tasks.
            _ => anyhow::bail!("Unsupported task."),
        };
        Ok(prompt)
    }
}