1#![cfg_attr(not(feature = "ollama"), allow(dead_code))]
2
3use anyhow::{Context, Result};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::ops::Range;
7
8use crate::detect::normalize;
9use crate::types::{Finding, FindingKind, FindingSource};
10
/// Connection settings for the chat-completions backend used by
/// `discover_candidates`.
#[derive(Clone, Debug, Default)]
pub struct LlmConfig {
    /// Server base URL. Trailing `/` characters are trimmed before the
    /// `/v1/chat/completions` path is appended.
    pub base_url: String,
    /// Model identifier sent verbatim in the `model` field of the request.
    pub model: String,
}
16
17#[derive(Debug, Clone, Serialize)]
18struct ChatRequest<'a> {
19 model: &'a str,
20 messages: Vec<Message<'a>>,
21 stream: bool,
22 temperature: f32,
23 response_format: ResponseFormat<'a>,
24}
25
26#[derive(Debug, Clone, Serialize)]
27struct Message<'a> {
28 role: &'a str,
29 content: String,
30}
31
32#[derive(Debug, Clone, Serialize)]
33struct ResponseFormat<'a> {
34 #[serde(rename = "type")]
35 kind: &'a str,
36}
37
38#[derive(Debug, Clone, Deserialize)]
39struct ChatResponse {
40 choices: Vec<Choice>,
41}
42
43#[derive(Debug, Clone, Deserialize)]
44struct Choice {
45 message: ChatMessage,
46}
47
48#[derive(Debug, Clone, Deserialize)]
49struct ChatMessage {
50 content: String,
51}
52
53#[derive(Debug, Clone, Deserialize)]
54struct CandidateEnvelope {
55 candidates: Vec<Candidate>,
56}
57
58#[derive(Debug, Clone, Deserialize)]
59struct Candidate {
60 kind: String,
61 value: String,
62 confidence: Option<u8>,
63}
64
/// Asks the configured chat-completions endpoint for sensitive items in
/// `text` and converts the reply into findings.
///
/// The request is a blocking HTTP POST to `{base_url}/v1/chat/completions`
/// (trailing slashes on `base_url` are trimmed first). The model is told to
/// answer with a JSON object; the content of the first returned choice is
/// passed to `parse_candidates`.
///
/// When `person_detection` is `false`, only organizations are requested in the
/// prompt and accepted from the reply.
///
/// # Errors
///
/// Returns an error when the HTTP call fails, the server answers with an error
/// status, the response body cannot be decoded, the response holds no choices,
/// or the returned content is not the expected JSON.
#[cfg(feature = "ollama")]
pub fn discover_candidates(
    config: &LlmConfig,
    text: &str,
    person_detection: bool,
) -> Result<Vec<Finding>> {
    let allowed_kinds = if person_detection {
        "person, organization"
    } else {
        "organization"
    };

    // Build the two-message conversation: fixed system instructions plus the
    // task description with the input text embedded.
    let system_message = Message {
        role: "system",
        content: "Return compact JSON only. Do not rewrite the source text.".to_string(),
    };
    let user_message = Message {
        role: "user",
        content: format!(
            "Find sensitive items in the input text. Return JSON only with a top-level key named candidates. \
             Each candidate must include kind, value, confidence. Allowed kinds: {allowed_kinds}. \
             Only include exact values copied from the input text.\n\nInput:\n{text}"
        ),
    };
    let payload = ChatRequest {
        model: &config.model,
        messages: vec![system_message, user_message],
        stream: false,
        temperature: 0.0,
        response_format: ResponseFormat {
            kind: "json_object",
        },
    };

    let base = config.base_url.trim_end_matches('/');
    let endpoint = format!("{base}/v1/chat/completions");

    // Each stage gets its own context so failures are attributable.
    let raw_response = reqwest::blocking::Client::new()
        .post(endpoint)
        .json(&payload)
        .send()
        .context("failed to call Ollama")?;
    let ok_response = raw_response
        .error_for_status()
        .context("Ollama returned an error response")?;
    let decoded: ChatResponse = ok_response
        .json()
        .context("failed to decode Ollama response")?;

    let first_choice = decoded
        .choices
        .into_iter()
        .next()
        .context("Ollama response did not contain any choices")?;

    parse_candidates(text, &first_choice.message.content, person_detection)
}
122
123#[cfg(not(feature = "ollama"))]
124pub fn discover_candidates(
125 _config: &LlmConfig,
126 _text: &str,
127 _person_detection: bool,
128) -> Result<Vec<Finding>> {
129 anyhow::bail!("this binary was built without the `ollama` feature")
130}
131
132fn parse_candidates(text: &str, content: &str, person_detection: bool) -> Result<Vec<Finding>> {
133 let envelope: CandidateEnvelope =
134 serde_json::from_str(content).context("failed to parse LLM JSON response")?;
135 let mut findings = Vec::new();
136 let mut occupied_ranges: Vec<Range<usize>> = Vec::new();
137 let match_positions = build_match_positions(
138 text,
139 envelope
140 .candidates
141 .iter()
142 .map(|candidate| candidate.value.as_str()),
143 );
144 let mut consumed_positions = HashMap::<String, usize>::new();
145
146 for candidate in envelope.candidates {
147 let Some(kind) = map_kind(&candidate.kind, person_detection) else {
148 continue;
149 };
150 if let Some(start) = find_next_unoccupied_match(
151 &candidate.value,
152 &match_positions,
153 &mut consumed_positions,
154 &occupied_ranges,
155 ) {
156 let range = start..start + candidate.value.len();
157 occupied_ranges.push(range.clone());
158 findings.push(Finding {
159 kind,
160 source: FindingSource::Llm,
161 match_text: candidate.value.clone(),
162 normalized_key: normalize(kind, &candidate.value),
163 confidence: candidate.confidence.unwrap_or(60).min(100),
164 start: range.start,
165 end: range.end,
166 });
167 }
168 }
169
170 Ok(findings)
171}
172
173fn build_match_positions<'a>(
174 text: &str,
175 values: impl IntoIterator<Item = &'a str>,
176) -> HashMap<String, Vec<usize>> {
177 let mut positions = HashMap::new();
178 for value in values {
179 if value.is_empty() || positions.contains_key(value) {
180 continue;
181 }
182 positions.insert(value.to_string(), collect_match_positions(text, value));
183 }
184 positions
185}
186
/// Returns the start of the next not-yet-tried occurrence of `value` whose
/// span does not overlap any range in `occupied_ranges`.
///
/// `match_positions` supplies the known occurrence starts per value, and
/// `consumed_positions` remembers, per value, how many of those starts have
/// already been examined. Every start examined by this call — skipped or
/// returned — advances that counter, so no occurrence is offered twice.
///
/// Returns `None` when `value` has no entry in `match_positions` (in which
/// case `consumed_positions` is left untouched) or when all remaining
/// occurrences collide with an occupied range. Ranges that merely touch
/// (one ends where the other begins) do not count as overlapping.
fn find_next_unoccupied_match(
    value: &str,
    match_positions: &HashMap<String, Vec<usize>>,
    consumed_positions: &mut HashMap<String, usize>,
    occupied_ranges: &[Range<usize>],
) -> Option<usize> {
    let known_starts = match_positions.get(value)?;
    let cursor = consumed_positions.entry(value.to_string()).or_insert(0);
    let width = value.len();

    for &start in known_starts.iter().skip(*cursor) {
        // Mark this occurrence as examined regardless of the outcome.
        *cursor += 1;

        let end = start + width;
        let collides = occupied_ranges
            .iter()
            .any(|used| used.start < end && start < used.end);
        if !collides {
            return Some(start);
        }
    }

    None
}
210
/// Returns the byte offset of every occurrence of `value` in `text`, in
/// ascending order, including occurrences that overlap each other
/// (e.g. `"aa"` in `"aaa"` yields `[0, 1]`).
///
/// An empty `value` yields no positions.
fn collect_match_positions(text: &str, value: &str) -> Vec<usize> {
    let Some(first_char) = value.chars().next() else {
        return Vec::new();
    };
    // After a hit, resume one *character* past its start. Stepping a single
    // byte would land inside a multi-byte UTF-8 sequence whenever `value`
    // starts with a non-ASCII character, and slicing `text` there panics.
    // The match begins with `first_char`, so `start + step` is always a char
    // boundary and never exceeds `text.len()`.
    let step = first_char.len_utf8();

    let mut positions = Vec::new();
    let mut search_start = 0;
    while let Some(offset) = text[search_start..].find(value) {
        let start = search_start + offset;
        positions.push(start);
        search_start = start + step;
    }

    positions
}
229
230fn map_kind(kind: &str, person_detection: bool) -> Option<FindingKind> {
231 match kind {
232 "person" if person_detection => Some(FindingKind::Person),
233 "organization" => Some(FindingKind::Organization),
234 _ => None,
235 }
236}
237
#[cfg(test)]
mod tests {
    use super::parse_candidates;
    use crate::FindingKind;

    /// Repeated values must be assigned to successive occurrences in the text.
    #[test]
    fn parse_candidates_maps_duplicate_values_to_distinct_occurrences() {
        let text = "Alice met Alice at Acme. Alice returned to Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"organization","value":"Acme","confidence":75}
            ]
        }"#;

        let findings = parse_candidates(text, content, true).expect("parse candidates");
        let spans: Vec<(&str, usize, usize)> = findings
            .iter()
            .map(|found| (found.match_text.as_str(), found.start, found.end))
            .collect();

        assert_eq!(
            spans,
            [
                ("Alice", 0, 5),
                ("Alice", 10, 15),
                ("Acme", 19, 23),
                ("Acme", 43, 47),
            ]
        );
    }

    /// More duplicates than occurrences: the surplus candidate is dropped.
    #[test]
    fn parse_candidates_drops_extra_duplicate_values_without_available_matches() {
        let text = "Alice joined Acme with Alice.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"person","value":"Alice","confidence":60}
            ]
        }"#;

        let findings = parse_candidates(text, content, true).expect("parse candidates");
        let alice_positions: Vec<(usize, usize)> = findings
            .iter()
            .map(|found| (found.start, found.end))
            .collect();

        assert_eq!(alice_positions, [(0, 5), (23, 28)]);
    }

    /// Occurrence bookkeeping is per value; output follows candidate order.
    #[test]
    fn parse_candidates_keeps_different_values_from_stealing_each_other() {
        let text = "Alice Acme Alice";
        let content = r#"{
            "candidates": [
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65}
            ]
        }"#;

        let findings = parse_candidates(text, content, true).expect("parse candidates");
        let spans: Vec<(&str, usize, usize)> = findings
            .iter()
            .map(|found| (found.match_text.as_str(), found.start, found.end))
            .collect();

        assert_eq!(
            spans,
            [("Acme", 6, 10), ("Alice", 0, 5), ("Alice", 11, 16)]
        );
    }

    /// With person detection off, `person` candidates are ignored.
    #[test]
    fn parse_candidates_skips_people_when_person_detection_is_disabled() {
        let text = "Alice met Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"organization","value":"Acme","confidence":80}
            ]
        }"#;

        let findings = parse_candidates(text, content, false).expect("parse candidates");
        let values: Vec<(FindingKind, &str)> = findings
            .iter()
            .map(|found| (found.kind, found.match_text.as_str()))
            .collect();

        assert_eq!(values, [(FindingKind::Organization, "Acme")]);
    }
}