1#![cfg_attr(not(feature = "ollama"), allow(dead_code))]
2
3use anyhow::{Context, Result};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::ops::Range;
7
8use crate::detect::normalize;
9use crate::types::{Finding, FindingKind, FindingSource, RedactionRules};
10
/// Connection settings for the LLM chat-completions endpoint.
#[derive(Clone, Debug, Default)]
pub struct LlmConfig {
    /// Server base URL; `/v1/chat/completions` is appended to it after any
    /// trailing `/` characters are trimmed.
    pub base_url: String,
    /// Model identifier sent as the `model` field of the chat request.
    pub model: String,
}
16
17#[derive(Debug, Clone, Serialize)]
18struct ChatRequest<'a> {
19 model: &'a str,
20 messages: Vec<Message<'a>>,
21 stream: bool,
22 temperature: f32,
23 response_format: ResponseFormat<'a>,
24}
25
26#[derive(Debug, Clone, Serialize)]
27struct Message<'a> {
28 role: &'a str,
29 content: String,
30}
31
32#[derive(Debug, Clone, Serialize)]
33struct ResponseFormat<'a> {
34 #[serde(rename = "type")]
35 kind: &'a str,
36}
37
38#[derive(Debug, Clone, Deserialize)]
39struct ChatResponse {
40 choices: Vec<Choice>,
41}
42
43#[derive(Debug, Clone, Deserialize)]
44struct Choice {
45 message: ChatMessage,
46}
47
48#[derive(Debug, Clone, Deserialize)]
49struct ChatMessage {
50 content: String,
51}
52
53#[derive(Debug, Clone, Deserialize)]
54struct CandidateEnvelope {
55 candidates: Vec<Candidate>,
56}
57
58#[derive(Debug, Clone, Deserialize)]
59struct Candidate {
60 kind: String,
61 value: String,
62 confidence: Option<u8>,
63}
64
/// Asks the configured chat-completions endpoint for sensitive items in
/// `text` and converts the reply into findings.
///
/// Returns an empty list without making any request when `rules` enables no
/// LLM-detectable kind.
///
/// # Errors
///
/// Fails when the HTTP call cannot be made, the server answers with an error
/// status, the response body cannot be decoded, the response contains no
/// choices, or the returned message is not the expected JSON.
#[cfg(feature = "ollama")]
pub fn discover_candidates(
    config: &LlmConfig,
    text: &str,
    rules: RedactionRules,
) -> Result<Vec<Finding>> {
    let kinds = allowed_llm_kinds(rules);
    if kinds.is_empty() {
        return Ok(Vec::new());
    }
    let allowed_kinds = kinds.join(", ");

    let system_message = Message {
        role: "system",
        content: String::from("Return compact JSON only. Do not rewrite the source text."),
    };
    let user_message = Message {
        role: "user",
        content: format!(
            "Find sensitive items in the input text. Return JSON only with a top-level key named candidates. \
             Each candidate must include kind, value, confidence. Allowed kinds: {allowed_kinds}. \
             Only include exact values copied from the input text.\n\nInput:\n{text}"
        ),
    };
    let body = ChatRequest {
        model: config.model.as_str(),
        messages: vec![system_message, user_message],
        stream: false,
        temperature: 0.0,
        response_format: ResponseFormat {
            kind: "json_object",
        },
    };

    // Trim trailing slashes so a base URL ending in `/` does not produce `//v1/...`.
    let url = format!(
        "{}/v1/chat/completions",
        config.base_url.trim_end_matches('/')
    );

    let http_response = reqwest::blocking::Client::new()
        .post(url)
        .json(&body)
        .send()
        .context("failed to call Ollama")?
        .error_for_status()
        .context("Ollama returned an error response")?;
    let decoded: ChatResponse = http_response
        .json()
        .context("failed to decode Ollama response")?;

    // Only the first choice is used.
    let first_choice = decoded
        .choices
        .into_iter()
        .next()
        .context("Ollama response did not contain any choices")?;

    parse_candidates(text, &first_choice.message.content, rules)
}
122
123#[cfg(not(feature = "ollama"))]
124pub fn discover_candidates(
125 _config: &LlmConfig,
126 _text: &str,
127 _rules: RedactionRules,
128) -> Result<Vec<Finding>> {
129 anyhow::bail!("this binary was built without the `ollama` feature")
130}
131
132fn allowed_llm_kinds(rules: RedactionRules) -> Vec<&'static str> {
133 let mut kinds = Vec::new();
134 if rules.person {
135 kinds.push("person");
136 }
137 if rules.organization {
138 kinds.push("organization");
139 }
140 kinds
141}
142
143fn parse_candidates(text: &str, content: &str, rules: RedactionRules) -> Result<Vec<Finding>> {
144 let envelope: CandidateEnvelope =
145 serde_json::from_str(content).context("failed to parse LLM JSON response")?;
146 let mut findings = Vec::new();
147 let mut occupied_ranges: Vec<Range<usize>> = Vec::new();
148 let match_positions = build_match_positions(
149 text,
150 envelope
151 .candidates
152 .iter()
153 .map(|candidate| candidate.value.as_str()),
154 );
155 let mut consumed_positions = HashMap::<String, usize>::new();
156
157 for candidate in envelope.candidates {
158 let Some(kind) = map_kind(&candidate.kind, rules) else {
159 continue;
160 };
161 if let Some(start) = find_next_unoccupied_match(
162 &candidate.value,
163 &match_positions,
164 &mut consumed_positions,
165 &occupied_ranges,
166 ) {
167 let range = start..start + candidate.value.len();
168 occupied_ranges.push(range.clone());
169 findings.push(Finding {
170 kind,
171 source: FindingSource::Llm,
172 match_text: candidate.value.clone(),
173 normalized_key: normalize(kind, &candidate.value),
174 confidence: candidate.confidence.unwrap_or(60).min(100),
175 start: range.start,
176 end: range.end,
177 });
178 }
179 }
180
181 Ok(findings)
182}
183
184fn build_match_positions<'a>(
185 text: &str,
186 values: impl IntoIterator<Item = &'a str>,
187) -> HashMap<String, Vec<usize>> {
188 let mut positions = HashMap::new();
189 for value in values {
190 if value.is_empty() || positions.contains_key(value) {
191 continue;
192 }
193 positions.insert(value.to_string(), collect_match_positions(text, value));
194 }
195 positions
196}
197
/// Returns the next not-yet-tried occurrence of `value` that overlaps no
/// range in `occupied_ranges`.
///
/// `consumed_positions` holds, per value, how many entries of that value's
/// position list have already been tried. Every position examined here,
/// whether returned or rejected, advances that counter, so later calls never
/// revisit it. Returns `None` when `value` has no entry in `match_positions`
/// (in which case no counter is created) or when its positions are exhausted.
fn find_next_unoccupied_match(
    value: &str,
    match_positions: &HashMap<String, Vec<usize>>,
    consumed_positions: &mut HashMap<String, usize>,
    occupied_ranges: &[Range<usize>],
) -> Option<usize> {
    let starts = match_positions.get(value)?;
    let cursor = consumed_positions.entry(value.to_string()).or_insert(0);
    let width = value.len();

    for &start in starts.iter().skip(*cursor) {
        *cursor += 1;
        let end = start + width;
        // Half-open ranges overlap exactly when each begins before the other ends.
        let overlaps_any = occupied_ranges
            .iter()
            .any(|used| used.start < end && start < used.end);
        if !overlaps_any {
            return Some(start);
        }
    }

    None
}
221
/// Returns the byte offset of every occurrence of `value` in `text`, in
/// ascending order, including occurrences that overlap one another.
///
/// An empty `value` yields no positions.
fn collect_match_positions(text: &str, value: &str) -> Vec<usize> {
    let Some(first_char) = value.chars().next() else {
        return Vec::new();
    };
    // Resume one whole character past each match start. Advancing by a single
    // byte would land inside a multi-byte character and make the slice below
    // panic; advancing by the first character still finds overlapping matches.
    let step = first_char.len_utf8();

    let mut positions = Vec::new();
    let mut search_start = 0;
    while let Some(offset) = text
        .get(search_start..)
        .and_then(|rest| rest.find(value))
    {
        let start = search_start + offset;
        positions.push(start);
        search_start = start + step;
    }

    positions
}
240
241fn map_kind(kind: &str, rules: RedactionRules) -> Option<FindingKind> {
242 match kind {
243 "person" if rules.person => Some(FindingKind::Person),
244 "organization" if rules.organization => Some(FindingKind::Organization),
245 _ => None,
246 }
247}
248
#[cfg(test)]
mod tests {
    use crate::{FindingKind, RedactionRules};

    use super::{parse_candidates, Finding};

    /// Starts from the default rules and enables each listed kind, in order.
    fn rules_with(kinds: &[FindingKind]) -> RedactionRules {
        kinds
            .iter()
            .fold(RedactionRules::default(), |rules, &kind| {
                rules.with_kind(kind, true)
            })
    }

    /// Runs `parse_candidates` with the listed kinds enabled and unwraps the result.
    fn findings_for(text: &str, content: &str, kinds: &[FindingKind]) -> Vec<Finding> {
        parse_candidates(text, content, rules_with(kinds)).expect("parse candidates")
    }

    /// Projects findings to `(match_text, start, end)` triples.
    fn text_spans(findings: &[Finding]) -> Vec<(&str, usize, usize)> {
        let mut spans = Vec::with_capacity(findings.len());
        for finding in findings {
            spans.push((finding.match_text.as_str(), finding.start, finding.end));
        }
        spans
    }

    #[test]
    fn parse_candidates_maps_duplicate_values_to_distinct_occurrences() {
        let text = "Alice met Alice at Acme. Alice returned to Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"organization","value":"Acme","confidence":75}
            ]
        }"#;

        let findings = findings_for(
            text,
            content,
            &[FindingKind::Person, FindingKind::Organization],
        );

        assert_eq!(
            text_spans(&findings),
            vec![
                ("Alice", 0, 5),
                ("Alice", 10, 15),
                ("Acme", 19, 23),
                ("Acme", 43, 47),
            ]
        );
    }

    #[test]
    fn parse_candidates_drops_extra_duplicate_values_without_available_matches() {
        let text = "Alice joined Acme with Alice.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"person","value":"Alice","confidence":60}
            ]
        }"#;

        let findings = findings_for(text, content, &[FindingKind::Person]);
        let mut alice_positions = Vec::new();
        for finding in &findings {
            alice_positions.push((finding.start, finding.end));
        }

        // Only two occurrences exist, so the third candidate is dropped.
        assert_eq!(alice_positions, vec![(0, 5), (23, 28)]);
    }

    #[test]
    fn parse_candidates_keeps_different_values_from_stealing_each_other() {
        let text = "Alice Acme Alice";
        let content = r#"{
            "candidates": [
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65}
            ]
        }"#;

        let findings = findings_for(
            text,
            content,
            &[FindingKind::Person, FindingKind::Organization],
        );

        assert_eq!(
            text_spans(&findings),
            vec![("Acme", 6, 10), ("Alice", 0, 5), ("Alice", 11, 16)]
        );
    }

    #[test]
    fn parse_candidates_skips_people_when_person_detection_is_disabled() {
        let text = "Alice met Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"organization","value":"Acme","confidence":80}
            ]
        }"#;

        let findings = findings_for(text, content, &[FindingKind::Organization]);
        let mut values = Vec::new();
        for finding in &findings {
            values.push((finding.kind, finding.match_text.as_str()));
        }

        assert_eq!(values, vec![(FindingKind::Organization, "Acme")]);
    }
}