Skip to main content

cloudiful_redactor/llm/
mod.rs

1#![cfg_attr(not(feature = "ollama"), allow(dead_code))]
2
3use anyhow::{Context, Result};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::ops::Range;
7
8use crate::detect::normalize;
9use crate::types::{Finding, FindingKind, FindingSource, RedactionRules};
10
/// Connection settings for the chat-completion backend queried by `discover_candidates`.
#[derive(Debug, Clone, Default)]
pub struct LlmConfig {
    /// Server base URL; trailing slashes are trimmed before `/v1/chat/completions` is appended.
    pub base_url: String,
    /// Model identifier sent as the `model` field of each request.
    pub model: String,
}
16
/// JSON body that `discover_candidates` POSTs to the chat-completions endpoint.
#[derive(Debug, Clone, Serialize)]
struct ChatRequest<'a> {
    /// Name of the model that should answer.
    model: &'a str,
    /// Conversation sent to the model, in order.
    messages: Vec<Message<'a>>,
    /// Streaming flag; `discover_candidates` sends `false`.
    stream: bool,
    /// Sampling temperature; `discover_candidates` sends `0.0`.
    temperature: f32,
    /// Requested output format of the reply.
    response_format: ResponseFormat<'a>,
}
25
/// A single chat message inside a `ChatRequest`.
#[derive(Debug, Clone, Serialize)]
struct Message<'a> {
    /// Speaker role; `discover_candidates` uses `"system"` and `"user"`.
    role: &'a str,
    /// Message text.
    content: String,
}
31
/// Output-format selector attached to a `ChatRequest`.
#[derive(Debug, Clone, Serialize)]
struct ResponseFormat<'a> {
    /// Format name, serialized under the JSON key `type`; `discover_candidates` sends `"json_object"`.
    #[serde(rename = "type")]
    kind: &'a str,
}
37
/// Portion of the chat-completions response body that this module reads.
#[derive(Debug, Clone, Deserialize)]
struct ChatResponse {
    /// Completions returned by the server; only the first one is used by `discover_candidates`.
    choices: Vec<Choice>,
}
42
/// One completion entry of a `ChatResponse`.
#[derive(Debug, Clone, Deserialize)]
struct Choice {
    /// The message produced for this completion.
    message: ChatMessage,
}
47
/// Message carried by a `Choice`.
#[derive(Debug, Clone, Deserialize)]
struct ChatMessage {
    /// Raw text of the reply; `discover_candidates` passes it to `parse_candidates` as JSON.
    content: String,
}
52
/// Top-level JSON object expected in the model's reply.
#[derive(Debug, Clone, Deserialize)]
struct CandidateEnvelope {
    /// Items reported by the model, read from the `candidates` key.
    candidates: Vec<Candidate>,
}
57
/// One sensitive item reported by the model.
#[derive(Debug, Clone, Deserialize)]
struct Candidate {
    /// Kind label; `map_kind` accepts only `"person"` and `"organization"`.
    kind: String,
    /// Text of the item; `parse_candidates` locates it in the input by exact substring search.
    value: String,
    /// Optional confidence; `parse_candidates` substitutes 60 when absent and caps the value at 100.
    confidence: Option<u8>,
}
64
/// Asks the configured model for sensitive values in `text` and maps its answer to findings.
///
/// Returns an empty list without sending a request when `rules` enables none of the
/// kinds the model is asked about. Otherwise a non-streaming chat request is POSTed to
/// `{base_url}/v1/chat/completions` and the content of the first returned choice is
/// handed to `parse_candidates`.
///
/// # Errors
///
/// Fails if the request cannot be sent, the server answers with an error status, the
/// body cannot be decoded, the response holds no choices, or the model's content
/// cannot be parsed.
#[cfg(feature = "ollama")]
pub fn discover_candidates(
    config: &LlmConfig,
    text: &str,
    rules: RedactionRules,
) -> Result<Vec<Finding>> {
    let kinds = allowed_llm_kinds(rules);
    if kinds.is_empty() {
        return Ok(Vec::new());
    }
    let allowed_kinds = kinds.join(", ");

    let system_message = Message {
        role: "system",
        content: "Return compact JSON only. Do not rewrite the source text.".to_string(),
    };
    let user_message = Message {
        role: "user",
        content: format!(
            "Find sensitive items in the input text. Return JSON only with a top-level key named candidates. \
             Each candidate must include kind, value, confidence. Allowed kinds: {allowed_kinds}. \
             Only include exact values copied from the input text.\n\nInput:\n{text}"
        ),
    };
    let request = ChatRequest {
        model: &config.model,
        messages: vec![system_message, user_message],
        stream: false,
        temperature: 0.0,
        response_format: ResponseFormat {
            kind: "json_object",
        },
    };

    let base = config.base_url.trim_end_matches('/');
    let endpoint = format!("{base}/v1/chat/completions");

    let http_response = reqwest::blocking::Client::new()
        .post(endpoint)
        .json(&request)
        .send()
        .context("failed to call Ollama")?
        .error_for_status()
        .context("Ollama returned an error response")?;
    let decoded: ChatResponse = http_response
        .json()
        .context("failed to decode Ollama response")?;

    // Only the first choice is consulted; any further choices are ignored.
    let first_choice = decoded
        .choices
        .into_iter()
        .next()
        .context("Ollama response did not contain any choices")?;

    parse_candidates(text, &first_choice.message.content, rules)
}
122
123#[cfg(not(feature = "ollama"))]
124pub fn discover_candidates(
125    _config: &LlmConfig,
126    _text: &str,
127    _rules: RedactionRules,
128) -> Result<Vec<Finding>> {
129    anyhow::bail!("this binary was built without the `ollama` feature")
130}
131
132fn allowed_llm_kinds(rules: RedactionRules) -> Vec<&'static str> {
133    let mut kinds = Vec::new();
134    if rules.person {
135        kinds.push("person");
136    }
137    if rules.organization {
138        kinds.push("organization");
139    }
140    kinds
141}
142
143fn parse_candidates(text: &str, content: &str, rules: RedactionRules) -> Result<Vec<Finding>> {
144    let envelope: CandidateEnvelope =
145        serde_json::from_str(content).context("failed to parse LLM JSON response")?;
146    let mut findings = Vec::new();
147    let mut occupied_ranges: Vec<Range<usize>> = Vec::new();
148    let match_positions = build_match_positions(
149        text,
150        envelope
151            .candidates
152            .iter()
153            .map(|candidate| candidate.value.as_str()),
154    );
155    let mut consumed_positions = HashMap::<String, usize>::new();
156
157    for candidate in envelope.candidates {
158        let Some(kind) = map_kind(&candidate.kind, rules) else {
159            continue;
160        };
161        if let Some(start) = find_next_unoccupied_match(
162            &candidate.value,
163            &match_positions,
164            &mut consumed_positions,
165            &occupied_ranges,
166        ) {
167            let range = start..start + candidate.value.len();
168            occupied_ranges.push(range.clone());
169            findings.push(Finding {
170                kind,
171                source: FindingSource::Llm,
172                match_text: candidate.value.clone(),
173                normalized_key: normalize(kind, &candidate.value),
174                confidence: candidate.confidence.unwrap_or(60).min(100),
175                start: range.start,
176                end: range.end,
177            });
178        }
179    }
180
181    Ok(findings)
182}
183
184fn build_match_positions<'a>(
185    text: &str,
186    values: impl IntoIterator<Item = &'a str>,
187) -> HashMap<String, Vec<usize>> {
188    let mut positions = HashMap::new();
189    for value in values {
190        if value.is_empty() || positions.contains_key(value) {
191            continue;
192        }
193        positions.insert(value.to_string(), collect_match_positions(text, value));
194    }
195    positions
196}
197
/// Hands out the next occurrence of `value` that does not collide with a claimed span.
///
/// `match_positions` maps each value to the start offsets of its occurrences, and
/// `consumed_positions` remembers, per value, the index of the first occurrence that
/// has not been examined yet. Every occurrence examined by this call is consumed,
/// whether or not it is returned.
///
/// Returns `None` when `value` has no entry in `match_positions` (in which case
/// `consumed_positions` is left untouched) or when every remaining occurrence overlaps
/// one of `occupied_ranges`.
fn find_next_unoccupied_match(
    value: &str,
    match_positions: &HashMap<String, Vec<usize>>,
    consumed_positions: &mut HashMap<String, usize>,
    occupied_ranges: &[Range<usize>],
) -> Option<usize> {
    let starts = match_positions.get(value)?;
    let cursor = consumed_positions.entry(value.to_string()).or_insert(0);
    let width = value.len();

    while let Some(&begin) = starts.get(*cursor) {
        *cursor += 1;
        let finish = begin + width;
        // Two half-open spans collide when each one starts before the other ends.
        let collides = occupied_ranges
            .iter()
            .any(|taken| begin < taken.end && taken.start < finish);
        if !collides {
            return Some(begin);
        }
    }

    None
}
221
/// Returns the byte offset of every occurrence of `value` in `text`, in ascending order.
///
/// Occurrences may overlap (`"aa"` is reported at both 0 and 1 in `"aaa"`). An empty
/// `value` yields no positions.
fn collect_match_positions(text: &str, value: &str) -> Vec<usize> {
    // Every match begins with the first character of `value`, so stepping past that
    // character always lands on a char boundary. Stepping by a single byte would land
    // inside a multi-byte character and make the next slice panic.
    let Some(step) = value.chars().next().map(char::len_utf8) else {
        return Vec::new();
    };

    let mut positions = Vec::new();
    let mut search_start = 0;
    while let Some(offset) = text
        .get(search_start..)
        .and_then(|rest| rest.find(value))
    {
        let start = search_start + offset;
        positions.push(start);
        search_start = start + step;
    }

    positions
}
240
241fn map_kind(kind: &str, rules: RedactionRules) -> Option<FindingKind> {
242    match kind {
243        "person" if rules.person => Some(FindingKind::Person),
244        "organization" if rules.organization => Some(FindingKind::Organization),
245        _ => None,
246    }
247}
248
#[cfg(test)]
mod tests {
    use crate::{FindingKind, RedactionRules};

    use super::parse_candidates;

    /// Starts from the default rules and switches on each of `kinds`, in order.
    fn rules_enabling(kinds: &[FindingKind]) -> RedactionRules {
        kinds
            .iter()
            .fold(RedactionRules::default(), |rules, &kind| {
                rules.with_kind(kind, true)
            })
    }

    /// Repeated values must be assigned to successive occurrences, not the same one.
    #[test]
    fn parse_candidates_maps_duplicate_values_to_distinct_occurrences() {
        let text = "Alice met Alice at Acme. Alice returned to Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"organization","value":"Acme","confidence":75}
            ]
        }"#;
        let rules = rules_enabling(&[FindingKind::Person, FindingKind::Organization]);

        let findings = parse_candidates(text, content, rules).expect("parse candidates");
        let spans: Vec<_> = findings
            .iter()
            .map(|finding| (finding.match_text.as_str(), finding.start, finding.end))
            .collect();

        let expected = vec![
            ("Alice", 0, 5),
            ("Alice", 10, 15),
            ("Acme", 19, 23),
            ("Acme", 43, 47),
        ];
        assert_eq!(spans, expected);
    }

    /// More duplicates than occurrences: the surplus candidate is discarded.
    #[test]
    fn parse_candidates_drops_extra_duplicate_values_without_available_matches() {
        let text = "Alice joined Acme with Alice.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"person","value":"Alice","confidence":60}
            ]
        }"#;
        let rules = rules_enabling(&[FindingKind::Person]);

        let findings = parse_candidates(text, content, rules).expect("parse candidates");
        let alice_positions: Vec<_> = findings
            .iter()
            .map(|finding| (finding.start, finding.end))
            .collect();

        assert_eq!(alice_positions, vec![(0, 5), (23, 28)]);
    }

    /// Occurrence cursors are tracked per value, so candidate order does not shift spans.
    #[test]
    fn parse_candidates_keeps_different_values_from_stealing_each_other() {
        let text = "Alice Acme Alice";
        let content = r#"{
            "candidates": [
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65}
            ]
        }"#;
        let rules = rules_enabling(&[FindingKind::Person, FindingKind::Organization]);

        let findings = parse_candidates(text, content, rules).expect("parse candidates");
        let spans: Vec<_> = findings
            .iter()
            .map(|finding| (finding.match_text.as_str(), finding.start, finding.end))
            .collect();

        let expected = vec![("Acme", 6, 10), ("Alice", 0, 5), ("Alice", 11, 16)];
        assert_eq!(spans, expected);
    }

    /// Candidates of a kind that was not enabled must not produce findings.
    #[test]
    fn parse_candidates_skips_people_when_person_detection_is_disabled() {
        let text = "Alice met Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"organization","value":"Acme","confidence":80}
            ]
        }"#;
        let rules = rules_enabling(&[FindingKind::Organization]);

        let findings = parse_candidates(text, content, rules).expect("parse candidates");
        let values: Vec<_> = findings
            .iter()
            .map(|finding| (finding.kind, finding.match_text.as_str()))
            .collect();

        assert_eq!(values, vec![(FindingKind::Organization, "Acme")]);
    }
}