Skip to main content

cloudiful_redactor/llm/
mod.rs

1#![cfg_attr(not(feature = "ollama"), allow(dead_code))]
2
3use anyhow::{Context, Result};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::ops::Range;
7
8use crate::detect::normalize;
9use crate::types::{Finding, FindingKind, FindingSource};
10
/// Connection settings for the chat-completions endpoint queried by
/// `discover_candidates`.
#[derive(Clone, Debug, Default)]
pub struct LlmConfig {
    /// Base URL of the server. Trailing slashes are trimmed before
    /// `/v1/chat/completions` is appended.
    pub base_url: String,
    /// Model identifier sent in the `model` field of the request body.
    pub model: String,
}
16
17#[derive(Debug, Clone, Serialize)]
18struct ChatRequest<'a> {
19    model: &'a str,
20    messages: Vec<Message<'a>>,
21    stream: bool,
22    temperature: f32,
23    response_format: ResponseFormat<'a>,
24}
25
26#[derive(Debug, Clone, Serialize)]
27struct Message<'a> {
28    role: &'a str,
29    content: String,
30}
31
32#[derive(Debug, Clone, Serialize)]
33struct ResponseFormat<'a> {
34    #[serde(rename = "type")]
35    kind: &'a str,
36}
37
38#[derive(Debug, Clone, Deserialize)]
39struct ChatResponse {
40    choices: Vec<Choice>,
41}
42
43#[derive(Debug, Clone, Deserialize)]
44struct Choice {
45    message: ChatMessage,
46}
47
48#[derive(Debug, Clone, Deserialize)]
49struct ChatMessage {
50    content: String,
51}
52
53#[derive(Debug, Clone, Deserialize)]
54struct CandidateEnvelope {
55    candidates: Vec<Candidate>,
56}
57
58#[derive(Debug, Clone, Deserialize)]
59struct Candidate {
60    kind: String,
61    value: String,
62    confidence: Option<u8>,
63}
64
/// Asks the chat-completions endpoint described by `config` to list sensitive
/// items in `text` and converts the reply into findings.
///
/// When `person_detection` is `false`, only organizations are requested from
/// the model and person candidates in the reply are ignored.
///
/// # Errors
///
/// Fails when the HTTP request cannot be sent, the server answers with an
/// error status, the response body cannot be decoded, the response contains
/// no choices, or the returned content is not the expected JSON.
#[cfg(feature = "ollama")]
pub fn discover_candidates(
    config: &LlmConfig,
    text: &str,
    person_detection: bool,
) -> Result<Vec<Finding>> {
    let allowed_kinds = if person_detection {
        "person, organization"
    } else {
        "organization"
    };

    let system_message = Message {
        role: "system",
        content: String::from("Return compact JSON only. Do not rewrite the source text."),
    };
    let user_message = Message {
        role: "user",
        content: format!(
            "Find sensitive items in the input text. Return JSON only with a top-level key named candidates. \
             Each candidate must include kind, value, confidence. Allowed kinds: {allowed_kinds}. \
             Only include exact values copied from the input text.\n\nInput:\n{text}"
        ),
    };
    let payload = ChatRequest {
        model: &config.model,
        messages: vec![system_message, user_message],
        stream: false,
        temperature: 0.0,
        response_format: ResponseFormat {
            kind: "json_object",
        },
    };

    // Trim trailing slashes so a base URL given with or without one yields the same endpoint.
    let base = config.base_url.trim_end_matches('/');
    let url = format!("{base}/v1/chat/completions");

    let http_response = reqwest::blocking::Client::new()
        .post(url)
        .json(&payload)
        .send()
        .context("failed to call Ollama")?
        .error_for_status()
        .context("Ollama returned an error response")?;
    let decoded: ChatResponse = http_response
        .json()
        .context("failed to decode Ollama response")?;

    // Only the first choice is used; any further choices are discarded.
    let first_choice = decoded
        .choices
        .into_iter()
        .next()
        .context("Ollama response did not contain any choices")?;

    parse_candidates(text, &first_choice.message.content, person_detection)
}
122
123#[cfg(not(feature = "ollama"))]
124pub fn discover_candidates(
125    _config: &LlmConfig,
126    _text: &str,
127    _person_detection: bool,
128) -> Result<Vec<Finding>> {
129    anyhow::bail!("this binary was built without the `ollama` feature")
130}
131
132fn parse_candidates(text: &str, content: &str, person_detection: bool) -> Result<Vec<Finding>> {
133    let envelope: CandidateEnvelope =
134        serde_json::from_str(content).context("failed to parse LLM JSON response")?;
135    let mut findings = Vec::new();
136    let mut occupied_ranges: Vec<Range<usize>> = Vec::new();
137    let match_positions = build_match_positions(
138        text,
139        envelope
140            .candidates
141            .iter()
142            .map(|candidate| candidate.value.as_str()),
143    );
144    let mut consumed_positions = HashMap::<String, usize>::new();
145
146    for candidate in envelope.candidates {
147        let Some(kind) = map_kind(&candidate.kind, person_detection) else {
148            continue;
149        };
150        if let Some(start) = find_next_unoccupied_match(
151            &candidate.value,
152            &match_positions,
153            &mut consumed_positions,
154            &occupied_ranges,
155        ) {
156            let range = start..start + candidate.value.len();
157            occupied_ranges.push(range.clone());
158            findings.push(Finding {
159                kind,
160                source: FindingSource::Llm,
161                match_text: candidate.value.clone(),
162                normalized_key: normalize(kind, &candidate.value),
163                confidence: candidate.confidence.unwrap_or(60).min(100),
164                start: range.start,
165                end: range.end,
166            });
167        }
168    }
169
170    Ok(findings)
171}
172
173fn build_match_positions<'a>(
174    text: &str,
175    values: impl IntoIterator<Item = &'a str>,
176) -> HashMap<String, Vec<usize>> {
177    let mut positions = HashMap::new();
178    for value in values {
179        if value.is_empty() || positions.contains_key(value) {
180            continue;
181        }
182        positions.insert(value.to_string(), collect_match_positions(text, value));
183    }
184    positions
185}
186
/// Returns the start offset of the next occurrence of `value` that has not
/// been handed out yet and does not overlap any range in `occupied_ranges`.
///
/// `consumed_positions` stores, per value, how many entries of its position
/// list have already been examined. Every examined entry advances that
/// cursor, whether it is returned or skipped because of an overlap, so no
/// entry is considered twice. Values missing from `match_positions` yield
/// `None` and leave `consumed_positions` untouched.
fn find_next_unoccupied_match(
    value: &str,
    match_positions: &HashMap<String, Vec<usize>>,
    consumed_positions: &mut HashMap<String, usize>,
    occupied_ranges: &[Range<usize>],
) -> Option<usize> {
    let starts = match_positions.get(value)?;
    let cursor = consumed_positions.entry(value.to_string()).or_insert(0);

    // Half-open ranges overlap exactly when each one starts before the other ends.
    let overlaps_claimed = |start: usize| {
        let end = start + value.len();
        occupied_ranges
            .iter()
            .any(|used| used.start < end && start < used.end)
    };

    for &start in starts.iter().skip(*cursor) {
        *cursor += 1;
        if !overlaps_claimed(start) {
            return Some(start);
        }
    }

    None
}
210
/// Returns the byte offset of every occurrence of `value` in `text`, in
/// ascending order. Overlapping occurrences are included (`"aa"` in `"aaa"`
/// yields `[0, 1]`).
///
/// An empty `value` yields an empty list.
fn collect_match_positions(text: &str, value: &str) -> Vec<usize> {
    let Some(first_char) = value.chars().next() else {
        return Vec::new();
    };
    // After a match at `start`, resume one *character* later rather than one
    // byte later. `text[start..]` begins with `value`, so `start + step` is
    // always a char boundary; `start + 1` would land inside a multi-byte
    // character and make the slice below panic for non-ASCII values.
    let step = first_char.len_utf8();

    let mut positions = Vec::new();
    let mut search_start = 0;
    while let Some(offset) = text[search_start..].find(value) {
        let start = search_start + offset;
        positions.push(start);
        search_start = start + step;
    }

    positions
}
229
230fn map_kind(kind: &str, person_detection: bool) -> Option<FindingKind> {
231    match kind {
232        "person" if person_detection => Some(FindingKind::Person),
233        "organization" => Some(FindingKind::Organization),
234        _ => None,
235    }
236}
237
#[cfg(test)]
mod tests {
    use super::parse_candidates;
    use crate::FindingKind;

    /// Repeated candidates for one value are anchored to successive
    /// occurrences of that value in the text.
    #[test]
    fn parse_candidates_maps_duplicate_values_to_distinct_occurrences() {
        let text = "Alice met Alice at Acme. Alice returned to Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"organization","value":"Acme","confidence":75}
            ]
        }"#;

        let findings = parse_candidates(text, content, true).expect("parse candidates");
        let spans: Vec<_> = findings
            .iter()
            .map(|found| (found.match_text.as_str(), found.start, found.end))
            .collect();

        assert_eq!(
            spans,
            [
                ("Alice", 0, 5),
                ("Alice", 10, 15),
                ("Acme", 19, 23),
                ("Acme", 43, 47),
            ]
        );
    }

    /// More candidates than occurrences: the surplus candidate is dropped.
    #[test]
    fn parse_candidates_drops_extra_duplicate_values_without_available_matches() {
        let text = "Alice joined Acme with Alice.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65},
                {"kind":"person","value":"Alice","confidence":60}
            ]
        }"#;

        let findings = parse_candidates(text, content, true).expect("parse candidates");
        let alice_positions: Vec<_> = findings
            .iter()
            .map(|found| (found.start, found.end))
            .collect();

        assert_eq!(alice_positions, [(0, 5), (23, 28)]);
    }

    /// Position cursors are tracked per value, so a candidate for one value
    /// does not use up occurrences of another.
    #[test]
    fn parse_candidates_keeps_different_values_from_stealing_each_other() {
        let text = "Alice Acme Alice";
        let content = r#"{
            "candidates": [
                {"kind":"organization","value":"Acme","confidence":80},
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"person","value":"Alice","confidence":65}
            ]
        }"#;

        let findings = parse_candidates(text, content, true).expect("parse candidates");
        let spans: Vec<_> = findings
            .iter()
            .map(|found| (found.match_text.as_str(), found.start, found.end))
            .collect();

        assert_eq!(
            spans,
            [("Acme", 6, 10), ("Alice", 0, 5), ("Alice", 11, 16)]
        );
    }

    /// With person detection off, person candidates are ignored while
    /// organizations are still reported.
    #[test]
    fn parse_candidates_skips_people_when_person_detection_is_disabled() {
        let text = "Alice met Acme.";
        let content = r#"{
            "candidates": [
                {"kind":"person","value":"Alice","confidence":70},
                {"kind":"organization","value":"Acme","confidence":80}
            ]
        }"#;

        let findings = parse_candidates(text, content, false).expect("parse candidates");
        let values: Vec<_> = findings
            .iter()
            .map(|found| (found.kind, found.match_text.as_str()))
            .collect();

        assert_eq!(values, [(FindingKind::Organization, "Acme")]);
    }
}