skilllite_agent/long_text/
mod.rs1use anyhow::Result;
16
17use super::llm::LlmClient;
18use super::types::{self, chunk_str, safe_slice_from, safe_truncate, ChatMessage};
19
20mod filter;
21
22pub async fn maybe_process_user_input(client: &LlmClient, model: &str, input: &str) -> String {
31 let max_chars = types::get_user_input_max_chars();
32 if input.len() <= max_chars {
33 return input.to_string();
34 }
35
36 tracing::warn!(
37 len = input.len(),
38 max_chars,
39 "User input exceeds limit — summarizing via LLM"
40 );
41
42 let summary = summarize_long_content(client, model, input).await;
43 format!(
44 "[注:用户输入过长({} 字符),已通过 LLM 总结,原始内容已压缩]\n\n{}",
45 input.len(),
46 summary
47 )
48}
49
50pub fn truncate_content(content: &str, max_chars: usize) -> String {
52 if content.len() <= max_chars {
53 return content.to_string();
54 }
55 format!(
56 "{}\n\n[... 结果已截断,原文共 {} 字符,仅保留前 {} 字符 ...]",
57 safe_truncate(content, max_chars),
58 content.len(),
59 max_chars
60 )
61}
62
/// Summarizes arbitrarily long `content` with a map-reduce pipeline:
/// split into chunks, select a subset per the configured strategy, summarize
/// each selected chunk independently (map), then merge the concatenated
/// summaries (reduce) if they still exceed the output budget.
///
/// Never returns an error: per-chunk LLM failures degrade to placeholder
/// markers, and a failed final merge falls back to hard truncation, so the
/// caller always receives a bounded string.
pub async fn summarize_long_content(client: &LlmClient, model: &str, content: &str) -> String {
    // All sizing knobs come from runtime configuration.
    let chunk_size = types::get_chunk_size();
    let head_chunks_count = types::get_head_chunks();
    let tail_chunks_count = types::get_tail_chunks();
    let max_output_chars = types::get_max_output_chars();

    // NOTE(review): lengths are byte lengths (str::len) while user-facing
    // notes say "字符" (characters) — confirm this is intentional and that
    // safe_truncate guards char boundaries.
    let total_len = content.len();
    let head_size = head_chunks_count * chunk_size;
    let tail_size = tail_chunks_count * chunk_size;

    // `truncated_note` is a user-visible annotation describing what was
    // dropped by chunk selection; empty when nothing was dropped.
    let (chunks, truncated_note) = select_chunks(
        content,
        chunk_size,
        head_size,
        tail_size,
        total_len,
        head_chunks_count,
        tail_chunks_count,
    );

    if chunks.is_empty() {
        return "(内容为空)".to_string();
    }

    // The map stage may be configured to use a cheaper model than the
    // reduce stage (which always uses the caller-supplied `model`).
    let map_model = types::get_map_model(model);
    if map_model != model {
        tracing::debug!(
            "MapReduce: Map stage using {} (Reduce will use {})",
            map_model,
            model
        );
    }

    // Map stage: summarize chunks sequentially. Failed or empty chunks
    // become placeholders so positional context is preserved in the output.
    let mut chunk_summaries = Vec::new();
    for (idx, chunk) in chunks.iter().enumerate() {
        match summarize_single_chunk(client, &map_model, chunk).await {
            Ok(summary) if !summary.is_empty() => chunk_summaries.push(summary),
            Ok(_) => chunk_summaries.push(format!("[段 {} 总结为空]", idx + 1)),
            Err(e) => {
                tracing::warn!("Chunk {} summarization failed: {}", idx + 1, e);
                chunk_summaries.push(format!("[段 {} 总结失败]", idx + 1));
            }
        }
    }

    let combined = if truncated_note.is_empty() {
        chunk_summaries.join("\n\n")
    } else {
        format!("{}{}", chunk_summaries.join("\n\n"), truncated_note)
    };

    // Already within budget — skip the reduce stage entirely.
    if combined.len() <= max_output_chars {
        return combined;
    }

    // Reduce stage: merge with the primary model. An empty merge result or
    // an outright failure falls back to hard truncation of the combined
    // summaries, so output stays bounded either way.
    match merge_summaries(client, model, &combined).await {
        Ok(merged) => {
            let result = if merged.is_empty() {
                truncate_content(&combined, max_output_chars)
            } else {
                merged
            };
            // Re-append the note: the merge prompt may have dropped it.
            if truncated_note.is_empty() {
                result
            } else {
                format!("{}{}", result, truncated_note)
            }
        }
        Err(e) => {
            tracing::warn!("Final merge failed: {}", e);
            format!(
                "{}{}\n\n[... 总结后仍过长,已截断 ...]",
                truncate_content(&combined, max_output_chars),
                truncated_note
            )
        }
    }
}
144
145fn select_chunks(
147 content: &str,
148 chunk_size: usize,
149 head_size: usize,
150 tail_size: usize,
151 total_len: usize,
152 head_chunks_count: usize,
153 tail_chunks_count: usize,
154) -> (Vec<String>, String) {
155 let strategy = types::get_long_text_strategy();
156 let all_chunks: Vec<String> = chunk_str(content, chunk_size)
157 .into_iter()
158 .filter(|s| !s.trim().is_empty())
159 .map(|s| s.to_string())
160 .collect();
161
162 if all_chunks.is_empty() {
163 return (Vec::new(), String::new());
164 }
165
166 match strategy {
167 types::LongTextStrategy::HeadTailOnly => select_head_tail_only(
168 content,
169 &all_chunks,
170 chunk_size,
171 head_size,
172 tail_size,
173 total_len,
174 head_chunks_count,
175 tail_chunks_count,
176 ),
177 types::LongTextStrategy::HeadTailExtract => {
178 select_by_score(&all_chunks, total_len, head_chunks_count, tail_chunks_count)
179 }
180 types::LongTextStrategy::MapReduceFull => select_all_chunks(&all_chunks, total_len),
181 }
182}
183
/// Keeps every chunk (full MapReduce) and reports the total size and chunk
/// count in the annotation note.
fn select_all_chunks(all_chunks: &[String], total_len: usize) -> (Vec<String>, String) {
    let chunk_count = all_chunks.len();
    (
        all_chunks.to_vec(),
        format!(
            "\n\n[注:原文 {} 字符,全量 MapReduce 共 {} 段]",
            total_len, chunk_count
        ),
    )
}
192
193#[allow(clippy::too_many_arguments)]
194fn select_head_tail_only(
195 content: &str,
196 all_chunks: &[String],
197 chunk_size: usize,
198 head_size: usize,
199 tail_size: usize,
200 total_len: usize,
201 _head_chunks_count: usize,
202 _tail_chunks_count: usize,
203) -> (Vec<String>, String) {
204 if total_len <= head_size + tail_size {
205 (all_chunks.to_vec(), String::new())
206 } else {
207 let mut chunks = Vec::new();
208 let head_content = safe_truncate(content, head_size);
209 for chunk in chunk_str(head_content, chunk_size) {
210 if !chunk.trim().is_empty() {
211 chunks.push(chunk.to_string());
212 }
213 }
214 let tail_content = safe_slice_from(content, total_len.saturating_sub(tail_size));
215 for chunk in chunk_str(tail_content, chunk_size) {
216 if !chunk.trim().is_empty() {
217 chunks.push(chunk.to_string());
218 }
219 }
220 let note = format!("\n\n[注:原文 {} 字符,仅总结开头与结尾]", total_len);
221 (chunks, note)
222 }
223}
224
225fn select_by_score(
226 all_chunks: &[String],
227 total_len: usize,
228 head_chunks_count: usize,
229 tail_chunks_count: usize,
230) -> (Vec<String>, String) {
231 let total_chunks = all_chunks.len();
232 let top_k = types::get_extract_top_k(total_chunks, head_chunks_count, tail_chunks_count);
233
234 let mut scored: Vec<(usize, String, f64)> = all_chunks
235 .iter()
236 .enumerate()
237 .map(|(i, c)| (i, c.clone(), filter::score_chunk(c, i, total_chunks)))
238 .collect();
239
240 scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
241 let selected: Vec<(usize, String)> = scored
242 .into_iter()
243 .take(top_k)
244 .map(|(i, c, _)| (i, c))
245 .collect();
246 let mut ordered: Vec<(usize, String)> = selected;
247 ordered.sort_by_key(|(i, _)| *i);
248
249 let chunks: Vec<String> = ordered.into_iter().map(|(_, c)| c).collect();
250 let note = format!(
251 "\n\n[注:原文 {} 字符,共 {} 段,按信息量选取 {} 段]",
252 total_len, total_chunks, top_k
253 );
254 (chunks, note)
255}
256
257async fn summarize_single_chunk(client: &LlmClient, model: &str, chunk: &str) -> Result<String> {
258 let prompt = format!(
259 "Summarize the key information from this text excerpt. Keep it concise (under 500 chars).\n\
260 Focus on: rankings, statistics, facts, dates, names, key findings. Preserve numbers.\n\
261 Output in the same language as the input. Output summary only, no preamble.\n\n\
262 ---\n{}",
263 chunk
264 );
265
266 let messages = vec![ChatMessage::user(&prompt)];
267 let resp = client
268 .chat_completion(model, &messages, None, Some(0.3))
269 .await?;
270
271 let text = resp
272 .choices
273 .first()
274 .and_then(|c| c.message.content.clone())
275 .unwrap_or_default()
276 .trim()
277 .to_string();
278
279 Ok(text)
280}
281
282async fn merge_summaries(client: &LlmClient, model: &str, combined: &str) -> Result<String> {
283 let prompt = format!(
284 "The following are summaries of different parts of a long document.\n\
285 Merge them into one concise summary (under 3000 chars). Preserve all key facts, numbers, rankings.\n\
286 Output in the same language. Output summary only.\n\n\
287 ---\n{}",
288 combined
289 );
290
291 let messages = vec![ChatMessage::user(&prompt)];
292 let resp = client
293 .chat_completion(model, &messages, None, Some(0.3))
294 .await?;
295
296 let text = resp
297 .choices
298 .first()
299 .and_then(|c| c.message.content.clone())
300 .unwrap_or_default()
301 .trim()
302 .to_string();
303
304 Ok(text)
305}