1use anyhow::{Context, Result};
6use probe_code::extract::symbol_finder::find_symbol_in_file;
7use probe_code::language::parser::parse_file_for_code_blocks;
8use probe_code::models::SearchResult;
9use std::collections::HashSet;
10use std::fs;
11use std::path::Path;
12
13pub fn process_file_for_extraction(
25 path: &Path,
26 start_line: Option<usize>,
27 end_line: Option<usize>,
28 symbol: Option<&str>,
29 allow_tests: bool,
30 context_lines: usize,
31 specific_lines: Option<&HashSet<usize>>,
32) -> Result<SearchResult> {
33 let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
35
36 if debug_mode {
37 println!("\n[DEBUG] ===== Processing File for Extraction =====");
38 println!("[DEBUG] File path: {path:?}");
39 println!("[DEBUG] Start line: {start_line:?}");
40 println!("[DEBUG] End line: {end_line:?}");
41 println!("[DEBUG] Symbol: {symbol:?}");
42 println!("[DEBUG] Allow tests: {allow_tests}");
43 println!("[DEBUG] Context lines: {context_lines}");
44 println!("[DEBUG] Specific lines: {specific_lines:?}");
45 }
46
47 if !path.exists() {
49 if debug_mode {
50 println!("[DEBUG] Error: File does not exist");
51 }
52 return Err(anyhow::anyhow!("File does not exist: {:?}", path));
53 }
54
55 let content = fs::read_to_string(path).context(format!("Failed to read file: {path:?}"))?;
57 let lines: Vec<&str> = content.lines().collect();
58
59 if debug_mode {
60 println!("[DEBUG] File read successfully");
61 println!("[DEBUG] File size: {} bytes", content.len());
62 println!("[DEBUG] Line count: {}", lines.len());
63 }
64
65 if let Some(symbol_name) = symbol {
67 if debug_mode {
68 println!("[DEBUG] Looking for symbol: {symbol_name}");
69 }
70 return find_symbol_in_file(path, symbol_name, &content, allow_tests, context_lines);
72 }
73
74 if let (Some(start), Some(end)) = (start_line, end_line) {
76 if debug_mode {
77 println!("[DEBUG] Extracting line range: {start}-{end} (with AST merging)");
78 }
79
80 let mut clamped_start = start.clamp(1, lines.len());
83
84 let mut clamped_end = end.clamp(clamped_start, lines.len());
86
87 if clamped_start > lines.len() {
89 clamped_start = lines.len();
90 }
91
92 if clamped_end < clamped_start {
94 clamped_end = clamped_start;
95 }
96
97 if debug_mode && (clamped_start != start || clamped_end != end) {
98 println!(
99 "[DEBUG] Requested lines {start}-{end} out of range; clamping to {clamped_start}-{clamped_end}"
100 );
101 }
102
103 let start = clamped_start;
105 let end = clamped_end;
106
107 let mut needed_lines = HashSet::new();
109 for l in start..=end {
110 needed_lines.insert(l);
111 }
112
113 if let Some(lines_set) = specific_lines {
115 for &line in lines_set {
116 needed_lines.insert(line);
117 }
118 }
119
120 let code_blocks_result = parse_file_for_code_blocks(
121 &content,
122 file_extension(path),
123 &needed_lines,
124 allow_tests,
125 None,
126 );
127
128 match code_blocks_result {
129 Ok(blocks) if !blocks.is_empty() => {
130 let min_start = blocks.iter().map(|b| b.start_row).min().unwrap_or(0);
133 let max_end = blocks.iter().map(|b| b.end_row).max().unwrap_or(0);
134
135 let max_end = std::cmp::min(max_end, lines.len() - 1);
137
138 let min_start = std::cmp::min(min_start, max_end);
140
141 let merged_start = min_start + 1;
143 let merged_end = max_end + 1;
144
145 if debug_mode {
146 println!(
147 "[DEBUG] Found {} overlapping AST blocks, merging into lines {}-{}",
148 blocks.len(),
149 merged_start,
150 merged_end
151 );
152 }
153
154 let merged_content = lines[min_start..=max_end].join("\n");
155
156 let filename = path
158 .file_name()
159 .map(|f| f.to_string_lossy().to_string())
160 .unwrap_or_default();
161 let tokenized_content =
162 crate::ranking::preprocess_text_with_filename(&merged_content, &filename);
163
164 Ok(SearchResult {
165 file: path.to_string_lossy().to_string(),
166 lines: (merged_start, merged_end),
167 node_type: "merged_ast_range".to_string(),
168 code: merged_content,
169 matched_by_filename: None,
170 rank: None,
171 score: None,
172 tfidf_score: None,
173 bm25_score: None,
174 tfidf_rank: None,
175 bm25_rank: None,
176 new_score: None,
177 hybrid2_rank: None,
178 combined_score_rank: None,
179 file_unique_terms: None,
180 file_total_matches: None,
181 file_match_rank: None,
182 block_unique_terms: None,
183 block_total_matches: None,
184 parent_file_id: None,
185 block_id: None,
186 matched_keywords: None,
187 tokenized_content: Some(tokenized_content),
188 })
189 }
190 _ => {
191 if debug_mode {
193 println!(
194 "[DEBUG] No AST blocks found for the range {start}-{end}, falling back to literal lines"
195 );
196 }
197 let start_idx = start - 1;
198 let end_idx = end;
199 let range_content = lines[start_idx..end_idx].join("\n");
200 let filename = path
202 .file_name()
203 .map(|f| f.to_string_lossy().to_string())
204 .unwrap_or_default();
205 let tokenized_content =
206 crate::ranking::preprocess_text_with_filename(&range_content, &filename);
207
208 Ok(SearchResult {
209 file: path.to_string_lossy().to_string(),
210 lines: (start, end),
211 node_type: "range".to_string(),
212 code: range_content,
213 matched_by_filename: None,
214 rank: None,
215 score: None,
216 tfidf_score: None,
217 bm25_score: None,
218 tfidf_rank: None,
219 bm25_rank: None,
220 new_score: None,
221 hybrid2_rank: None,
222 combined_score_rank: None,
223 file_unique_terms: None,
224 file_total_matches: None,
225 file_match_rank: None,
226 block_unique_terms: None,
227 block_total_matches: None,
228 parent_file_id: None,
229 block_id: None,
230 matched_keywords: None,
231 tokenized_content: Some(tokenized_content),
232 })
233 }
234 }
235 }
236 else if let Some(line_num) = start_line {
238 if debug_mode {
239 println!("[DEBUG] Single line extraction requested: line {line_num}");
240 }
241 let clamped_line_num = line_num.clamp(1, lines.len());
243
244 if debug_mode && clamped_line_num != line_num {
245 println!(
246 "[DEBUG] Requested line {line_num} out of bounds; clamping to {clamped_line_num}"
247 );
248 }
249
250 let line_num = clamped_line_num;
252
253 let mut needed_lines = HashSet::new();
255 needed_lines.insert(line_num);
256
257 if let Some(lines_set) = specific_lines {
259 for &line in lines_set {
260 needed_lines.insert(line);
261 }
262 }
263
264 match parse_file_for_code_blocks(
265 &content,
266 file_extension(path),
267 &needed_lines,
268 allow_tests,
269 None,
270 ) {
271 Ok(blocks) if !blocks.is_empty() => {
272 let min_start = blocks.iter().map(|b| b.start_row).min().unwrap_or(0);
275 let max_end = blocks.iter().map(|b| b.end_row).max().unwrap_or(0);
276
277 let max_end = std::cmp::min(max_end, lines.len() - 1);
279
280 let min_start = std::cmp::min(min_start, max_end);
282
283 let merged_start = min_start + 1;
284 let merged_end = max_end + 1;
285
286 if debug_mode {
287 println!(
288 "[DEBUG] Found {} AST blocks for line {}, merging into lines {}-{}",
289 blocks.len(),
290 line_num,
291 merged_start,
292 merged_end
293 );
294 }
295 let merged_content = lines[min_start..=max_end].join("\n");
296
297 let filename = path
299 .file_name()
300 .map(|f| f.to_string_lossy().to_string())
301 .unwrap_or_default();
302 let tokenized_content =
303 crate::ranking::preprocess_text_with_filename(&merged_content, &filename);
304
305 return Ok(SearchResult {
306 file: path.to_string_lossy().to_string(),
307 lines: (merged_start, merged_end),
308 node_type: "merged_ast_line".to_string(),
309 code: merged_content,
310 matched_by_filename: None,
311 rank: None,
312 score: None,
313 tfidf_score: None,
314 bm25_score: None,
315 tfidf_rank: None,
316 bm25_rank: None,
317 new_score: None,
318 hybrid2_rank: None,
319 combined_score_rank: None,
320 file_unique_terms: None,
321 file_total_matches: None,
322 file_match_rank: None,
323 block_unique_terms: None,
324 block_total_matches: None,
325 parent_file_id: None,
326 block_id: None,
327 matched_keywords: None,
328 tokenized_content: Some(tokenized_content),
329 });
330 }
331 _ => {
332 if debug_mode {
334 println!(
335 "[DEBUG] No AST blocks found for line {line_num}, using context-based fallback"
336 );
337 }
338
339 let file_line_count = lines.len();
341 let start_ctx = if line_num <= context_lines {
342 1
343 } else {
344 line_num - context_lines
345 };
346 let end_ctx = std::cmp::min(line_num + context_lines, file_line_count);
347
348 let start_idx = start_ctx - 1;
349 let end_idx = end_ctx;
350
351 let context_code = lines[start_idx..end_idx].join("\n");
352
353 let filename = path
355 .file_name()
356 .map(|f| f.to_string_lossy().to_string())
357 .unwrap_or_default();
358 let tokenized_content =
359 crate::ranking::preprocess_text_with_filename(&context_code, &filename);
360
361 return Ok(SearchResult {
362 file: path.to_string_lossy().to_string(),
363 lines: (start_ctx, end_ctx),
364 node_type: "context".to_string(),
365 code: context_code,
366 matched_by_filename: None,
367 rank: None,
368 score: None,
369 tfidf_score: None,
370 bm25_score: None,
371 tfidf_rank: None,
372 bm25_rank: None,
373 new_score: None,
374 hybrid2_rank: None,
375 combined_score_rank: None,
376 file_unique_terms: None,
377 file_total_matches: None,
378 file_match_rank: None,
379 block_unique_terms: None,
380 block_total_matches: None,
381 parent_file_id: None,
382 block_id: None,
383 matched_keywords: None,
384 tokenized_content: Some(tokenized_content),
385 });
386 }
387 }
388 } else if let Some(lines_set) = specific_lines {
389 if debug_mode {
391 println!("[DEBUG] Extracting specific lines: {lines_set:?}");
392 }
393
394 if lines_set.is_empty() {
395 if debug_mode {
396 println!("[DEBUG] No specific lines provided, returning entire file content");
397 }
398
399 let filename = path
401 .file_name()
402 .map(|f| f.to_string_lossy().to_string())
403 .unwrap_or_default();
404 let tokenized_content =
405 crate::ranking::preprocess_text_with_filename(&content, &filename);
406
407 return Ok(SearchResult {
408 file: path.to_string_lossy().to_string(),
409 lines: (1, lines.len()),
410 node_type: "file".to_string(),
411 code: content,
412 matched_by_filename: None,
413 rank: None,
414 score: None,
415 tfidf_score: None,
416 bm25_score: None,
417 tfidf_rank: None,
418 bm25_rank: None,
419 new_score: None,
420 hybrid2_rank: None,
421 combined_score_rank: None,
422 file_unique_terms: None,
423 file_total_matches: None,
424 file_match_rank: None,
425 block_unique_terms: None,
426 block_total_matches: None,
427 parent_file_id: None,
428 block_id: None,
429 matched_keywords: None,
430 tokenized_content: Some(tokenized_content),
431 });
432 }
433
434 let mut clamped_lines = HashSet::new();
436 let mut any_clamped = false;
437
438 for &line in lines_set {
439 if line == 0 || line > lines.len() {
440 if line > 0 {
441 clamped_lines.insert(line.min(lines.len()));
443 }
444 any_clamped = true;
445 } else {
446 clamped_lines.insert(line);
447 }
448 }
449
450 if debug_mode && any_clamped {
451 println!(
452 "[DEBUG] Some requested lines were out of bounds; clamping to valid range 1-{}",
453 lines.len()
454 );
455 }
456
457 let lines_set = &clamped_lines;
459
460 let code_blocks_result = parse_file_for_code_blocks(
462 &content,
463 file_extension(path),
464 lines_set,
465 allow_tests,
466 None,
467 );
468
469 match code_blocks_result {
470 Ok(blocks) if !blocks.is_empty() => {
471 let min_start = blocks.iter().map(|b| b.start_row).min().unwrap_or(0);
473 let max_end = blocks.iter().map(|b| b.end_row).max().unwrap_or(0);
474
475 let max_end = std::cmp::min(max_end, lines.len() - 1);
477
478 let min_start = std::cmp::min(min_start, max_end);
480
481 let merged_start = min_start + 1;
483 let merged_end = max_end + 1;
484
485 if debug_mode {
486 println!(
487 "[DEBUG] Found {} AST blocks for specific lines, merging into lines {}-{}",
488 blocks.len(),
489 merged_start,
490 merged_end
491 );
492 }
493
494 let merged_content = lines[min_start..=max_end].join("\n");
495
496 let filename = path
498 .file_name()
499 .map(|f| f.to_string_lossy().to_string())
500 .unwrap_or_default();
501 let tokenized_content =
502 crate::ranking::preprocess_text_with_filename(&merged_content, &filename);
503
504 return Ok(SearchResult {
505 file: path.to_string_lossy().to_string(),
506 lines: (merged_start, merged_end),
507 node_type: "merged_ast_specific_lines".to_string(),
508 code: merged_content,
509 matched_by_filename: None,
510 rank: None,
511 score: None,
512 tfidf_score: None,
513 bm25_score: None,
514 tfidf_rank: None,
515 bm25_rank: None,
516 new_score: None,
517 hybrid2_rank: None,
518 combined_score_rank: None,
519 file_unique_terms: None,
520 file_total_matches: None,
521 file_match_rank: None,
522 block_unique_terms: None,
523 block_total_matches: None,
524 parent_file_id: None,
525 block_id: None,
526 matched_keywords: None,
527 tokenized_content: Some(tokenized_content),
528 });
529 }
530 _ => {
531 if debug_mode {
533 println!(
534 "[DEBUG] No AST blocks found for specific lines, falling back to literal lines"
535 );
536 }
537
538 let min_line = *lines_set.iter().min().unwrap_or(&1);
540 let max_line = *lines_set.iter().max().unwrap_or(&lines.len());
541
542 let start = if min_line <= context_lines {
544 1
545 } else {
546 min_line - context_lines
547 };
548 let end = std::cmp::min(max_line + context_lines, lines.len());
549
550 let start_idx = start - 1;
551 let end_idx = end;
552 let range_content = lines[start_idx..end_idx].join("\n");
553
554 let filename = path
556 .file_name()
557 .map(|f| f.to_string_lossy().to_string())
558 .unwrap_or_default();
559 let tokenized_content =
560 crate::ranking::preprocess_text_with_filename(&range_content, &filename);
561
562 return Ok(SearchResult {
563 file: path.to_string_lossy().to_string(),
564 lines: (start, end),
565 node_type: "specific_lines".to_string(),
566 code: range_content,
567 matched_by_filename: None,
568 rank: None,
569 score: None,
570 tfidf_score: None,
571 bm25_score: None,
572 tfidf_rank: None,
573 bm25_rank: None,
574 new_score: None,
575 hybrid2_rank: None,
576 combined_score_rank: None,
577 file_unique_terms: None,
578 file_total_matches: None,
579 file_match_rank: None,
580 block_unique_terms: None,
581 block_total_matches: None,
582 parent_file_id: None,
583 block_id: None,
584 matched_keywords: None,
585 tokenized_content: Some(tokenized_content),
586 });
587 }
588 }
589 } else {
590 if debug_mode {
592 println!("[DEBUG] No line or range specified, returning entire file content");
593 }
594
595 let filename = path
597 .file_name()
598 .map(|f| f.to_string_lossy().to_string())
599 .unwrap_or_default();
600 let tokenized_content = crate::ranking::preprocess_text_with_filename(&content, &filename);
601
602 Ok(SearchResult {
603 file: path.to_string_lossy().to_string(),
604 lines: (1, lines.len()),
605 node_type: "file".to_string(),
606 code: content,
607 matched_by_filename: None,
608 rank: None,
609 score: None,
610 tfidf_score: None,
611 bm25_score: None,
612 tfidf_rank: None,
613 bm25_rank: None,
614 new_score: None,
615 hybrid2_rank: None,
616 combined_score_rank: None,
617 file_unique_terms: None,
618 file_total_matches: None,
619 file_match_rank: None,
620 block_unique_terms: None,
621 block_total_matches: None,
622 parent_file_id: None,
623 block_id: None,
624 matched_keywords: None,
625 tokenized_content: Some(tokenized_content),
626 })
627 }
628}
629
/// Returns the file's extension as a `&str`, or `""` when the path has no
/// extension or the extension is not valid UTF-8.
fn file_extension(path: &Path) -> &str {
    match path.extension() {
        Some(ext) => ext.to_str().unwrap_or(""),
        None => "",
    }
}