use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use docrawl::{crawl, Config, CrawlConfig};
use std::fs;
use std::path::{Path, PathBuf};
use std::time::Duration;
use url::Url;
use walkdir::WalkDir;

use crate::rag::embeddings::preprocessing;
use crate::rag::{DocumentChunk, DocumentMetadata, RagConfig, SourceType};

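/// Builds the local RAG index: single files, directories, and crawled
/// documentation sites all end up as `DocumentChunk`s.
///
/// A minimal usage sketch (hedged; assumes a `RagConfig` value is already at
/// hand):
///
/// ```ignore
/// let indexer = Indexer::new(&config)?;
/// let chunks = indexer.index_directory(PathBuf::from("./docs"))?;
/// println!("indexed {} chunks", chunks.len());
/// ```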
pub struct Indexer {
    config: RagConfig,
    index_path: PathBuf,
}

impl Indexer {
    pub fn new(config: &RagConfig) -> Result<Self> {
        // Expand a leading `~` to the user's home directory.
        let index_path = if config.index_path.to_string_lossy().starts_with("~") {
            let home = std::env::var("HOME")
                .or_else(|_| std::env::var("USERPROFILE"))
                .map_err(|_| anyhow!("Cannot determine home directory"))?;
            let path_str = config.index_path.to_string_lossy();
            let without_tilde = path_str.strip_prefix("~/").unwrap_or(&path_str[1..]);
            PathBuf::from(home).join(without_tilde)
        } else {
            config.index_path.clone()
        };

        if !index_path.exists() {
            fs::create_dir_all(&index_path)
                .map_err(|e| anyhow!("Failed to create index directory {:?}: {}", index_path, e))?;
        }

        Ok(Self {
            config: config.clone(),
            index_path,
        })
    }

    pub fn get_index_path(&self) -> &PathBuf {
        &self.index_path
    }

    pub fn index_document(&self, path: PathBuf) -> Result<Vec<DocumentChunk>> {
        index_document(path, &self.config)
    }

    pub fn index_directory(&self, dir_path: PathBuf) -> Result<Vec<DocumentChunk>> {
        let documents = find_documents(&dir_path)?;
        let mut all_chunks = Vec::new();

        for doc_path in documents {
            match self.index_document(doc_path.clone()) {
                Ok(mut chunks) => all_chunks.append(&mut chunks),
                Err(e) => {
                    log::warn!("Failed to index {:?}: {}", doc_path, e);
                    continue;
                }
            }
        }

        log::info!(
            "Indexed {} chunks from directory {}",
            all_chunks.len(),
            dir_path.display()
        );
        Ok(all_chunks)
    }

    pub async fn index_url(&self, url: String) -> Result<Vec<DocumentChunk>> {
        log::info!("Indexing single URL (no crawling): {}", url);

        self.index_url_deep(url, Some(0), false).await
    }

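    /// Crawls `url` with docrawl and indexes every markdown page produced.
    ///
    /// Depth semantics, as wired below: an explicit `crawl_depth` wins,
    /// `crawl_all` removes the depth cap entirely, and otherwise the crawl
    /// defaults to three levels. A minimal sketch (hedged; `Some(0)` fetches
    /// just the starting page, as `index_url` does):
    ///
    /// ```ignore
    /// let chunks = indexer.index_url_deep(url, Some(0), false).await?;
    /// ```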
    pub async fn index_url_deep(
        &self,
        url: String,
        crawl_depth: Option<u32>,
        crawl_all: bool,
    ) -> Result<Vec<DocumentChunk>> {
        log::info!(
            "Starting docrawl of URL: {} (depth: {:?}, crawl_all: {})",
            url,
            crawl_depth,
            crawl_all
        );

        let parsed_url =
            url::Url::parse(&url).map_err(|e| anyhow!("Invalid URL format '{}': {}", url, e))?;

        match parsed_url.scheme() {
            "http" | "https" => {}
            scheme => {
                return Err(anyhow!(
                    "Unsupported URL scheme '{}'. Only HTTP and HTTPS are allowed.",
                    scheme
                ))
            }
        }

        // Crawl into a unique temporary directory; it is cleaned up below.
        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&temp_dir)?;

        let base_url = Url::parse(&url)?;

        let config = CrawlConfig {
            base_url,
            output_dir: temp_dir.clone(),
            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
            max_depth: if let Some(depth) = crawl_depth {
                Some(depth as usize)
            } else if crawl_all {
                None // No limit: crawl the whole site.
            } else {
                Some(3) // Default depth.
            },
            rate_limit_per_sec: 10,
            follow_sitemaps: true,
            concurrency: 4,
            timeout: Some(Duration::from_secs(30)),
            resume: false,
            config: Config::default(),
        };

        log::info!("Running docrawl on: {}", url);
        match crawl(config).await {
            Ok(stats) => {
                log::info!(
                    "Docrawl completed successfully, processed {} pages",
                    stats.pages
                );
            }
            Err(e) => {
                let _ = std::fs::remove_dir_all(&temp_dir);
                return Err(anyhow!("Docrawl failed: {}", e));
            }
        }

        let mut all_chunks = Vec::new();
        let markdown_files = self.find_markdown_files(&temp_dir)?;

        log::info!(
            "Processing {} markdown files from docrawl",
            markdown_files.len()
        );

        for (index, md_file) in markdown_files.iter().enumerate() {
            log::debug!(
                "Processing markdown file {}/{}: {}",
                index + 1,
                markdown_files.len(),
                md_file.display()
            );

            match self.process_markdown_file(md_file, &url).await {
                Ok(chunks) => {
                    let chunk_count = chunks.len();
                    all_chunks.extend(chunks);
                    log::debug!(
                        "Successfully processed markdown: {} ({} chunks)",
                        md_file.display(),
                        chunk_count
                    );
                }
                Err(e) => {
                    log::warn!("Failed to process markdown '{}': {}", md_file.display(), e);
                }
            }
        }

        if let Err(e) = std::fs::remove_dir_all(&temp_dir) {
            log::warn!("Failed to clean up temporary directory: {}", e);
        }

        log::info!(
            "Successfully indexed {} chunks from {} markdown files via docrawl of: {}",
            all_chunks.len(),
            markdown_files.len(),
            url
        );

        Ok(all_chunks)
    }

    fn find_markdown_files(&self, dir: &Path) -> Result<Vec<PathBuf>> {
        let mut markdown_files = Vec::new();

        for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
            let path = entry.path();
            if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("md") {
                markdown_files.push(path.to_path_buf());
            }
        }

        Ok(markdown_files)
    }

    async fn process_markdown_file(
        &self,
        md_file: &Path,
        base_url: &str,
    ) -> Result<Vec<DocumentChunk>> {
        let content = std::fs::read_to_string(md_file)?;

        if content.trim().is_empty() {
            return Err(anyhow!(
                "Markdown file contains no content: {}",
                md_file.display()
            ));
        }

        let metadata = self.create_markdown_metadata(md_file, &content, base_url)?;

        let (title, sections) = detect_structure(&content, md_file);

        let page_url = self.derive_page_url(md_file, base_url);

        let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

        let mut document_chunks = Vec::new();
        for (i, chunk_content) in chunks.into_iter().enumerate() {
            let section = find_section_for_chunk(&chunk_content, &sections);

            let chunk = DocumentChunk {
                id: format!("{}_{}", page_url, i),
                content: preprocessing::clean_text(&chunk_content),
                source_path: PathBuf::from(&page_url),
                source_type: SourceType::Web,
                title: title.clone(),
                section: section.clone(),
                chunk_index: i,
                metadata: metadata.clone(),
            };

            document_chunks.push(chunk);
        }

        Ok(document_chunks)
    }

    fn create_markdown_metadata(
        &self,
        md_file: &Path,
        content: &str,
        base_url: &str,
    ) -> Result<DocumentMetadata> {
        let file_metadata = std::fs::metadata(md_file)?;
        let modified_time = file_metadata.modified()?;
        let modified_datetime = chrono::DateTime::<chrono::Utc>::from(modified_time);

        let mut tags = extract_tags_from_path(md_file);
        tags.push("documentation".to_string());
        tags.push("crawled".to_string());

        if let Some(domain) = extract_domain_from_url(base_url) {
            tags.push(domain);
        }

        let language = detect_language(md_file);

        Ok(DocumentMetadata {
            file_type: "markdown".to_string(),
            size: content.len() as u64,
            modified: modified_datetime,
            tags,
            language,
        })
    }

    fn derive_page_url(&self, md_file: &Path, base_url: &str) -> String {
        let file_name = md_file
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("page");

        if base_url.ends_with('/') {
            format!("{}{}", base_url, file_name)
        } else {
            format!("{}/{}", base_url, file_name)
        }
    }
}

const SUPPORTED_EXTENSIONS: &[&str] = &[
    // Documents and markup
    ".md",
    ".txt",
    ".pdf",
    ".doc",
    ".docx",
    ".rst",
    // Web and frontend
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".vue",
    ".svelte",
    ".html",
    ".css",
    ".scss",
    ".sass",
    ".less",
    // General-purpose and JVM languages
    ".py",
    ".rb",
    ".php",
    ".java",
    ".scala",
    ".kotlin",
    ".groovy",
    // Systems languages
    ".c",
    ".cpp",
    ".cc",
    ".cxx",
    ".h",
    ".hpp",
    ".rs",
    ".go",
    ".zig",
    // Functional languages
    ".ml",
    ".mli",
    ".hs",
    ".elm",
    ".clj",
    ".cljs",
    ".erl",
    ".ex",
    ".exs",
    // Config and data formats
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".xml",
    ".ini",
    ".env",
    ".properties",
    // Shell scripts
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    // Mobile and Apple platforms
    ".swift",
    ".m",
    ".mm",
    ".kt",
    ".dart",
    // Query and schema languages
    ".sql",
    ".graphql",
    ".prisma",
    // Scientific and editor scripting
    ".r",
    ".R",
    ".jl",
    ".lua",
    ".vim",
    ".el",
];

/// Target chunk size, in (approximate) tokens; `chunk_content` converts this
/// to a word count before chunking.
const DEFAULT_CHUNK_SIZE: usize = 500;

/// Overlap between consecutive chunks, in the same token units.
const DEFAULT_CHUNK_OVERLAP: usize = 50;
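
/// Walks `dir_path` (to a maximum depth of 10) and returns every supported,
/// non-hidden file under 100 MB, skipping build and VCS directories such as
/// `target/`, `.git/`, and `node_modules/`.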
pub fn find_documents(dir_path: &Path) -> Result<Vec<PathBuf>> {
    if !dir_path.exists() {
        return Err(anyhow!("Directory does not exist: {:?}", dir_path));
    }

    if !dir_path.is_dir() {
        return Err(anyhow!("Path is not a directory: {:?}", dir_path));
    }

    let mut documents = Vec::new();
    let max_depth = 10; // Guard against deeply nested trees.
    let max_file_size = 100 * 1024 * 1024; // 100 MB

    log::info!("Scanning directory for documents: {:?}", dir_path);

    for entry in WalkDir::new(dir_path)
        .max_depth(max_depth)
        .follow_links(false) // Avoid symlink loops.
        .into_iter()
        .filter_map(|e| e.ok())
    {
        let path = entry.path();

        if !path.is_file() {
            continue;
        }

        if !is_supported_file(path) {
            continue;
        }

        if let Ok(metadata) = entry.metadata() {
            if metadata.len() > max_file_size {
                log::warn!(
                    "Skipping large file ({}MB): {:?}",
                    metadata.len() / 1024 / 1024,
                    path
                );
                continue;
            }
        }

        if path
            .file_name()
            .and_then(|name| name.to_str())
            .map(|name| name.starts_with('.'))
            .unwrap_or(false)
        {
            log::debug!("Skipping hidden file: {:?}", path);
            continue;
        }

        let path_str = path.to_string_lossy();
        let skip_patterns = [
            "/target/",
            "/.git/",
            "/node_modules/",
            "/__pycache__/",
            "/.cache/",
            "/dist/",
            "/build/",
        ];

        if skip_patterns
            .iter()
            .any(|pattern| path_str.contains(pattern))
        {
            log::debug!("Skipping file in ignored directory: {:?}", path);
            continue;
        }

        documents.push(path.to_path_buf());
    }

    log::info!(
        "Found {} indexable documents in {:?} (max depth: {})",
        documents.len(),
        dir_path,
        max_depth
    );

    if documents.is_empty() {
        log::warn!(
            "No supported documents found in {:?}. Supported formats: {:?}",
            dir_path,
            SUPPORTED_EXTENSIONS
        );
    }

    Ok(documents)
}

pub fn is_supported_file(path: &Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| SUPPORTED_EXTENSIONS.contains(&format!(".{}", ext.to_lowercase()).as_str()))
        .unwrap_or(false)
}

/// Indexes a single file into `DocumentChunk`s, honoring the security
/// toggles in `RagConfig`: PDFs and code files are skipped (with a warning)
/// when the corresponding `allow_*` flag is disabled.
pub fn index_document(path: PathBuf, config: &RagConfig) -> Result<Vec<DocumentChunk>> {
    if !path.exists() {
        return Err(anyhow!("File does not exist: {:?}", path));
    }

    if !is_supported_file(&path) {
        return Err(anyhow!("Unsupported file type: {:?}", path));
    }

    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    if extension == "pdf" && !config.allow_pdf_processing {
        log::warn!("PDF processing disabled for security. Skipping: {:?}", path);
        return Ok(vec![]);
    }

    let code_extensions = [
        "js", "jsx", "ts", "tsx", "py", "rb", "php", "java", "scala", "kotlin", "rs", "go", "c",
        "cpp", "sh", "bash", "ps1",
    ];
    if code_extensions.contains(&extension.as_str()) && !config.allow_code_processing {
        log::warn!("Code processing disabled. Skipping: {:?}", path);
        return Ok(vec![]);
    }

    log::info!("Indexing document: {:?}", path);

    let content = extract_text(&path, config)?;
    if content.trim().is_empty() {
        return Err(anyhow!("Document contains no text content: {:?}", path));
    }

    let metadata = extract_metadata(&path)?;

    let (title, sections) = detect_structure(&content, &path);

    let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

    let mut document_chunks = Vec::new();
    for (i, chunk_content) in chunks.into_iter().enumerate() {
        let section = find_section_for_chunk(&chunk_content, &sections);

        let chunk = DocumentChunk {
            id: format!("{}_{}", path.to_string_lossy(), i),
            content: preprocessing::clean_text(&chunk_content),
            source_path: path.clone(),
            source_type: SourceType::Local,
            title: title.clone(),
            section: section.clone(),
            chunk_index: i,
            metadata: metadata.clone(),
        };

        document_chunks.push(chunk);
    }

    log::info!("Created {} chunks from {:?}", document_chunks.len(), path);
    Ok(document_chunks)
}

fn extract_text(path: &Path, config: &RagConfig) -> Result<String> {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        "md" | "txt" | "rst" => extract_text_file(path),
        "pdf" => extract_pdf_text(path),
        "doc" | "docx" => extract_doc_text(path),
        "js" | "jsx" | "ts" | "tsx" | "vue" | "svelte" | "html" | "css" | "scss" | "sass"
        | "less" | "py" | "rb" | "php" | "java" | "scala" | "kotlin" | "groovy" | "c" | "cpp"
        | "cc" | "cxx" | "h" | "hpp" | "rs" | "go" | "zig" | "ml" | "mli" | "hs" | "elm"
        | "clj" | "cljs" | "erl" | "ex" | "exs" | "swift" | "m" | "mm" | "kt" | "dart" | "r"
        | "jl" | "lua" | "vim" | "el" | "sql" | "graphql" | "prisma" => {
            extract_code_text(path, config)
        }
        "json" | "yaml" | "yml" | "toml" | "xml" | "ini" | "properties" => {
            extract_config_text(path, config)
        }
        "sh" | "bash" | "zsh" | "fish" | "ps1" | "bat" | "cmd" => extract_shell_text(path, config),
        "env" => extract_env_text(path, config),
        _ => Err(anyhow!("Unsupported file extension: {}", extension)),
    }
}

fn extract_text_file(path: &Path) -> Result<String> {
    fs::read_to_string(path).map_err(|e| anyhow!("Failed to read text file {:?}: {}", path, e))
}

fn extract_pdf_text(path: &Path) -> Result<String> {
    log::info!("Processing PDF file with security validation: {:?}", path);

    validate_pdf_security(path)?;

    // Full PDF text extraction is not performed; the file is indexed by
    // filename and basic metadata instead.
    let file_name = path
        .file_stem()
        .and_then(|name| name.to_str())
        .unwrap_or("unknown");

    let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);

    let mut content = String::new();
    content.push_str(&format!("PDF Document: {}\n", file_name));
    content.push_str(&format!("File size: {} bytes\n", file_size));
    content.push_str(&format!("Location: {}\n", path.display()));

    // Derive searchable keywords from the filename.
    let searchable_terms: Vec<&str> = file_name
        .split(|c: char| !c.is_alphanumeric())
        .filter(|term| term.len() > 2)
        .collect();

    if !searchable_terms.is_empty() {
        content.push_str("Keywords: ");
        content.push_str(&searchable_terms.join(", "));
        content.push('\n');
    }

    content.push_str("This document is indexed by filename and metadata.");

    log::info!(
        "Created indexable content for PDF {:?} ({} characters)",
        path,
        content.len()
    );
    Ok(content)
}

fn extract_doc_text(path: &Path) -> Result<String> {
    log::info!("Processing DOC/DOCX file: {:?}", path);

    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    if extension == "doc" {
        log::warn!("Legacy DOC format detected: {:?}", path);
        return create_doc_metadata(path, "DOC (Legacy Word Document)");
    }

    if extension == "docx" {
        match extract_docx_text_safe(path) {
            Ok(content) => {
                log::info!(
                    "Successfully extracted {} characters from DOCX: {:?}",
                    content.len(),
                    path
                );
                return Ok(content);
            }
            Err(e) => {
                log::warn!(
                    "Failed to extract DOCX text, using metadata fallback: {}",
                    e
                );
                return create_doc_metadata(path, "DOCX (Word Document)");
            }
        }
    }

    Err(anyhow!("Unsupported document format: {:?}", extension))
}

fn extract_docx_text_safe(path: &Path) -> Result<String> {
    use docx_rs::read_docx;

    let file_bytes = std::fs::read(path).map_err(|e| anyhow!("Failed to read DOCX file: {}", e))?;

    // Parsing validates the file; body text extraction is not yet implemented.
    let _docx = read_docx(&file_bytes).map_err(|e| anyhow!("Failed to parse DOCX file: {}", e))?;

    let mut text_content = String::new();

    text_content.push_str(&format!("DOCX Document from: {}\n", path.display()));
    text_content.push_str("Document content successfully parsed.\n");
    text_content.push_str("Note: Basic DOCX processing - text extraction can be enhanced.");

    Ok(text_content)
}

fn extract_code_text(path: &Path, config: &RagConfig) -> Result<String> {
    validate_code_security(path, &config.code_security_level)?;

    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read code file {:?}: {}", path, e))?;

    let cleaned = if config.mask_secrets {
        sanitize_code_content(&content)
    } else {
        content
    };
    Ok(cleaned)
}

fn extract_config_text(path: &Path, config: &RagConfig) -> Result<String> {
    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read config file {:?}: {}", path, e))?;

    let sanitized = if config.mask_secrets {
        mask_secrets(&content)
    } else {
        content
    };
    Ok(sanitized)
}

fn extract_shell_text(path: &Path, config: &RagConfig) -> Result<String> {
    validate_shell_security(path, &config.code_security_level)?;

    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read shell script {:?}: {}", path, e))?;

    let sanitized = if config.mask_secrets {
        sanitize_shell_content(&content)
    } else {
        content
    };
    Ok(sanitized)
}

fn extract_env_text(path: &Path, _config: &RagConfig) -> Result<String> {
    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read env file {:?}: {}", path, e))?;

    // Env files are always masked, regardless of config.
    let masked = mask_env_secrets(&content);
    Ok(masked)
}

fn validate_code_security(
    path: &Path,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    log::debug!("Running security validation on code file: {:?}", path);

    let metadata = fs::metadata(path)?;
    const MAX_CODE_SIZE: u64 = 100 * 1024 * 1024; // 100 MB
    if metadata.len() > MAX_CODE_SIZE {
        return Err(anyhow!(
            "Code file rejected: Size {} bytes exceeds maximum allowed size of {} bytes",
            metadata.len(),
            MAX_CODE_SIZE
        ));
    }

    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read code file for validation: {}", e))?;

    if is_potentially_obfuscated(&content) {
        match security_level {
            CodeSecurityLevel::Strict => {
                return Err(anyhow!(
                    "Code file rejected: Contains potentially obfuscated content"
                ));
            }
            CodeSecurityLevel::Moderate => {
                log::warn!("Code file may contain obfuscated content: {:?}", path);
            }
            CodeSecurityLevel::Permissive => {
                log::debug!("Obfuscated content check bypassed (permissive mode)");
            }
        }
    }

    validate_urls_in_code(&content, security_level)?;

    check_prompt_injection(&content, security_level)?;

    Ok(())
}

fn validate_shell_security(
    path: &Path,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    log::debug!(
        "Running enhanced security validation on shell script: {:?}",
        path
    );

    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read shell script for validation: {}", e))?;

    let dangerous_patterns = [
        r"rm\s+-rf\s+/",               // Recursive delete from root
        r"rm\s+-rf\s+\*",              // Recursive wildcard delete
        r":\(\)\s*\{\s*:\|\:&\s*\};:", // Fork bomb
        r"mkfs\.",                     // Filesystem format
        r"dd\s+if=/dev/(zero|random)", // Disk overwrite
        r">\s*/dev/sda",               // Write directly to a disk device
        r"curl.*\|\s*(ba)?sh",         // Pipe remote script to shell
        r"wget.*\|\s*(ba)?sh",         // Pipe remote script to shell
        r"eval\s+.*\$\(",              // eval of command substitution
        r"python\s+-c.*exec",          // Inline python exec
    ];

    let compiled_patterns: Vec<regex::Regex> = dangerous_patterns
        .iter()
        .filter_map(|pattern| regex::Regex::new(pattern).ok())
        .collect();

    for pattern in &compiled_patterns {
        if pattern.is_match(&content) {
            match security_level {
                CodeSecurityLevel::Strict | CodeSecurityLevel::Moderate => {
                    return Err(anyhow!(
                        "Shell script rejected: Contains potentially dangerous command pattern"
                    ));
                }
                CodeSecurityLevel::Permissive => {
                    log::warn!("Dangerous shell pattern detected but allowed in permissive mode");
                }
            }
        }
    }

    Ok(())
}

fn is_potentially_obfuscated(content: &str) -> bool {
    let lines: Vec<&str> = content.lines().collect();
    let mut suspicious_count = 0;

    let hex_regex = regex::Regex::new(r"\\x[0-9a-fA-F]{2}").unwrap();

    for line in lines {
        // Skip comment lines.
        if line.trim().starts_with("//")
            || line.trim().starts_with("#")
            || line.trim().starts_with("/*")
        {
            continue;
        }

        // Base64 helpers are a weak obfuscation signal.
        if line.contains("atob") || line.contains("btoa") || line.contains("base64") {
            suspicious_count += 1;
        }

        // Hex escape sequences.
        if hex_regex.is_match(line) {
            suspicious_count += 1;
        }

        // Heavy backslash escaping.
        if line.matches('\\').count() > 10 {
            suspicious_count += 1;
        }
    }

    suspicious_count > 5
}

fn validate_urls_in_code(
    content: &str,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    let url_pattern = regex::Regex::new(r#"https?://[^\s"']+"#).unwrap();

    // Known URL shorteners, often used to hide malicious destinations.
    let suspicious_domains = [
        "bit.ly",
        "tinyurl.com",
        "goo.gl",
        "ow.ly",
        "shorte.st",
        "adf.ly",
        "bc.vc",
        "bit.do",
        "soo.gd",
        "7.ly",
        "5z8.info",
    ];

    for url_match in url_pattern.find_iter(content) {
        let url = url_match.as_str();
        for domain in &suspicious_domains {
            if url.contains(domain) {
                match security_level {
                    CodeSecurityLevel::Strict => {
                        return Err(anyhow!(
                            "Code rejected: Contains suspicious URL shortener: {}",
                            url
                        ));
                    }
                    CodeSecurityLevel::Moderate => {
                        log::warn!("Suspicious URL shortener found in code: {}", url);
                    }
                    CodeSecurityLevel::Permissive => {
                        log::debug!("URL check bypassed (permissive mode): {}", url);
                    }
                }
            }
        }
    }

    Ok(())
}

fn check_prompt_injection(
    content: &str,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    let injection_patterns = [
        "ignore previous instructions",
        "disregard all prior",
        "forget everything above",
        "new instructions:",
        "SYSTEM PROMPT:",
        "###SYSTEM###",
        "</system>",
        "<|im_start|>",
        "<|im_end|>",
    ];

    // Compare case-insensitively: the haystack is lowercased, so patterns
    // written in uppercase must be lowercased too or they could never match.
    let content_lower = content.to_lowercase();
    for pattern in &injection_patterns {
        if content_lower.contains(&pattern.to_lowercase()) {
            match security_level {
                CodeSecurityLevel::Strict => {
                    return Err(anyhow!(
                        "Code rejected: Contains potential prompt injection pattern: {}",
                        pattern
                    ));
                }
                CodeSecurityLevel::Moderate => {
                    log::warn!("Potential prompt injection pattern detected: {}", pattern);
                }
                CodeSecurityLevel::Permissive => {
                    log::debug!("Prompt injection check bypassed (permissive mode)");
                }
            }
        }
    }

    Ok(())
}

fn sanitize_code_content(content: &str) -> String {
    mask_secrets(content)
}

fn sanitize_shell_content(content: &str) -> String {
    mask_secrets(content)
}
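
/// Masks common credential shapes (API keys, passwords, tokens, private key
/// blocks, GitHub/OpenAI tokens) before content is indexed.
///
/// A hedged sketch of the intended behavior; the input is hypothetical:
///
/// ```ignore
/// let masked = mask_secrets("api_key = \"abc123\"");
/// assert!(masked.contains("[MASKED]"));
/// assert!(!masked.contains("abc123"));
/// ```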
fn mask_secrets(content: &str) -> String {
    let mut result = content.to_string();

    let secret_patterns = [
        (
            r#"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
            "API_KEY=[MASKED]",
        ),
        (
            r#"(?i)(secret|password|passwd|pwd)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
            "SECRET=[MASKED]",
        ),
        (
            r#"(?i)(token|auth)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
            "TOKEN=[MASKED]",
        ),
        (r"(?i)bearer\s+[a-zA-Z0-9\-._~+/]+", "Bearer [MASKED]"),
        (
            r"-----BEGIN (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----[\s\S]*?-----END (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----",
            "[PRIVATE_KEY_MASKED]",
        ),
        (r"ghp_[a-zA-Z0-9]{36}", "ghp_[GITHUB_TOKEN_MASKED]"),
        (r"sk-[a-zA-Z0-9]{48}", "sk-[OPENAI_KEY_MASKED]"),
    ];

    for (pattern, replacement) in &secret_patterns {
        if let Ok(re) = regex::Regex::new(pattern) {
            result = re.replace_all(&result, *replacement).to_string();
        }
    }

    result
}

fn mask_env_secrets(content: &str) -> String {
    let mut result = String::new();

    for line in content.lines() {
        // Preserve blank lines and comments.
        if line.trim().is_empty() || line.trim().starts_with('#') {
            result.push_str(line);
            result.push('\n');
            continue;
        }

        // Mask the value of every KEY=VALUE pair.
        if let Some(eq_pos) = line.find('=') {
            let key = &line[..eq_pos];
            result.push_str(key);
            result.push_str("=[MASKED]\n");
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
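
/// Best-effort static screening of a PDF before indexing: enforces a 100 MB
/// cap, checks the `%PDF-` magic header and major version, and rejects files
/// whose first kilobyte references active content (JavaScript, launch
/// actions, embedded files, and similar).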
fn validate_pdf_security(path: &Path) -> Result<()> {
    log::debug!("Running security validation on PDF: {:?}", path);

    const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; // 100 MB

    let metadata = fs::metadata(path)?;
    if metadata.len() > MAX_PDF_SIZE {
        return Err(anyhow!(
            "PDF file rejected: Size {} bytes exceeds maximum allowed size of {} bytes ({}MB)",
            metadata.len(),
            MAX_PDF_SIZE,
            MAX_PDF_SIZE / (1024 * 1024)
        ));
    }

    // Inspect only the first 1KB: enough for the header and early dictionary keys.
    let mut buffer = vec![0u8; 1024];
    let file = fs::File::open(path)?;
    use std::io::Read;
    let mut reader = std::io::BufReader::new(file);
    let bytes_read = reader.read(&mut buffer)?;

    if bytes_read < 8 {
        return Err(anyhow!("PDF file rejected: File too small or corrupted"));
    }

    if !buffer.starts_with(b"%PDF-") {
        return Err(anyhow!(
            "PDF file rejected: Invalid PDF header - not a valid PDF file"
        ));
    }

    // The header looks like "%PDF-1.7"; accept major versions 1 and 2.
    if bytes_read >= 8 {
        let version_bytes = &buffer[5..8];
        if let Ok(version_str) = std::str::from_utf8(version_bytes) {
            if let Some(major_char) = version_str.chars().next() {
                if let Some(major) = major_char.to_digit(10) {
                    if !(1..=2).contains(&major) {
                        return Err(anyhow!(
                            "PDF file rejected: Unsupported PDF version {}",
                            version_str
                        ));
                    }
                }
            }
        }
    }

    let content = std::str::from_utf8(&buffer[..bytes_read]).unwrap_or("");

    let dangerous_patterns = [
        "/JavaScript",
        "/JS",
        "/OpenAction",
        "/AA", // Additional actions
        "/Launch",
        "/GoToE", // Embedded go-to actions
        "/GoToR", // Remote go-to actions
        "/ImportData",
        "/SubmitForm",
        "/URI",
        "/Sound",
        "/Movie",
        "/RichMedia",
        "/3D",
        "/Encrypt",
        "eval(",
        "unescape(",
        "String.fromCharCode(",
        "document.write(",
        "this.print(",
        "app.alert(",
        "xfa.host",
        "soap.connect",
        "util.printf",
    ];

    for pattern in &dangerous_patterns {
        if content.contains(pattern) {
            log::warn!(
                "PDF security violation: Found suspicious pattern '{}' in {}",
                pattern,
                path.display()
            );
            return Err(anyhow!(
                "PDF file rejected: Contains potentially malicious content pattern '{}'. PDF may contain embedded JavaScript or other dangerous elements.",
                pattern
            ));
        }
    }

    let embed_patterns = ["/EmbeddedFile", "/F ", "/UF ", "/Filespec"];
    for pattern in &embed_patterns {
        if content.contains(pattern) {
            log::warn!(
                "PDF security violation: Found embedded file pattern '{}' in {}",
                pattern,
                path.display()
            );
            return Err(anyhow!(
                "PDF file rejected: Contains embedded files which pose security risks"
            ));
        }
    }

    // Forms are allowed but logged, since XFA/AcroForm can carry scripts.
    let form_patterns = ["/XFA", "/AcroForm", "/Fields"];
    for pattern in &form_patterns {
        if content.contains(pattern) {
            log::warn!(
                "PDF security warning: Found form pattern '{}' in {}",
                pattern,
                path.display()
            );
        }
    }

    log::info!("PDF security validation passed for: {:?}", path);
    Ok(())
}

fn create_doc_metadata(path: &Path, doc_type: &str) -> Result<String> {
    let file_name = path
        .file_stem()
        .and_then(|name| name.to_str())
        .unwrap_or("unknown");

    let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);

    let mut content = String::new();
    content.push_str(&format!("{}: {}\n", doc_type, file_name));
    content.push_str(&format!("File size: {} bytes\n", file_size));
    content.push_str(&format!("Location: {}\n", path.display()));

    let searchable_terms: Vec<&str> = file_name
        .split(|c: char| !c.is_alphanumeric())
        .filter(|term| term.len() > 2)
        .collect();

    if !searchable_terms.is_empty() {
        content.push_str("Keywords: ");
        content.push_str(&searchable_terms.join(", "));
        content.push('\n');
    }

    if let Ok(modified) = fs::metadata(path).and_then(|m| m.modified()) {
        if let Ok(duration) = modified.duration_since(std::time::SystemTime::UNIX_EPOCH) {
            let datetime = chrono::DateTime::from_timestamp(duration.as_secs() as i64, 0)
                .unwrap_or_else(chrono::Utc::now);
            content.push_str(&format!("Modified: {}\n", datetime.format("%Y-%m-%d")));
        }
    }

    if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
        content.push_str(&format!("Format: {} document\n", extension.to_uppercase()));
    }

    Ok(content)
}

fn extract_metadata(path: &Path) -> Result<DocumentMetadata> {
    let metadata = fs::metadata(path)?;

    let file_type = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("unknown")
        .to_lowercase();

    let modified = metadata
        .modified()?
        .duration_since(std::time::UNIX_EPOCH)?
        .as_secs();

    let modified_datetime = DateTime::from_timestamp(modified as i64, 0).unwrap_or_else(Utc::now);

    let tags = extract_tags_from_path(path);

    let language = detect_language(path);

    Ok(DocumentMetadata {
        file_type,
        size: metadata.len(),
        modified: modified_datetime,
        tags,
        language,
    })
}

fn extract_tags_from_path(path: &Path) -> Vec<String> {
    let mut tags = Vec::new();

    // Use each (non-hidden) parent directory name as a tag.
    if let Some(parent) = path.parent() {
        for component in parent.components() {
            if let Some(name) = component.as_os_str().to_str() {
                if !name.starts_with('.') && name != "/" {
                    tags.push(name.to_lowercase());
                }
            }
        }
    }

    // Recognize a few well-known filename conventions.
    if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
        if filename.contains("readme") {
            tags.push("readme".to_string());
        }
        if filename.contains("api") {
            tags.push("api".to_string());
        }
        if filename.contains("guide") {
            tags.push("guide".to_string());
        }
        if filename.contains("tutorial") {
            tags.push("tutorial".to_string());
        }
    }

    tags
}

fn detect_language(_path: &Path) -> Option<String> {
    // Language detection is not implemented yet; default to English.
    Some("en".to_string())
}

fn detect_structure(content: &str, path: &Path) -> (Option<String>, Vec<String>) {
    let lines: Vec<&str> = content.lines().collect();
    let mut title = None;
    let mut sections = Vec::new();

    if path.extension().and_then(|s| s.to_str()) == Some("md") {
        for line in &lines {
            let trimmed = line.trim();

            // The first H1 becomes the title.
            if title.is_none() && trimmed.starts_with("# ") {
                title = Some(trimmed[2..].trim().to_string());
            }

            // H2 and H3 headings become sections.
            if let Some(stripped) = trimmed.strip_prefix("## ") {
                sections.push(stripped.trim().to_string());
            } else if let Some(stripped) = trimmed.strip_prefix("### ") {
                sections.push(stripped.trim().to_string());
            }
        }
    }

    // Fall back to a title derived from the filename.
    if title.is_none() {
        if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
            title = Some(filename.replace(['_', '-'], " "));
        }
    }

    (title, sections)
}

fn find_section_for_chunk(chunk: &str, sections: &[String]) -> Option<String> {
    // Attribute the chunk to the first section heading that appears in it.
    for section in sections {
        if chunk.contains(section) {
            return Some(section.clone());
        }
    }
    None
}
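
/// Converts the token-denominated chunk parameters to word counts with a
/// rough 0.75 words-per-token heuristic, then delegates to the shared word
/// chunker. For example, `DEFAULT_CHUNK_SIZE` = 500 tokens becomes
/// 500 * 0.75 = 375 words per chunk, with 50 * 0.75 = 37 words of overlap.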
fn chunk_content(content: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let word_chunk_size = (chunk_size as f32 * 0.75) as usize;
    let word_overlap = (overlap as f32 * 0.75) as usize;

    crate::rag::embeddings::preprocessing::chunk_text(content, word_chunk_size, word_overlap)
}

fn extract_domain_from_url(url: &str) -> Option<String> {
    if let Some(start) = url.find("://") {
        let after_protocol = &url[start + 3..];
        if let Some(end) = after_protocol.find('/') {
            Some(after_protocol[..end].to_string())
        } else {
            Some(after_protocol.to_string())
        }
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_supported_file() {
        assert!(is_supported_file(Path::new("test.md")));
        assert!(is_supported_file(Path::new("test.txt")));
        assert!(is_supported_file(Path::new("test.pdf")));
        assert!(is_supported_file(Path::new("test.rs")));
        assert!(!is_supported_file(Path::new("test.unknown")));
        assert!(!is_supported_file(Path::new("test")));
    }

    #[test]
    fn test_detect_structure() {
        let content = r#"# Main Title

Some introduction text.

## Section 1

Content for section 1.

## Section 2

Content for section 2.

### Subsection 2.1

More content.
"#;

        let path = Path::new("test.md");
        let (title, sections) = detect_structure(content, path);

        assert_eq!(title, Some("Main Title".to_string()));
        assert_eq!(sections.len(), 3);
        assert!(sections.contains(&"Section 1".to_string()));
        assert!(sections.contains(&"Section 2".to_string()));
        assert!(sections.contains(&"Subsection 2.1".to_string()));
    }

    #[test]
    fn test_extract_tags_from_path() {
        let path = Path::new("/docs/api/authentication/readme.md");
        let tags = extract_tags_from_path(path);

        assert!(tags.contains(&"docs".to_string()));
        assert!(tags.contains(&"api".to_string()));
        assert!(tags.contains(&"authentication".to_string()));
        assert!(tags.contains(&"readme".to_string()));
    }

    #[test]
    fn test_chunk_content() {
        let content = "This is a test document with multiple sentences. Each sentence should be preserved in the chunking process. We want to make sure the chunks are reasonable.";
        let chunks = chunk_content(content, 10, 2);

        assert!(chunks.len() > 1);
        assert!(!chunks[0].is_empty());

        // Consecutive chunks should share some words because of the overlap.
        if chunks.len() > 1 {
            let words1: Vec<&str> = chunks[0].split_whitespace().collect();
            let words2: Vec<&str> = chunks[1].split_whitespace().collect();

            let overlap_found = words1
                .iter()
                .rev()
                .take(5)
                .any(|word| words2.iter().take(5).any(|w| w == word));
            assert!(overlap_found);
        }
    }
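
    // Added sketches: exercise the masking and URL helpers above. Inputs are
    // hypothetical, chosen only to trip the documented patterns.
    #[test]
    fn test_mask_secrets_masks_api_key() {
        let masked = mask_secrets("api_key = \"abc123\"");
        assert!(masked.contains("[MASKED]"));
        assert!(!masked.contains("abc123"));
    }

    #[test]
    fn test_extract_domain_from_url() {
        assert_eq!(
            extract_domain_from_url("https://example.com/docs/page"),
            Some("example.com".to_string())
        );
        assert_eq!(extract_domain_from_url("not a url"), None);
    }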
}