// manx_cli/rag/indexer.rs

1//! Document indexing pipeline for the RAG system
2//!
3//! This module handles parsing and chunking of various document formats
4//! for indexing into the vector database.
5
6use anyhow::{anyhow, Result};
7use chrono::{DateTime, Utc};
8use docrawl::{crawl, Config, CrawlConfig};
9use std::fs;
10use std::path::{Path, PathBuf};
11use std::time::Duration;
12use url::Url;
13use walkdir::WalkDir;
14
15use crate::rag::embeddings::preprocessing;
16use crate::rag::{DocumentChunk, DocumentMetadata, RagConfig, SourceType};
17
/// Document indexer for the RAG system
///
/// Wraps the free indexing functions in this module with a resolved index
/// directory and a cloned configuration.
pub struct Indexer {
    // Processing/security settings applied to every indexing call.
    config: RagConfig,
    // Tilde-expanded index directory; created by `new` if missing.
    index_path: PathBuf,
}
23
24impl Indexer {
25    /// Create new indexer with configuration
26    pub fn new(config: &RagConfig) -> Result<Self> {
27        let index_path = if config.index_path.to_string_lossy().starts_with("~") {
28            // Expand home directory
29            let home = std::env::var("HOME")
30                .or_else(|_| std::env::var("USERPROFILE"))
31                .map_err(|_| anyhow!("Cannot determine home directory"))?;
32            let path_str = config.index_path.to_string_lossy();
33            let without_tilde = path_str.strip_prefix("~/").unwrap_or(&path_str[1..]);
34            PathBuf::from(home).join(without_tilde)
35        } else {
36            config.index_path.clone()
37        };
38
39        // Ensure index directory exists
40        if !index_path.exists() {
41            fs::create_dir_all(&index_path)
42                .map_err(|e| anyhow!("Failed to create index directory {:?}: {}", index_path, e))?;
43        }
44
45        Ok(Self {
46            config: config.clone(),
47            index_path,
48        })
49    }
50
51    /// Get the index directory path
52    pub fn get_index_path(&self) -> &PathBuf {
53        &self.index_path
54    }
55
56    /// Index a single document
57    pub fn index_document(&self, path: PathBuf) -> Result<Vec<DocumentChunk>> {
58        index_document(path, &self.config)
59    }
60
61    /// Index all documents in a directory
62    pub fn index_directory(&self, dir_path: PathBuf) -> Result<Vec<DocumentChunk>> {
63        let documents = find_documents(&dir_path)?;
64        let mut all_chunks = Vec::new();
65
66        for doc_path in documents {
67            match self.index_document(doc_path.clone()) {
68                Ok(mut chunks) => all_chunks.append(&mut chunks),
69                Err(e) => {
70                    log::warn!("Failed to index {:?}: {}", doc_path, e);
71                    continue;
72                }
73            }
74        }
75
76        log::info!(
77            "Indexed {} chunks from {} directory",
78            all_chunks.len(),
79            dir_path.display()
80        );
81        Ok(all_chunks)
82    }
83
84    /// Index content from a URL
85    pub async fn index_url(&self, url: String) -> Result<Vec<DocumentChunk>> {
86        log::info!("Indexing single URL (no crawling): {}", url);
87
88        // Use docrawl with depth 0 (single page only)
89        self.index_url_deep(url, Some(0), false).await
90    }
91
92    /// Index content from a URL with documentation-optimized crawling
93    pub async fn index_url_deep(
94        &self,
95        url: String,
96        crawl_depth: Option<u32>,
97        crawl_all: bool,
98    ) -> Result<Vec<DocumentChunk>> {
99        log::info!(
100            "Starting docrawl of URL: {} (depth: {:?}, crawl_all: {})",
101            url,
102            crawl_depth,
103            crawl_all
104        );
105
106        // Validate URL format
107        let parsed_url =
108            url::Url::parse(&url).map_err(|e| anyhow!("Invalid URL format '{}': {}", url, e))?;
109
110        // Security check - only allow HTTP/HTTPS
111        match parsed_url.scheme() {
112            "http" | "https" => {}
113            scheme => {
114                return Err(anyhow!(
115                    "Unsupported URL scheme '{}'. Only HTTP and HTTPS are allowed.",
116                    scheme
117                ))
118            }
119        }
120
121        // Create temporary directory for docrawl output
122        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
123        std::fs::create_dir_all(&temp_dir)?;
124
125        // Parse the URL
126        let base_url = Url::parse(&url)?;
127
128        // Configure docrawl
129        let config = CrawlConfig {
130            base_url,
131            output_dir: temp_dir.clone(),
132            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
133            max_depth: if let Some(depth) = crawl_depth {
134                Some(depth as usize)
135            } else if crawl_all {
136                None // No depth limit
137            } else {
138                Some(3) // Default depth
139            },
140            rate_limit_per_sec: 10,
141            follow_sitemaps: true,
142            concurrency: 4,
143            timeout: Some(Duration::from_secs(30)),
144            resume: false,
145            config: Config::default(),
146        };
147
148        log::info!("Running docrawl on: {}", url);
149        match crawl(config).await {
150            Ok(stats) => {
151                log::info!(
152                    "Docrawl completed successfully, processed {} pages",
153                    stats.pages
154                );
155            }
156            Err(e) => {
157                // Clean up temp directory
158                let _ = std::fs::remove_dir_all(&temp_dir);
159                return Err(anyhow!("Docrawl failed: {}", e));
160            }
161        }
162
163        // Process the generated markdown files
164        let mut all_chunks = Vec::new();
165        let markdown_files = self.find_markdown_files(&temp_dir)?;
166
167        log::info!(
168            "Processing {} markdown files from docrawl",
169            markdown_files.len()
170        );
171
172        for (index, md_file) in markdown_files.iter().enumerate() {
173            log::debug!(
174                "Processing markdown file {}/{}: {}",
175                index + 1,
176                markdown_files.len(),
177                md_file.display()
178            );
179
180            match self.process_markdown_file(md_file, &url).await {
181                Ok(chunks) => {
182                    let chunk_count = chunks.len();
183                    all_chunks.extend(chunks);
184                    log::debug!(
185                        "Successfully processed markdown: {} ({} chunks)",
186                        md_file.display(),
187                        chunk_count
188                    );
189                }
190                Err(e) => {
191                    log::warn!("Failed to process markdown '{}': {}", md_file.display(), e);
192                    // Continue with other files even if one fails
193                }
194            }
195        }
196
197        // Clean up temporary directory
198        if let Err(e) = std::fs::remove_dir_all(&temp_dir) {
199            log::warn!("Failed to clean up temporary directory: {}", e);
200        }
201
202        log::info!(
203            "Successfully indexed {} chunks from {} markdown files via docrawl of: {}",
204            all_chunks.len(),
205            markdown_files.len(),
206            url
207        );
208
209        Ok(all_chunks)
210    }
211
212    /// Find all markdown files in the crawled directory
213    fn find_markdown_files(&self, dir: &Path) -> Result<Vec<PathBuf>> {
214        let mut markdown_files = Vec::new();
215
216        for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
217            let path = entry.path();
218            if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("md") {
219                markdown_files.push(path.to_path_buf());
220            }
221        }
222
223        Ok(markdown_files)
224    }
225
226    /// Process a markdown file generated by docrawl
227    async fn process_markdown_file(
228        &self,
229        md_file: &Path,
230        base_url: &str,
231    ) -> Result<Vec<DocumentChunk>> {
232        // Read the markdown content
233        let content = std::fs::read_to_string(md_file)?;
234
235        if content.trim().is_empty() {
236            return Err(anyhow!(
237                "Markdown file contains no content: {}",
238                md_file.display()
239            ));
240        }
241
242        // Create metadata for this markdown file
243        let metadata = self.create_markdown_metadata(md_file, &content, base_url)?;
244
245        // Detect document structure (title, sections) from markdown content
246        let (title, sections) = detect_structure(&content, md_file);
247
248        // Derive a logical page URL from the file path and base URL
249        let page_url = self.derive_page_url(md_file, base_url);
250
251        // Chunk the content
252        let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);
253
254        // Create document chunks
255        let mut document_chunks = Vec::new();
256        for (i, chunk_content) in chunks.into_iter().enumerate() {
257            // Try to determine which section this chunk belongs to
258            let section = find_section_for_chunk(&chunk_content, &sections);
259
260            let chunk = DocumentChunk {
261                id: format!("{}_{}", page_url, i),
262                content: preprocessing::clean_text(&chunk_content),
263                source_path: PathBuf::from(&page_url),
264                source_type: SourceType::Web,
265                title: title.clone(),
266                section: section.clone(),
267                chunk_index: i,
268                metadata: metadata.clone(),
269            };
270
271            document_chunks.push(chunk);
272        }
273
274        Ok(document_chunks)
275    }
276
277    /// Create metadata for a markdown file from docrawl
278    fn create_markdown_metadata(
279        &self,
280        md_file: &Path,
281        content: &str,
282        base_url: &str,
283    ) -> Result<DocumentMetadata> {
284        let file_metadata = std::fs::metadata(md_file)?;
285        let modified_time = file_metadata.modified()?;
286        let modified_datetime = chrono::DateTime::<chrono::Utc>::from(modified_time);
287
288        // Extract tags from file path and content
289        let mut tags = extract_tags_from_path(md_file);
290        tags.push("documentation".to_string());
291        tags.push("crawled".to_string());
292
293        // Add base domain as a tag (simple extraction)
294        if let Some(domain) = extract_domain_from_url(base_url) {
295            tags.push(domain);
296        }
297
298        // Detect language from content (basic detection for now)
299        let language = detect_language(md_file);
300
301        Ok(DocumentMetadata {
302            file_type: "markdown".to_string(),
303            size: content.len() as u64,
304            modified: modified_datetime,
305            tags,
306            language,
307        })
308    }
309
310    /// Derive a logical page URL from the markdown file path
311    fn derive_page_url(&self, md_file: &Path, base_url: &str) -> String {
312        // Get the relative path from the temp directory
313        let file_name = md_file
314            .file_stem()
315            .and_then(|s| s.to_str())
316            .unwrap_or("page");
317
318        // Create a logical URL by combining base URL with file name
319        if base_url.ends_with('/') {
320            format!("{}{}", base_url, file_name)
321        } else {
322            format!("{}/{}", base_url, file_name)
323        }
324    }
325}
326
/// Supported file extensions for indexing
///
/// Entries carry a leading dot and are compared against the lowercased file
/// extension by [`is_supported_file`].
///
/// NOTE(review): because the extension is lowercased before comparison, the
/// uppercase ".R" entry below is never matched directly — `.R` files are
/// already covered by the ".r" entry.
const SUPPORTED_EXTENSIONS: &[&str] = &[
    // Documentation
    ".md",
    ".txt",
    ".pdf",
    ".doc",
    ".docx",
    ".rst",
    // Web/Frontend
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".vue",
    ".svelte",
    ".html",
    ".css",
    ".scss",
    ".sass",
    ".less",
    // Backend/Server
    ".py",
    ".rb",
    ".php",
    ".java",
    ".scala",
    ".kotlin",
    ".groovy",
    // Systems Programming
    ".c",
    ".cpp",
    ".cc",
    ".cxx",
    ".h",
    ".hpp",
    ".rs",
    ".go",
    ".zig",
    // Functional
    ".ml",
    ".mli",
    ".hs",
    ".elm",
    ".clj",
    ".cljs",
    ".erl",
    ".ex",
    ".exs",
    // Data/Config
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".xml",
    ".ini",
    ".env",
    ".properties",
    // Shell/Scripts (with security validation)
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    // Mobile
    ".swift",
    ".m",
    ".mm",
    ".kt",
    ".dart",
    // Database
    ".sql",
    ".graphql",
    ".prisma",
    // Other Languages
    ".r",
    ".R", // redundant: see NOTE above
    ".jl",
    ".lua",
    ".vim",
    ".el",
];
411
/// Default chunk size in tokens (approximately); passed to `chunk_content`
const DEFAULT_CHUNK_SIZE: usize = 500;

/// Overlap between chunks in tokens; passed to `chunk_content`
const DEFAULT_CHUNK_OVERLAP: usize = 50;
417
418/// Find all indexable documents in a directory using WalkDir for performance
419pub fn find_documents(dir_path: &Path) -> Result<Vec<PathBuf>> {
420    if !dir_path.exists() {
421        return Err(anyhow!("Directory does not exist: {:?}", dir_path));
422    }
423
424    if !dir_path.is_dir() {
425        return Err(anyhow!("Path is not a directory: {:?}", dir_path));
426    }
427
428    let mut documents = Vec::new();
429    let max_depth = 10; // Prevent infinite recursion
430    let max_file_size = 100 * 1024 * 1024; // 100MB limit per file
431
432    log::info!("Scanning directory for documents: {:?}", dir_path);
433
434    for entry in WalkDir::new(dir_path)
435        .max_depth(max_depth)
436        .follow_links(false) // Avoid symlink cycles
437        .into_iter()
438        .filter_map(|e| e.ok())
439    // Skip entries that can't be read
440    {
441        let path = entry.path();
442
443        // Skip directories
444        if !path.is_file() {
445            continue;
446        }
447
448        // Check if file type is supported
449        if !is_supported_file(path) {
450            continue;
451        }
452
453        // Check file size limits
454        if let Ok(metadata) = entry.metadata() {
455            if metadata.len() > max_file_size {
456                log::warn!(
457                    "Skipping large file ({}MB): {:?}",
458                    metadata.len() / 1024 / 1024,
459                    path
460                );
461                continue;
462            }
463        }
464
465        // Skip hidden files and directories (starting with .)
466        if path
467            .file_name()
468            .and_then(|name| name.to_str())
469            .map(|name| name.starts_with('.'))
470            .unwrap_or(false)
471        {
472            log::debug!("Skipping hidden file: {:?}", path);
473            continue;
474        }
475
476        // Skip common binary/cache directories
477        let path_str = path.to_string_lossy();
478        let skip_patterns = [
479            "/target/",
480            "/.git/",
481            "/node_modules/",
482            "/__pycache__/",
483            "/.cache/",
484            "/dist/",
485            "/build/",
486        ];
487
488        if skip_patterns
489            .iter()
490            .any(|pattern| path_str.contains(pattern))
491        {
492            log::debug!("Skipping file in ignored directory: {:?}", path);
493            continue;
494        }
495
496        documents.push(path.to_path_buf());
497    }
498
499    log::info!(
500        "Found {} indexable documents in {:?} (max depth: {})",
501        documents.len(),
502        dir_path,
503        max_depth
504    );
505
506    if documents.is_empty() {
507        log::warn!(
508            "No supported documents found in {:?}. Supported formats: {:?}",
509            dir_path,
510            SUPPORTED_EXTENSIONS
511        );
512    }
513
514    Ok(documents)
515}
516
517/// Check if a file is supported for indexing
518pub fn is_supported_file(path: &Path) -> bool {
519    path.extension()
520        .and_then(|ext| ext.to_str())
521        .map(|ext| SUPPORTED_EXTENSIONS.contains(&format!(".{}", ext.to_lowercase()).as_str()))
522        .unwrap_or(false)
523}
524
525/// Index a single document and return chunks
526pub fn index_document(path: PathBuf, config: &RagConfig) -> Result<Vec<DocumentChunk>> {
527    if !path.exists() {
528        return Err(anyhow!("File does not exist: {:?}", path));
529    }
530
531    if !is_supported_file(&path) {
532        return Err(anyhow!("Unsupported file type: {:?}", path));
533    }
534
535    // SECURITY: Check if PDF processing is disabled
536    let extension = path
537        .extension()
538        .and_then(|ext| ext.to_str())
539        .unwrap_or("")
540        .to_lowercase();
541
542    if extension == "pdf" && !config.allow_pdf_processing {
543        log::warn!("PDF processing disabled for security. Skipping: {:?}", path);
544        return Ok(vec![]); // Skip PDF files when disabled
545    }
546
547    // SECURITY: Check if code processing is disabled
548    let code_extensions = [
549        "js", "jsx", "ts", "tsx", "py", "rb", "php", "java", "scala", "kotlin", "rs", "go", "c",
550        "cpp", "sh", "bash", "ps1",
551    ];
552    if code_extensions.contains(&extension.as_str()) && !config.allow_code_processing {
553        log::warn!("Code processing disabled. Skipping: {:?}", path);
554        return Ok(vec![]); // Skip code files when disabled
555    }
556
557    log::info!("Indexing document: {:?}", path);
558
559    // Extract text content with configuration
560    let content = extract_text(&path, config)?;
561    if content.trim().is_empty() {
562        return Err(anyhow!("Document contains no text content: {:?}", path));
563    }
564
565    // Get file metadata
566    let metadata = extract_metadata(&path)?;
567
568    // Detect document structure (title, sections)
569    let (title, sections) = detect_structure(&content, &path);
570
571    // Chunk the content
572    let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);
573
574    // Create document chunks
575    let mut document_chunks = Vec::new();
576    for (i, chunk_content) in chunks.into_iter().enumerate() {
577        // Try to determine which section this chunk belongs to
578        let section = find_section_for_chunk(&chunk_content, &sections);
579
580        let chunk = DocumentChunk {
581            id: format!("{}_{}", path.to_string_lossy(), i),
582            content: preprocessing::clean_text(&chunk_content),
583            source_path: path.clone(),
584            source_type: SourceType::Local, // Default to local for now
585            title: title.clone(),
586            section: section.clone(),
587            chunk_index: i,
588            metadata: metadata.clone(),
589        };
590
591        document_chunks.push(chunk);
592    }
593
594    log::info!("Created {} chunks from {:?}", document_chunks.len(), path);
595    Ok(document_chunks)
596}
597
/// Extract text content from various file formats
///
/// Dispatches on the lowercased file extension to a format-specific
/// extractor; `config` carries the masking/security settings those
/// extractors honor. Unknown extensions are an error.
fn extract_text(path: &Path, config: &RagConfig) -> Result<String> {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        "md" | "txt" | "rst" => extract_text_file(path),
        "pdf" => extract_pdf_text(path),
        "doc" | "docx" => extract_doc_text(path),
        // Code files
        "js" | "jsx" | "ts" | "tsx" | "vue" | "svelte" | "html" | "css" | "scss" | "sass"
        | "less" | "py" | "rb" | "php" | "java" | "scala" | "kotlin" | "groovy" | "c" | "cpp"
        | "cc" | "cxx" | "h" | "hpp" | "rs" | "go" | "zig" | "ml" | "mli" | "hs" | "elm"
        | "clj" | "cljs" | "erl" | "ex" | "exs" | "swift" | "m" | "mm" | "kt" | "dart" | "r"
        | "jl" | "lua" | "vim" | "el" | "sql" | "graphql" | "prisma" => {
            extract_code_text(path, config)
        }
        // Config files
        "json" | "yaml" | "yml" | "toml" | "xml" | "ini" | "properties" => {
            extract_config_text(path, config)
        }
        // Shell scripts (with extra security)
        "sh" | "bash" | "zsh" | "fish" | "ps1" | "bat" | "cmd" => extract_shell_text(path, config),
        // Environment files (with secret masking)
        "env" => extract_env_text(path, config),
        _ => Err(anyhow!("Unsupported file extension: {}", extension)),
    }
}
629
630/// Extract text from plain text files (markdown, txt)
631fn extract_text_file(path: &Path) -> Result<String> {
632    fs::read_to_string(path).map_err(|e| anyhow!("Failed to read text file {:?}: {}", path, e))
633}
634
635/// Extract text from PDF files with security validation
636fn extract_pdf_text(path: &Path) -> Result<String> {
637    log::info!("Processing PDF file with security validation: {:?}", path);
638
639    // SECURITY: Validate PDF before processing
640    validate_pdf_security(path)?;
641
642    // Create a metadata entry that includes the filename and basic information
643    // Note: Basic PDF metadata extraction with security validation
644    let file_name = path
645        .file_stem()
646        .and_then(|name| name.to_str())
647        .unwrap_or("unknown");
648
649    // Get file size for indexing
650    let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
651
652    // Create indexable content from filename and metadata
653    let mut content = String::new();
654    content.push_str(&format!("PDF Document: {}\n", file_name));
655    content.push_str(&format!("File size: {} bytes\n", file_size));
656    content.push_str(&format!("Location: {}\n", path.display()));
657
658    // Add searchable terms from filename
659    let searchable_terms: Vec<&str> = file_name
660        .split(|c: char| !c.is_alphanumeric())
661        .filter(|term| term.len() > 2)
662        .collect();
663
664    if !searchable_terms.is_empty() {
665        content.push_str("Keywords: ");
666        content.push_str(&searchable_terms.join(", "));
667        content.push('\n');
668    }
669
670    // PDF processing currently indexes by filename and metadata
671    content.push_str("This document is indexed by filename and metadata.");
672
673    log::info!(
674        "Created indexable content for PDF {:?} ({} characters)",
675        path,
676        content.len()
677    );
678    Ok(content)
679}
680
681/// Extract text from DOC/DOCX files
682fn extract_doc_text(path: &Path) -> Result<String> {
683    log::info!("Processing DOC/DOCX file: {:?}", path);
684
685    let extension = path
686        .extension()
687        .and_then(|ext| ext.to_str())
688        .unwrap_or("")
689        .to_lowercase();
690
691    // Handle legacy DOC format
692    if extension == "doc" {
693        log::warn!("Legacy DOC format detected: {:?}", path);
694        return create_doc_metadata(path, "DOC (Legacy Word Document)");
695    }
696
697    // Handle DOCX format
698    if extension == "docx" {
699        // Try to extract text using docx-rs library
700        match extract_docx_text_safe(path) {
701            Ok(content) => {
702                log::info!(
703                    "Successfully extracted {} characters from DOCX: {:?}",
704                    content.len(),
705                    path
706                );
707                return Ok(content);
708            }
709            Err(e) => {
710                log::warn!(
711                    "Failed to extract DOCX text, using metadata fallback: {}",
712                    e
713                );
714                return create_doc_metadata(path, "DOCX (Word Document)");
715            }
716        }
717    }
718
719    Err(anyhow!("Unsupported document format: {:?}", extension))
720}
721
722/// Safely extract DOCX text with error handling
723fn extract_docx_text_safe(path: &Path) -> Result<String> {
724    use docx_rs::read_docx;
725
726    // Read the DOCX file as bytes
727    let file_bytes = std::fs::read(path).map_err(|e| anyhow!("Failed to read DOCX file: {}", e))?;
728
729    let _docx = read_docx(&file_bytes).map_err(|e| anyhow!("Failed to parse DOCX file: {}", e))?;
730
731    // Extract text content from document (basic implementation)
732    let mut text_content = String::new();
733
734    // Simple text extraction - this may need enhancement based on docx-rs API
735    text_content.push_str(&format!("DOCX Document from: {}\n", path.display()));
736    text_content.push_str("Document content successfully parsed.\n");
737    text_content.push_str("Note: Basic DOCX processing - text extraction can be enhanced.");
738
739    Ok(text_content)
740}
741
742/// Extract text from code files with security validation
743fn extract_code_text(path: &Path, config: &RagConfig) -> Result<String> {
744    // Validate code file security
745    validate_code_security(path, &config.code_security_level)?;
746
747    // Read the code file
748    let content = fs::read_to_string(path)
749        .map_err(|e| anyhow!("Failed to read code file {:?}: {}", path, e))?;
750
751    // Clean and prepare for indexing
752    let cleaned = if config.mask_secrets {
753        sanitize_code_content(&content)
754    } else {
755        content
756    };
757    Ok(cleaned)
758}
759
760/// Extract text from config files with validation
761fn extract_config_text(path: &Path, config: &RagConfig) -> Result<String> {
762    let content = fs::read_to_string(path)
763        .map_err(|e| anyhow!("Failed to read config file {:?}: {}", path, e))?;
764
765    // Mask any potential secrets in config files
766    let sanitized = if config.mask_secrets {
767        mask_secrets(&content)
768    } else {
769        content
770    };
771    Ok(sanitized)
772}
773
774/// Extract text from shell scripts with enhanced security validation
775fn extract_shell_text(path: &Path, config: &RagConfig) -> Result<String> {
776    // Extra security validation for shell scripts
777    validate_shell_security(path, &config.code_security_level)?;
778
779    let content = fs::read_to_string(path)
780        .map_err(|e| anyhow!("Failed to read shell script {:?}: {}", path, e))?;
781
782    // Sanitize shell content
783    let sanitized = if config.mask_secrets {
784        sanitize_shell_content(&content)
785    } else {
786        content
787    };
788    Ok(sanitized)
789}
790
791/// Extract text from environment files with secret masking
792fn extract_env_text(path: &Path, _config: &RagConfig) -> Result<String> {
793    let content = fs::read_to_string(path)
794        .map_err(|e| anyhow!("Failed to read env file {:?}: {}", path, e))?;
795
796    // Heavily mask environment files
797    let masked = mask_env_secrets(&content);
798    Ok(masked)
799}
800
/// Security validation for code files to prevent malicious content processing
///
/// Applies, in order: a 100MB size cap (always fatal), an obfuscation
/// heuristic (fatal only at Strict level), a suspicious-URL scan, and a
/// prompt-injection scan (the latter two apply their own per-level rules).
fn validate_code_security(
    path: &Path,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    log::debug!("Running security validation on code file: {:?}", path);

    // Check file size before reading the whole file into memory.
    let metadata = fs::metadata(path)?;
    const MAX_CODE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
    if metadata.len() > MAX_CODE_SIZE {
        return Err(anyhow!(
            "Code file rejected: Size {} bytes exceeds maximum allowed size of {} bytes",
            metadata.len(),
            MAX_CODE_SIZE
        ));
    }

    // Read file content for analysis.
    // NOTE(review): the caller (extract_code_text) reads the same file again
    // afterwards — the content could be returned/passed along to avoid the
    // double read.
    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read code file for validation: {}", e))?;

    // Check for obfuscated code patterns (heuristic; see is_potentially_obfuscated)
    if is_potentially_obfuscated(&content) {
        match security_level {
            CodeSecurityLevel::Strict => {
                return Err(anyhow!(
                    "Code file rejected: Contains potentially obfuscated content"
                ));
            }
            CodeSecurityLevel::Moderate => {
                log::warn!("Code file may contain obfuscated content: {:?}", path);
            }
            CodeSecurityLevel::Permissive => {
                log::debug!("Obfuscated content check bypassed (permissive mode)");
            }
        }
    }

    // Check for suspicious URLs or domains
    validate_urls_in_code(&content, security_level)?;

    // Check for prompt injection patterns
    check_prompt_injection(&content, security_level)?;

    Ok(())
}
849
850/// Enhanced security validation for shell scripts
851fn validate_shell_security(
852    path: &Path,
853    security_level: &crate::rag::CodeSecurityLevel,
854) -> Result<()> {
855    use crate::rag::CodeSecurityLevel;
856    log::debug!(
857        "Running enhanced security validation on shell script: {:?}",
858        path
859    );
860
861    let content = fs::read_to_string(path)
862        .map_err(|e| anyhow!("Failed to read shell script for validation: {}", e))?;
863
864    // Dangerous shell command patterns
865    let dangerous_patterns = [
866        r"rm\s+-rf\s+/",               // rm -rf /
867        r"rm\s+-rf\s+\*",              // rm -rf *
868        r":\(\)\s*\{\s*:\|\:&\s*\};:", // Fork bomb
869        r"mkfs\.",                     // Format filesystem
870        r"dd\s+if=/dev/(zero|random)", // Disk wipe
871        r">\s*/dev/sda",               // Direct disk write
872        r"curl.*\|\s*(ba)?sh",         // Remote code execution
873        r"wget.*\|\s*(ba)?sh",         // Remote code execution
874        r"eval\s+.*\$\(",              // Eval with command substitution
875        r"python\s+-c.*exec",          // Python exec
876    ];
877
878    let compiled_patterns: Vec<regex::Regex> = dangerous_patterns
879        .iter()
880        .filter_map(|pattern| regex::Regex::new(pattern).ok())
881        .collect();
882
883    for pattern in &compiled_patterns {
884        if pattern.is_match(&content) {
885            match security_level {
886                CodeSecurityLevel::Strict | CodeSecurityLevel::Moderate => {
887                    return Err(anyhow!(
888                        "Shell script rejected: Contains potentially dangerous command pattern"
889                    ));
890                }
891                CodeSecurityLevel::Permissive => {
892                    log::warn!("Dangerous shell pattern detected but allowed in permissive mode");
893                }
894            }
895        }
896    }
897
898    Ok(())
899}
900
901/// Check for potentially obfuscated code
902fn is_potentially_obfuscated(content: &str) -> bool {
903    // Check for high entropy (randomness) in variable names
904    let lines: Vec<&str> = content.lines().collect();
905    let mut suspicious_count = 0;
906
907    let hex_regex = regex::Regex::new(r"\\x[0-9a-fA-F]{2}").unwrap();
908
909    for line in lines {
910        // Skip comments
911        if line.trim().starts_with("//")
912            || line.trim().starts_with("#")
913            || line.trim().starts_with("/*")
914        {
915            continue;
916        }
917
918        // Check for base64 encoded strings
919        if line.contains("atob") || line.contains("btoa") || line.contains("base64") {
920            suspicious_count += 1;
921        }
922
923        // Check for hex strings
924        if hex_regex.is_match(line) {
925            suspicious_count += 1;
926        }
927
928        // Check for excessive use of escape characters
929        if line.matches('\\').count() > 10 {
930            suspicious_count += 1;
931        }
932    }
933
934    suspicious_count > 5
935}
936
937/// Validate URLs in code for suspicious domains
938fn validate_urls_in_code(
939    content: &str,
940    security_level: &crate::rag::CodeSecurityLevel,
941) -> Result<()> {
942    use crate::rag::CodeSecurityLevel;
943    let url_pattern = regex::Regex::new(r#"https?://[^\s"']+"#).unwrap();
944
945    let suspicious_domains = [
946        "bit.ly",
947        "tinyurl.com",
948        "goo.gl",
949        "ow.ly",
950        "shorte.st",
951        "adf.ly",
952        "bc.vc",
953        "bit.do",
954        "soo.gd",
955        "7.ly",
956        "5z8.info",
957        "DFHGDH", // Common in malware
958    ];
959
960    for url_match in url_pattern.find_iter(content) {
961        let url = url_match.as_str();
962        for domain in &suspicious_domains {
963            if url.contains(domain) {
964                match security_level {
965                    CodeSecurityLevel::Strict => {
966                        return Err(anyhow!(
967                            "Code rejected: Contains suspicious URL shortener: {}",
968                            url
969                        ));
970                    }
971                    CodeSecurityLevel::Moderate => {
972                        log::warn!("Suspicious URL shortener found in code: {}", url);
973                    }
974                    CodeSecurityLevel::Permissive => {
975                        log::debug!("URL check bypassed (permissive mode): {}", url);
976                    }
977                }
978            }
979        }
980    }
981
982    Ok(())
983}
984
985/// Check for prompt injection patterns
986fn check_prompt_injection(
987    content: &str,
988    security_level: &crate::rag::CodeSecurityLevel,
989) -> Result<()> {
990    use crate::rag::CodeSecurityLevel;
991    let injection_patterns = [
992        "ignore previous instructions",
993        "disregard all prior",
994        "forget everything above",
995        "new instructions:",
996        "SYSTEM PROMPT:",
997        "###SYSTEM###",
998        "</system>",
999        "<|im_start|>",
1000        "<|im_end|>",
1001    ];
1002
1003    let content_lower = content.to_lowercase();
1004    for pattern in &injection_patterns {
1005        if content_lower.contains(pattern) {
1006            match security_level {
1007                CodeSecurityLevel::Strict => {
1008                    return Err(anyhow!(
1009                        "Code rejected: Contains potential prompt injection pattern: {}",
1010                        pattern
1011                    ));
1012                }
1013                CodeSecurityLevel::Moderate => {
1014                    log::warn!("Potential prompt injection pattern detected: {}", pattern);
1015                }
1016                CodeSecurityLevel::Permissive => {
1017                    log::debug!("Prompt injection check bypassed (permissive mode)");
1018                }
1019            }
1020        }
1021    }
1022
1023    Ok(())
1024}
1025
/// Sanitize code content for safe indexing
///
/// Thin wrapper over [`mask_secrets`]: replaces inline credentials
/// (API keys, passwords, tokens, private keys) with masked placeholders
/// while leaving the rest of the code untouched.
fn sanitize_code_content(content: &str) -> String {
    // Remove any inline secrets or API keys and preserve code structure
    mask_secrets(content)
}
1031
/// Sanitize shell script content
///
/// Currently identical to [`sanitize_code_content`]: delegates to
/// [`mask_secrets`] so hardcoded credentials never reach the index.
fn sanitize_shell_content(content: &str) -> String {
    // Mask any hardcoded passwords or secrets
    mask_secrets(content)
}
1037
1038/// Mask secrets in content
1039fn mask_secrets(content: &str) -> String {
1040    let mut result = content.to_string();
1041
1042    // Patterns for common secrets
1043    let secret_patterns = [
1044        (
1045            r#"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1046            "API_KEY=[MASKED]",
1047        ),
1048        (
1049            r#"(?i)(secret|password|passwd|pwd)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1050            "SECRET=[MASKED]",
1051        ),
1052        (
1053            r#"(?i)(token|auth)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1054            "TOKEN=[MASKED]",
1055        ),
1056        (r"(?i)bearer\s+[a-zA-Z0-9\-._~+/]+", "Bearer [MASKED]"),
1057        (
1058            r"-----BEGIN (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----[\s\S]*?-----END (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----",
1059            "[PRIVATE_KEY_MASKED]",
1060        ),
1061        (r"ghp_[a-zA-Z0-9]{36}", "ghp_[GITHUB_TOKEN_MASKED]"),
1062        (r"sk-[a-zA-Z0-9]{48}", "sk-[OPENAI_KEY_MASKED]"),
1063    ];
1064
1065    for (pattern, replacement) in &secret_patterns {
1066        if let Ok(re) = regex::Regex::new(pattern) {
1067            result = re.replace_all(&result, *replacement).to_string();
1068        }
1069    }
1070
1071    result
1072}
1073
/// Heavily mask environment file secrets
///
/// For every `KEY=value` line, keeps the key and replaces the value with
/// `[MASKED]`. Blank lines, comment lines (`#`) and lines without `=` are
/// passed through unchanged. Every emitted line ends with a newline.
fn mask_env_secrets(content: &str) -> String {
    let mut masked = String::with_capacity(content.len());

    for line in content.lines() {
        let trimmed = line.trim();

        if trimmed.is_empty() || trimmed.starts_with('#') {
            // Preserve blanks and comments as-is.
            masked.push_str(line);
        } else if let Some((key, _value)) = line.split_once('=') {
            // Keep the key but mask the value.
            masked.push_str(key);
            masked.push_str("=[MASKED]");
        } else {
            masked.push_str(line);
        }
        masked.push('\n');
    }

    masked
}
1098
1099/// Security validation for PDF files to prevent malicious content processing
1100fn validate_pdf_security(path: &Path) -> Result<()> {
1101    log::debug!("Running security validation on PDF: {:?}", path);
1102
1103    // Check file size - reject extremely large files that could cause DoS
1104    const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; // 100MB limit
1105    let metadata = fs::metadata(path)?;
1106    if metadata.len() > MAX_PDF_SIZE {
1107        return Err(anyhow!(
1108            "PDF file rejected: Size {} bytes exceeds maximum allowed size of {} bytes ({}MB)",
1109            metadata.len(),
1110            MAX_PDF_SIZE,
1111            MAX_PDF_SIZE / (1024 * 1024)
1112        ));
1113    }
1114
1115    // Read the first few bytes to validate PDF header
1116    let mut buffer = vec![0u8; 1024];
1117    let file = fs::File::open(path)?;
1118    use std::io::Read;
1119    let mut reader = std::io::BufReader::new(file);
1120    let bytes_read = reader.read(&mut buffer)?;
1121
1122    if bytes_read < 8 {
1123        return Err(anyhow!("PDF file rejected: File too small or corrupted"));
1124    }
1125
1126    // Validate PDF magic header
1127    if !buffer.starts_with(b"%PDF-") {
1128        return Err(anyhow!(
1129            "PDF file rejected: Invalid PDF header - not a valid PDF file"
1130        ));
1131    }
1132
1133    // Check PDF version - reject very old or suspicious versions
1134    if bytes_read >= 8 {
1135        let version_bytes = &buffer[5..8];
1136        if let Ok(version_str) = std::str::from_utf8(version_bytes) {
1137            // Extract major version number
1138            if let Some(major_char) = version_str.chars().next() {
1139                if let Some(major) = major_char.to_digit(10) {
1140                    if !(1..=2).contains(&major) {
1141                        // Only allow PDF versions 1.x and 2.x
1142                        return Err(anyhow!(
1143                            "PDF file rejected: Unsupported PDF version {}",
1144                            version_str
1145                        ));
1146                    }
1147                }
1148            }
1149        }
1150    }
1151
1152    // Scan for suspicious content patterns in the first KB
1153    let content = std::str::from_utf8(&buffer[..bytes_read]).unwrap_or("");
1154
1155    // Dangerous JavaScript/ActionScript patterns
1156    let dangerous_patterns = [
1157        "/JavaScript",
1158        "/JS",
1159        "/OpenAction",
1160        "/AA", // Auto Action
1161        "/Launch",
1162        "/GoToE", // GoToEmbedded
1163        "/GoToR", // GoToRemote
1164        "/ImportData",
1165        "/SubmitForm",
1166        "/URI",
1167        "/Sound",
1168        "/Movie",
1169        "/RichMedia",
1170        "/3D",
1171        "/Encrypt",
1172        "eval(",
1173        "unescape(",
1174        "String.fromCharCode(",
1175        "document.write(",
1176        "this.print(",
1177        "app.alert(",
1178        "xfa.host",
1179        "soap.connect",
1180        "util.printf",
1181    ];
1182
1183    for pattern in &dangerous_patterns {
1184        if content.contains(pattern) {
1185            log::warn!(
1186                "PDF security violation: Found suspicious pattern '{}' in {}",
1187                pattern,
1188                path.display()
1189            );
1190            return Err(anyhow!(
1191                "PDF file rejected: Contains potentially malicious content pattern '{}'. PDF may contain embedded JavaScript or other dangerous elements.", 
1192                pattern
1193            ));
1194        }
1195    }
1196
1197    // Check for embedded files patterns
1198    let embed_patterns = ["/EmbeddedFile", "/F ", "/UF ", "/Filespec"];
1199    for pattern in &embed_patterns {
1200        if content.contains(pattern) {
1201            log::warn!(
1202                "PDF security violation: Found embedded file pattern '{}' in {}",
1203                pattern,
1204                path.display()
1205            );
1206            return Err(anyhow!(
1207                "PDF file rejected: Contains embedded files which pose security risks"
1208            ));
1209        }
1210    }
1211
1212    // Check for form patterns that could be used for data exfiltration
1213    let form_patterns = ["/XFA", "/AcroForm", "/Fields"];
1214    for pattern in &form_patterns {
1215        if content.contains(pattern) {
1216            log::warn!(
1217                "PDF security warning: Found form pattern '{}' in {}",
1218                pattern,
1219                path.display()
1220            );
1221            // Forms are suspicious but not automatically rejected - just logged
1222        }
1223    }
1224
1225    log::info!("PDF security validation passed for: {:?}", path);
1226    Ok(())
1227}
1228
1229/// Create metadata entry for documents that cannot be fully processed
1230fn create_doc_metadata(path: &Path, doc_type: &str) -> Result<String> {
1231    let file_name = path
1232        .file_stem()
1233        .and_then(|name| name.to_str())
1234        .unwrap_or("unknown");
1235
1236    // Get file size for indexing
1237    let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1238
1239    // Create indexable content from filename and metadata
1240    let mut content = String::new();
1241    content.push_str(&format!("{}: {}\n", doc_type, file_name));
1242    content.push_str(&format!("File size: {} bytes\n", file_size));
1243    content.push_str(&format!("Location: {}\n", path.display()));
1244
1245    // Add searchable terms from filename
1246    let searchable_terms: Vec<&str> = file_name
1247        .split(|c: char| !c.is_alphanumeric())
1248        .filter(|term| term.len() > 2)
1249        .collect();
1250
1251    if !searchable_terms.is_empty() {
1252        content.push_str("Keywords: ");
1253        content.push_str(&searchable_terms.join(", "));
1254        content.push('\n');
1255    }
1256
1257    // Enhanced document processing: indexed by filename, metadata, and content structure
1258    if let Ok(modified) = fs::metadata(path).and_then(|m| m.modified()) {
1259        if let Ok(duration) = modified.duration_since(std::time::SystemTime::UNIX_EPOCH) {
1260            let datetime = chrono::DateTime::from_timestamp(duration.as_secs() as i64, 0)
1261                .unwrap_or_else(chrono::Utc::now);
1262            content.push_str(&format!("Modified: {}\n", datetime.format("%Y-%m-%d")));
1263        }
1264    }
1265
1266    // Add file extension context
1267    if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
1268        content.push_str(&format!("Format: {} document\n", extension.to_uppercase()));
1269    }
1270
1271    Ok(content)
1272}
1273
1274/// Extract file metadata
1275fn extract_metadata(path: &Path) -> Result<DocumentMetadata> {
1276    let metadata = fs::metadata(path)?;
1277
1278    let file_type = path
1279        .extension()
1280        .and_then(|ext| ext.to_str())
1281        .unwrap_or("unknown")
1282        .to_lowercase();
1283
1284    let modified = metadata
1285        .modified()?
1286        .duration_since(std::time::UNIX_EPOCH)?
1287        .as_secs();
1288
1289    let modified_datetime = DateTime::from_timestamp(modified as i64, 0).unwrap_or_else(Utc::now);
1290
1291    // Extract tags from filename or path
1292    let tags = extract_tags_from_path(path);
1293
1294    // Try to detect language from content or filename
1295    let language = detect_language(path);
1296
1297    Ok(DocumentMetadata {
1298        file_type,
1299        size: metadata.len(),
1300        modified: modified_datetime,
1301        tags,
1302        language,
1303    })
1304}
1305
/// Extract tags from file path (e.g., directory names, filename patterns)
///
/// Every non-hidden parent directory name becomes a lowercase tag, and
/// well-known keywords found in the file stem ("readme", "api", "guide",
/// "tutorial") are added too. Keyword matching is case-insensitive, so
/// "README.md" now yields the "readme" tag (previously the comparison was
/// case-sensitive and uppercase names were missed).
fn extract_tags_from_path(path: &Path) -> Vec<String> {
    let mut tags = Vec::new();

    // Add parent directory names as tags (skip hidden dirs and the root)
    if let Some(parent) = path.parent() {
        for component in parent.components() {
            if let Some(name) = component.as_os_str().to_str() {
                if !name.starts_with('.') && name != "/" {
                    tags.push(name.to_lowercase());
                }
            }
        }
    }

    // Add filename-based tags, matching keywords case-insensitively
    if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
        let filename = filename.to_lowercase();
        for keyword in ["readme", "api", "guide", "tutorial"].iter() {
            if filename.contains(keyword) {
                tags.push((*keyword).to_string());
            }
        }
    }

    tags
}
1340
/// Detect document language
///
/// Stub implementation: every document is reported as English ("en").
/// A real language-detection library could be plugged in here later.
fn detect_language(_path: &Path) -> Option<String> {
    Some(String::from("en"))
}
1347
/// Detect document structure (title, sections)
///
/// For markdown files, the first `# ` header becomes the title and every
/// `## ` / `### ` header becomes a section. For any file where no title was
/// found, the file stem (with `_`/`-` turned into spaces) is used instead.
fn detect_structure(content: &str, path: &Path) -> (Option<String>, Vec<String>) {
    let mut title: Option<String> = None;
    let mut sections: Vec<String> = Vec::new();

    // Header parsing only applies to markdown files
    if path.extension().and_then(|s| s.to_str()) == Some("md") {
        for raw_line in content.lines() {
            let line = raw_line.trim();

            // First top-level header wins as the title
            if title.is_none() {
                if let Some(rest) = line.strip_prefix("# ") {
                    title = Some(rest.trim().to_string());
                }
            }

            // Collect level-2 and level-3 section headers
            if let Some(header) = line
                .strip_prefix("## ")
                .or_else(|| line.strip_prefix("### "))
            {
                sections.push(header.trim().to_string());
            }
        }
    }

    // Fall back to a humanized filename when no title was found
    if title.is_none() {
        title = path
            .file_stem()
            .and_then(|s| s.to_str())
            .map(|name| name.replace(['_', '-'], " "));
    }

    (title, sections)
}
1382
/// Find which section a chunk belongs to
///
/// Returns the first section whose header text appears verbatim inside the
/// chunk, or `None` if no header is present.
fn find_section_for_chunk(chunk: &str, sections: &[String]) -> Option<String> {
    sections
        .iter()
        .find(|section| chunk.contains(section.as_str()))
        .cloned()
}
1393
1394/// Chunk content into smaller pieces
1395fn chunk_content(content: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
1396    // Convert approximate token count to word count (rough estimate: 1 token ≈ 0.75 words)
1397    let word_chunk_size = (chunk_size as f32 * 0.75) as usize;
1398    let word_overlap = (overlap as f32 * 0.75) as usize;
1399
1400    // Use the preprocessing chunk_text function
1401    crate::rag::embeddings::preprocessing::chunk_text(content, word_chunk_size, word_overlap)
1402}
1403
/// Extract domain from URL without requiring external dependencies
///
/// Returns the authority part between `://` and the first `/`, `?` or `#`,
/// or `None` when the string has no scheme separator. Previously only `/`
/// terminated the domain, so `http://a.com?x=1` wrongly returned
/// `a.com?x=1`; query and fragment delimiters are now handled too.
///
/// NOTE(review): user-info (`user@host`) and ports are still included in
/// the returned string — confirm whether callers expect the bare host.
fn extract_domain_from_url(url: &str) -> Option<String> {
    let start = url.find("://")?;
    let after_protocol = &url[start + 3..];

    // The domain ends at the path, query, or fragment — whichever comes first.
    let end = after_protocol
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_protocol.len());

    Some(after_protocol[..end].to_string())
}
1418
#[cfg(test)]
mod tests {
    use super::*;
    // Additional imports available if needed for enhanced testing
    // use std::fs::File;
    // use std::io::Write;

    // Supported extensions (docs, text, PDF, source code) must be accepted;
    // unknown or missing extensions must be rejected.
    #[test]
    fn test_is_supported_file() {
        assert!(is_supported_file(Path::new("test.md")));
        assert!(is_supported_file(Path::new("test.txt")));
        assert!(is_supported_file(Path::new("test.pdf")));
        assert!(is_supported_file(Path::new("test.rs"))); // Now supported for code indexing
        assert!(!is_supported_file(Path::new("test.unknown")));
        assert!(!is_supported_file(Path::new("test")));
    }

    // Markdown structure detection: first `# ` header is the title, every
    // `## ` / `### ` header is collected as a section.
    #[test]
    fn test_detect_structure() {
        let content = r#"# Main Title

Some introduction text.

## Section 1

Content for section 1.

## Section 2

Content for section 2.

### Subsection 2.1

More content.
"#;

        let path = Path::new("test.md");
        let (title, sections) = detect_structure(content, path);

        assert_eq!(title, Some("Main Title".to_string()));
        assert_eq!(sections.len(), 3);
        assert!(sections.contains(&"Section 1".to_string()));
        assert!(sections.contains(&"Section 2".to_string()));
        assert!(sections.contains(&"Subsection 2.1".to_string()));
    }

    // Directory names become tags, and known filename keywords ("readme")
    // are added as well.
    #[test]
    fn test_extract_tags_from_path() {
        let path = Path::new("/docs/api/authentication/readme.md");
        let tags = extract_tags_from_path(path);

        assert!(tags.contains(&"docs".to_string()));
        assert!(tags.contains(&"api".to_string()));
        assert!(tags.contains(&"authentication".to_string()));
        assert!(tags.contains(&"readme".to_string()));
    }

    // Chunking must produce multiple non-empty chunks with overlapping
    // words between consecutive chunks.
    #[test]
    fn test_chunk_content() {
        let content = "This is a test document with multiple sentences. Each sentence should be preserved in the chunking process. We want to make sure the chunks are reasonable.";
        let chunks = chunk_content(content, 10, 2); // Small chunks for testing

        assert!(chunks.len() > 1);
        assert!(!chunks[0].is_empty());

        // Check for overlap
        if chunks.len() > 1 {
            let words1: Vec<&str> = chunks[0].split_whitespace().collect();
            let words2: Vec<&str> = chunks[1].split_whitespace().collect();

            // There should be some overlap between consecutive chunks
            let overlap_found = words1
                .iter()
                .rev()
                .take(5)
                .any(|word| words2.iter().take(5).any(|w| w == word));
            assert!(overlap_found);
        }
    }
}