manx_cli/rag/indexer.rs

1//! Document indexing pipeline for the RAG system
2//!
3//! This module handles parsing and chunking of various document formats
4//! for indexing into the vector database.
5
6use anyhow::{anyhow, Result};
7use chrono::{DateTime, Utc};
8use docrawl::{crawl, Config, CrawlConfig};
9use std::fs;
10use std::path::{Path, PathBuf};
11use std::time::Duration;
12use url::Url;
13use walkdir::WalkDir;
14
15use crate::rag::embeddings::preprocessing;
16use crate::rag::{DocumentChunk, DocumentMetadata, RagConfig, SourceType};
17
/// Document indexer for the RAG system.
///
/// Bundles the RAG configuration with the resolved (tilde-expanded)
/// on-disk index directory that indexed chunks are associated with.
pub struct Indexer {
    /// Cloned RAG configuration driving chunking and the security gates.
    config: RagConfig,
    /// Absolute index directory path (created by [`Indexer::new`] if missing).
    index_path: PathBuf,
}
23
24impl Indexer {
    /// Create new indexer with configuration.
    ///
    /// Expands a leading `~` in `config.index_path` using the `HOME`
    /// (or, on Windows, `USERPROFILE`) environment variable, then makes
    /// sure the index directory exists on disk.
    ///
    /// # Errors
    /// Fails when a `~` path is used but no home directory can be
    /// determined, or when the index directory cannot be created.
    pub fn new(config: &RagConfig) -> Result<Self> {
        let index_path = if config.index_path.to_string_lossy().starts_with("~") {
            // Expand home directory
            let home = std::env::var("HOME")
                .or_else(|_| std::env::var("USERPROFILE"))
                .map_err(|_| anyhow!("Cannot determine home directory"))?;
            let path_str = config.index_path.to_string_lossy();
            // Strip a "~/" prefix when present; otherwise just drop the
            // leading "~". NOTE(review): this means "~user/x" is treated
            // as "<home>/user/x" rather than another user's home —
            // confirm that is intended.
            let without_tilde = path_str.strip_prefix("~/").unwrap_or(&path_str[1..]);
            PathBuf::from(home).join(without_tilde)
        } else {
            config.index_path.clone()
        };

        // Ensure index directory exists
        if !index_path.exists() {
            fs::create_dir_all(&index_path)
                .map_err(|e| anyhow!("Failed to create index directory {:?}: {}", index_path, e))?;
        }

        Ok(Self {
            config: config.clone(),
            index_path,
        })
    }
50
    /// Get the index directory path (already tilde-expanded by [`Indexer::new`]).
    pub fn get_index_path(&self) -> &PathBuf {
        &self.index_path
    }
55
    /// Index a single document.
    ///
    /// Thin wrapper that delegates to the free [`index_document`]
    /// function using this indexer's configuration.
    pub fn index_document(&self, path: PathBuf) -> Result<Vec<DocumentChunk>> {
        index_document(path, &self.config)
    }
60
61    /// Index all documents in a directory
62    pub fn index_directory(&self, dir_path: PathBuf) -> Result<Vec<DocumentChunk>> {
63        let documents = find_documents(&dir_path)?;
64        let mut all_chunks = Vec::new();
65
66        for doc_path in documents {
67            match self.index_document(doc_path.clone()) {
68                Ok(mut chunks) => all_chunks.append(&mut chunks),
69                Err(e) => {
70                    log::warn!("Failed to index {:?}: {}", doc_path, e);
71                    continue;
72                }
73            }
74        }
75
76        log::info!(
77            "Indexed {} chunks from {} directory",
78            all_chunks.len(),
79            dir_path.display()
80        );
81        Ok(all_chunks)
82    }
83
    /// Index a single URL without invoking the crawler (depth 0 semantics).
    ///
    /// Fetches exactly one page over HTTP, converts the HTML to plain
    /// text, chunks it, and returns document chunks tagged as a
    /// single-page web source.
    ///
    /// # Errors
    /// Fails on an invalid URL, a network/build error, a non-success
    /// HTTP status, or a page with no extractable text.
    pub async fn index_single_url_no_crawl(&self, url: &str) -> Result<Vec<DocumentChunk>> {
        log::info!("Fetching single URL without crawl: {}", url);

        // Validate URL
        let _ = Url::parse(url).map_err(|e| anyhow!("Invalid URL '{}': {}", url, e))?;

        // Fetch page
        let client = reqwest::Client::builder()
            .user_agent("Manx/0.5.0 (Single Page Indexer)")
            .timeout(Duration::from_secs(30))
            .build()?;

        let resp = client.get(url).send().await?;
        if !resp.status().is_success() {
            return Err(anyhow!("Failed to fetch URL {}: {}", url, resp.status()));
        }
        let html = resp.text().await?;

        // Extract title (best-effort): prefer <title>, fall back to the first <h1>.
        let page_title = extract_html_title(&html).or_else(|| extract_h1(&html));

        // Convert HTML to plain text (markdown-like)
        let text = clean_html_to_text(&html);
        if text.trim().is_empty() {
            return Err(anyhow!("Fetched page contains no indexable text: {}", url));
        }

        // Chunk the content
        let chunks = chunk_content(&text, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

        // Build metadata shared by every chunk of this page.
        let mut tags = vec!["documentation".to_string(), "single-page".to_string()];
        if let Some(domain) = extract_domain_from_url(url) {
            tags.push(domain);
        }

        let metadata = DocumentMetadata {
            file_type: "html".to_string(),
            size: text.len() as u64,
            modified: Utc::now(),
            tags,
            // NOTE(review): language is hard-coded to English here —
            // confirm whether detection should run as it does for files.
            language: Some("en".to_string()),
        };

        // Create document chunks; ids are "<url>_<chunk index>".
        let mut document_chunks = Vec::new();
        for (i, chunk_content) in chunks.into_iter().enumerate() {
            let chunk = DocumentChunk {
                id: format!("{}_{}", url, i),
                content: preprocessing::clean_text(&chunk_content),
                source_path: PathBuf::from(url),
                source_type: SourceType::Web,
                title: page_title.clone(),
                section: None,
                chunk_index: i,
                metadata: metadata.clone(),
            };
            document_chunks.push(chunk);
        }

        log::info!(
            "Indexed {} chunks from single URL without crawl: {}",
            document_chunks.len(),
            url
        );
        Ok(document_chunks)
    }
152
    /// Index content from a URL.
    ///
    /// Convenience wrapper around [`Self::index_url_deep`] that crawls
    /// only the given page (explicit depth 0, `crawl_all` disabled).
    #[allow(dead_code)]
    pub async fn index_url(&self, url: String) -> Result<Vec<DocumentChunk>> {
        log::info!("Indexing single URL (no crawling): {}", url);
        // Use docrawl with depth 0 (single page only)
        self.index_url_deep(url, Some(0), false).await
    }
160
    /// Index content from a URL with documentation-optimized crawling.
    ///
    /// Runs docrawl against `url` (writing markdown into a fresh
    /// temporary directory), converts each produced markdown file into
    /// document chunks, then removes the temporary directory.
    ///
    /// An explicit `crawl_depth` wins over `crawl_all`; with neither
    /// set, the crawl depth defaults to 3.
    ///
    /// # Errors
    /// Fails on a malformed URL, a non-HTTP(S) scheme, or a crawler
    /// error. Individual markdown files that fail to process are logged
    /// and skipped rather than aborting the whole run.
    pub async fn index_url_deep(
        &self,
        url: String,
        crawl_depth: Option<u32>,
        crawl_all: bool,
    ) -> Result<Vec<DocumentChunk>> {
        log::info!(
            "Starting docrawl of URL: {} (depth: {:?}, crawl_all: {})",
            url,
            crawl_depth,
            crawl_all
        );

        // Validate URL format
        let parsed_url =
            url::Url::parse(&url).map_err(|e| anyhow!("Invalid URL format '{}': {}", url, e))?;

        // Security check - only allow HTTP/HTTPS
        match parsed_url.scheme() {
            "http" | "https" => {}
            scheme => {
                return Err(anyhow!(
                    "Unsupported URL scheme '{}'. Only HTTP and HTTPS are allowed.",
                    scheme
                ))
            }
        }

        // Create temporary directory for docrawl output (UUID-suffixed to
        // avoid collisions between concurrent runs).
        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&temp_dir)?;

        // Parse the URL
        let base_url = Url::parse(&url)?;

        // Configure docrawl with silenced output so Manx renders progress
        let config = CrawlConfig {
            base_url,
            output_dir: temp_dir.clone(),
            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
            // Explicit depth wins; otherwise crawl_all lifts the limit,
            // and the fallback is a default depth of 3.
            max_depth: if let Some(depth) = crawl_depth {
                Some(depth as usize)
            } else if crawl_all {
                None // No depth limit
            } else {
                Some(3) // Default depth
            },
            silence: true, // Silence docrawl; Manx renders its own progress UI
            rate_limit_per_sec: 10,
            follow_sitemaps: true,
            concurrency: 4,
            timeout: Some(Duration::from_secs(30)),
            resume: false,
            config: Config::default(),
        };

        log::info!("Running docrawl on: {}", url);
        match crawl(config).await {
            Ok(stats) => {
                log::info!(
                    "Docrawl completed successfully, processed {} pages",
                    stats.pages
                );
            }
            Err(e) => {
                // Clean up temp directory before propagating the failure
                let _ = std::fs::remove_dir_all(&temp_dir);
                return Err(anyhow!("Docrawl failed: {}", e));
            }
        }

        // Process the generated markdown files
        let mut all_chunks = Vec::new();
        let markdown_files = self.find_markdown_files(&temp_dir)?;

        log::info!(
            "Processing {} markdown files from docrawl",
            markdown_files.len()
        );

        for (index, md_file) in markdown_files.iter().enumerate() {
            log::debug!(
                "Processing markdown file {}/{}: {}",
                index + 1,
                markdown_files.len(),
                md_file.display()
            );

            match self.process_markdown_file(md_file, &url).await {
                Ok(chunks) => {
                    let chunk_count = chunks.len();
                    all_chunks.extend(chunks);
                    log::debug!(
                        "Successfully processed markdown: {} ({} chunks)",
                        md_file.display(),
                        chunk_count
                    );
                }
                Err(e) => {
                    log::warn!("Failed to process markdown '{}': {}", md_file.display(), e);
                    // Continue with other files even if one fails
                }
            }
        }

        // Clean up temporary directory (best-effort; a failure here only warns)
        if let Err(e) = std::fs::remove_dir_all(&temp_dir) {
            log::warn!("Failed to clean up temporary directory: {}", e);
        }

        log::info!(
            "Successfully indexed {} chunks from {} markdown files via docrawl of: {}",
            all_chunks.len(),
            markdown_files.len(),
            url
        );

        Ok(all_chunks)
    }
281
282    /// Find all markdown files in the crawled directory
283    fn find_markdown_files(&self, dir: &Path) -> Result<Vec<PathBuf>> {
284        let mut markdown_files = Vec::new();
285
286        for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
287            let path = entry.path();
288            if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("md") {
289                markdown_files.push(path.to_path_buf());
290            }
291        }
292
293        Ok(markdown_files)
294    }
295
    /// Process a markdown file generated by docrawl.
    ///
    /// Reads the markdown, builds per-file metadata and a logical page
    /// URL, and splits the content into overlapping chunks, each tagged
    /// with the section heading it appears to belong to.
    ///
    /// # Errors
    /// Fails when the file cannot be read, contains no content, or its
    /// filesystem metadata is unavailable.
    pub(crate) async fn process_markdown_file(
        &self,
        md_file: &Path,
        base_url: &str,
    ) -> Result<Vec<DocumentChunk>> {
        // Read the markdown content
        let content = std::fs::read_to_string(md_file)?;

        if content.trim().is_empty() {
            return Err(anyhow!(
                "Markdown file contains no content: {}",
                md_file.display()
            ));
        }

        // Create metadata for this markdown file
        let metadata = self.create_markdown_metadata(md_file, &content, base_url)?;

        // Detect document structure (title, sections) from markdown content
        let (title, sections) = detect_structure(&content, md_file);

        // Derive a logical page URL from the file path and base URL
        let page_url = self.derive_page_url(md_file, base_url);

        // Chunk the content
        let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

        // Create document chunks; ids are "<page url>_<chunk index>".
        let mut document_chunks = Vec::new();
        for (i, chunk_content) in chunks.into_iter().enumerate() {
            // Try to determine which section this chunk belongs to
            let section = find_section_for_chunk(&chunk_content, &sections);

            let chunk = DocumentChunk {
                id: format!("{}_{}", page_url, i),
                content: preprocessing::clean_text(&chunk_content),
                source_path: PathBuf::from(&page_url),
                source_type: SourceType::Web,
                title: title.clone(),
                section: section.clone(),
                chunk_index: i,
                metadata: metadata.clone(),
            };

            document_chunks.push(chunk);
        }

        Ok(document_chunks)
    }
346
347    /// Create metadata for a markdown file from docrawl
348    fn create_markdown_metadata(
349        &self,
350        md_file: &Path,
351        content: &str,
352        base_url: &str,
353    ) -> Result<DocumentMetadata> {
354        let file_metadata = std::fs::metadata(md_file)?;
355        let modified_time = file_metadata.modified()?;
356        let modified_datetime = chrono::DateTime::<chrono::Utc>::from(modified_time);
357
358        // Extract tags from file path and content
359        let mut tags = extract_tags_from_path(md_file);
360        tags.push("documentation".to_string());
361        tags.push("crawled".to_string());
362
363        // Add base domain as a tag (simple extraction)
364        if let Some(domain) = extract_domain_from_url(base_url) {
365            tags.push(domain);
366        }
367
368        // Detect language from content (basic detection for now)
369        let language = detect_language(md_file);
370
371        Ok(DocumentMetadata {
372            file_type: "markdown".to_string(),
373            size: content.len() as u64,
374            modified: modified_datetime,
375            tags,
376            language,
377        })
378    }
379
380    /// Derive a logical page URL from the markdown file path
381    fn derive_page_url(&self, md_file: &Path, base_url: &str) -> String {
382        // Get the relative path from the temp directory
383        let file_name = md_file
384            .file_stem()
385            .and_then(|s| s.to_str())
386            .unwrap_or("page");
387
388        // Create a logical URL by combining base URL with file name
389        if base_url.ends_with('/') {
390            format!("{}{}", base_url, file_name)
391        } else {
392            format!("{}/{}", base_url, file_name)
393        }
394    }
395
    /// Shallow crawl: fetch base page and first-level same-host links (depth 1).
    ///
    /// Fetches the base page, collects unique same-host links from it
    /// (treating a `www.` prefix as equivalent to the bare host),
    /// fetches those linked pages via spawned tasks, and returns chunks
    /// for every page that produced indexable text. `max_pages` caps the
    /// total number of pages considered, base page included.
    ///
    /// # Errors
    /// Fails when the base page cannot be fetched; linked pages that
    /// fail are silently skipped.
    pub async fn index_shallow_url(
        &self,
        url: &str,
        max_pages: Option<usize>,
    ) -> Result<Vec<DocumentChunk>> {
        use indicatif::{ProgressBar, ProgressStyle};
        use scraper::{Html, Selector};
        use tokio::task::JoinSet;

        let client = reqwest::Client::builder()
            .user_agent("Manx/0.5.0 (Shallow Crawler)")
            .timeout(Duration::from_secs(30))
            .build()?;

        // Fetch base page
        let resp = client.get(url).send().await?;
        if !resp.status().is_success() {
            return Err(anyhow!("Failed to fetch URL {}: {}", url, resp.status()));
        }
        // Remember the post-redirect URL so relative links resolve correctly.
        let final_url = resp.url().clone();
        let base_html = resp.text().await?;

        eprintln!("\n🌐 Shallow crawl starting: {}", url);

        // Helper closure to build chunks from a page's HTML; pages with
        // no extractable text contribute an empty Vec rather than erroring.
        let mut all_chunks: Vec<DocumentChunk> = Vec::new();
        let make_chunks = |page_url: &str, html: &str| -> Result<Vec<DocumentChunk>> {
            let page_title = extract_html_title(html).or_else(|| extract_h1(html));
            let text = clean_html_to_text(html);
            if text.trim().is_empty() {
                return Ok(vec![]);
            }
            let chunks = chunk_content(&text, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

            // Build metadata
            let mut tags = vec!["documentation".to_string(), "shallow-crawl".to_string()];
            if let Some(domain) = extract_domain_from_url(page_url) {
                tags.push(domain);
            }
            let metadata = DocumentMetadata {
                file_type: "html".to_string(),
                size: text.len() as u64,
                modified: Utc::now(),
                tags,
                language: Some("en".to_string()),
            };

            let mut docs = Vec::new();
            for (i, chunk_content) in chunks.into_iter().enumerate() {
                docs.push(DocumentChunk {
                    id: format!("{}_{}", page_url, i),
                    content: preprocessing::clean_text(&chunk_content),
                    source_path: PathBuf::from(page_url),
                    source_type: SourceType::Web,
                    title: page_title.clone(),
                    section: None,
                    chunk_index: i,
                    metadata: metadata.clone(),
                });
            }
            Ok(docs)
        };

        // Include base page
        eprintln!("🔎 Fetching base page: {}", final_url);
        all_chunks.extend(make_chunks(final_url.as_str(), &base_html)?);

        // Parse links from base page
        let doc = Html::parse_document(&base_html);
        let a_sel = Selector::parse("a[href]").unwrap();
        let base_host = final_url.host_str().unwrap_or("");
        // Normalize hosts so "www.example.com" and "example.com" compare equal.
        let norm = |h: &str| h.strip_prefix("www.").unwrap_or(h).to_string();
        let base_norm = norm(base_host);

        use std::collections::HashSet;
        let mut seen: HashSet<String> = HashSet::new();
        seen.insert(final_url.as_str().to_string());

        let page_cap = max_pages.unwrap_or(usize::MAX);
        let mut targets: Vec<url::Url> = Vec::new();
        for a in doc.select(&a_sel) {
            if let Some(href) = a.value().attr("href") {
                // Resolve relative links
                if let Ok(abs) = final_url.join(href) {
                    // Same-host (treat www. as equivalent)
                    if let Some(h) = abs.host_str() {
                        if norm(h) != base_norm {
                            continue;
                        }
                    } else {
                        continue;
                    }
                    // Deduplicate
                    let s = abs.as_str().to_string();
                    if !seen.insert(s.clone()) {
                        continue;
                    }
                    targets.push(abs);
                    // `seen` includes the base page, so the cap bounds the
                    // total pages fetched, not just the discovered links.
                    if seen.len() >= page_cap {
                        break;
                    }
                }
            }
        }

        eprintln!("🔗 Found {} same-host links", targets.len());

        // Fetch first-level pages with small concurrency
        let pb = if !targets.is_empty() {
            let pb = ProgressBar::new(targets.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} pages ({percent}%) | {msg}")
                    .unwrap()
                    .progress_chars("█▉▊▋▌▍▎▏  "),
            );
            pb.set_message("Fetching pages...");
            Some(pb)
        } else {
            None
        };
        let mut set = JoinSet::new();
        let client2 = client.clone();
        for t in targets.into_iter() {
            let client3 = client2.clone();
            // Each task yields Some((url, html)) on success, None on any failure.
            set.spawn(async move {
                let r = client3.get(t.clone()).send().await.ok()?;
                if !r.status().is_success() {
                    return None;
                }
                let html = r.text().await.ok()?;
                Some((t.to_string(), html))
            });
        }

        // Drain completed fetches, chunking successes and counting every
        // completion (success or not) toward the progress bar.
        let mut fetched = 0usize;
        while let Some(res) = set.join_next().await {
            if let Ok(Some((page_url, html))) = res {
                if let Ok(mut chunks) = make_chunks(&page_url, &html) {
                    all_chunks.append(&mut chunks);
                }
            }
            fetched += 1;
            if let Some(pb) = &pb {
                pb.set_position(fetched as u64);
            }
        }

        if let Some(pb) = pb {
            pb.finish_with_message("✓ Shallow crawl completed");
        }

        Ok(all_chunks)
    }
551}
552
/// Supported file extensions for indexing.
///
/// [`is_supported_file`] lowercases the actual extension before matching,
/// so entries should be lowercase. NOTE(review): the ".R" entry below is
/// therefore redundant with ".r" — confirm before removing.
const SUPPORTED_EXTENSIONS: &[&str] = &[
    // Documentation
    ".md",
    ".txt",
    ".pdf",
    ".doc",
    ".docx",
    ".rst",
    // Web/Frontend
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".vue",
    ".svelte",
    ".html",
    ".css",
    ".scss",
    ".sass",
    ".less",
    // Backend/Server
    ".py",
    ".rb",
    ".php",
    ".java",
    ".scala",
    ".kotlin",
    ".groovy",
    // Systems Programming
    ".c",
    ".cpp",
    ".cc",
    ".cxx",
    ".h",
    ".hpp",
    ".rs",
    ".go",
    ".zig",
    // Functional
    ".ml",
    ".mli",
    ".hs",
    ".elm",
    ".clj",
    ".cljs",
    ".erl",
    ".ex",
    ".exs",
    // Data/Config
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".xml",
    ".ini",
    ".env",
    ".properties",
    // Shell/Scripts (with security validation)
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    // Mobile
    ".swift",
    ".m",
    ".mm",
    ".kt",
    ".dart",
    // Database
    ".sql",
    ".graphql",
    ".prisma",
    // Other Languages
    ".r",
    ".R",
    ".jl",
    ".lua",
    ".vim",
    ".el",
];

/// Default chunk size in tokens (approximately).
const DEFAULT_CHUNK_SIZE: usize = 500;

/// Overlap between chunks in tokens; consecutive chunks share this much
/// context so sentences split at a boundary remain searchable.
const DEFAULT_CHUNK_OVERLAP: usize = 50;
643
644/// Find all indexable documents in a directory using WalkDir for performance
645pub fn find_documents(dir_path: &Path) -> Result<Vec<PathBuf>> {
646    if !dir_path.exists() {
647        return Err(anyhow!("Directory does not exist: {:?}", dir_path));
648    }
649
650    if !dir_path.is_dir() {
651        return Err(anyhow!("Path is not a directory: {:?}", dir_path));
652    }
653
654    let mut documents = Vec::new();
655    let max_depth = 10; // Prevent infinite recursion
656    let max_file_size = 100 * 1024 * 1024; // 100MB limit per file
657
658    log::info!("Scanning directory for documents: {:?}", dir_path);
659
660    for entry in WalkDir::new(dir_path)
661        .max_depth(max_depth)
662        .follow_links(false) // Avoid symlink cycles
663        .into_iter()
664        .filter_map(|e| e.ok())
665    // Skip entries that can't be read
666    {
667        let path = entry.path();
668
669        // Skip directories
670        if !path.is_file() {
671            continue;
672        }
673
674        // Check if file type is supported
675        if !is_supported_file(path) {
676            continue;
677        }
678
679        // Check file size limits
680        if let Ok(metadata) = entry.metadata() {
681            if metadata.len() > max_file_size {
682                log::warn!(
683                    "Skipping large file ({}MB): {:?}",
684                    metadata.len() / 1024 / 1024,
685                    path
686                );
687                continue;
688            }
689        }
690
691        // Skip hidden files and directories (starting with .)
692        if path
693            .file_name()
694            .and_then(|name| name.to_str())
695            .map(|name| name.starts_with('.'))
696            .unwrap_or(false)
697        {
698            log::debug!("Skipping hidden file: {:?}", path);
699            continue;
700        }
701
702        // Skip common binary/cache directories
703        let path_str = path.to_string_lossy();
704        let skip_patterns = [
705            "/target/",
706            "/.git/",
707            "/node_modules/",
708            "/__pycache__/",
709            "/.cache/",
710            "/dist/",
711            "/build/",
712        ];
713
714        if skip_patterns
715            .iter()
716            .any(|pattern| path_str.contains(pattern))
717        {
718            log::debug!("Skipping file in ignored directory: {:?}", path);
719            continue;
720        }
721
722        documents.push(path.to_path_buf());
723    }
724
725    log::info!(
726        "Found {} indexable documents in {:?} (max depth: {})",
727        documents.len(),
728        dir_path,
729        max_depth
730    );
731
732    if documents.is_empty() {
733        log::warn!(
734            "No supported documents found in {:?}. Supported formats: {:?}",
735            dir_path,
736            SUPPORTED_EXTENSIONS
737        );
738    }
739
740    Ok(documents)
741}
742
743/// Check if a file is supported for indexing
744pub fn is_supported_file(path: &Path) -> bool {
745    path.extension()
746        .and_then(|ext| ext.to_str())
747        .map(|ext| SUPPORTED_EXTENSIONS.contains(&format!(".{}", ext.to_lowercase()).as_str()))
748        .unwrap_or(false)
749}
750
/// Index a single document and return chunks.
///
/// Validates the path and file type, applies the configured security
/// gates (PDF and code processing can be disabled), extracts text,
/// detects structure, and splits the content into overlapping chunks.
/// When a security gate skips the file, an empty `Vec` is returned
/// rather than an error.
///
/// # Errors
/// Fails when the file does not exist, its type is unsupported, or it
/// yields no text content.
pub fn index_document(path: PathBuf, config: &RagConfig) -> Result<Vec<DocumentChunk>> {
    if !path.exists() {
        return Err(anyhow!("File does not exist: {:?}", path));
    }

    if !is_supported_file(&path) {
        return Err(anyhow!("Unsupported file type: {:?}", path));
    }

    // SECURITY: Check if PDF processing is disabled
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    if extension == "pdf" && !config.allow_pdf_processing {
        log::warn!("PDF processing disabled for security. Skipping: {:?}", path);
        return Ok(vec![]); // Skip PDF files when disabled
    }

    // SECURITY: Check if code processing is disabled
    // NOTE(review): this list contains "kotlin", but Kotlin files carry
    // the "kt" extension (lowercased above), so .kt files are NOT gated
    // here even though extract_text treats them as code — confirm intent.
    let code_extensions = [
        "js", "jsx", "ts", "tsx", "py", "rb", "php", "java", "scala", "kotlin", "rs", "go", "c",
        "cpp", "sh", "bash", "ps1",
    ];
    if code_extensions.contains(&extension.as_str()) && !config.allow_code_processing {
        log::warn!("Code processing disabled. Skipping: {:?}", path);
        return Ok(vec![]); // Skip code files when disabled
    }

    log::info!("Indexing document: {:?}", path);

    // Extract text content with configuration
    let content = extract_text(&path, config)?;
    if content.trim().is_empty() {
        return Err(anyhow!("Document contains no text content: {:?}", path));
    }

    // Get file metadata
    let metadata = extract_metadata(&path)?;

    // Detect document structure (title, sections)
    let (title, sections) = detect_structure(&content, &path);

    // Chunk the content
    let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

    // Create document chunks; ids are "<path>_<chunk index>".
    let mut document_chunks = Vec::new();
    for (i, chunk_content) in chunks.into_iter().enumerate() {
        // Try to determine which section this chunk belongs to
        let section = find_section_for_chunk(&chunk_content, &sections);

        let chunk = DocumentChunk {
            id: format!("{}_{}", path.to_string_lossy(), i),
            content: preprocessing::clean_text(&chunk_content),
            source_path: path.clone(),
            source_type: SourceType::Local, // Default to local for now
            title: title.clone(),
            section: section.clone(),
            chunk_index: i,
            metadata: metadata.clone(),
        };

        document_chunks.push(chunk);
    }

    log::info!("Created {} chunks from {:?}", document_chunks.len(), path);
    Ok(document_chunks)
}
823
/// Extract text content from various file formats.
///
/// Dispatches on the lowercased file extension to a format-specific
/// extractor. Code, config, shell, and env files go through extractors
/// that receive `config` so its security-related flags can apply.
///
/// # Errors
/// Fails for extensions with no extractor, or when the chosen extractor
/// itself fails.
fn extract_text(path: &Path, config: &RagConfig) -> Result<String> {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        // Plain-text documentation formats
        "md" | "txt" | "rst" => extract_text_file(path),
        "pdf" => extract_pdf_text(path),
        "doc" | "docx" => extract_doc_text(path),
        // Code files
        "js" | "jsx" | "ts" | "tsx" | "vue" | "svelte" | "html" | "css" | "scss" | "sass"
        | "less" | "py" | "rb" | "php" | "java" | "scala" | "kotlin" | "groovy" | "c" | "cpp"
        | "cc" | "cxx" | "h" | "hpp" | "rs" | "go" | "zig" | "ml" | "mli" | "hs" | "elm"
        | "clj" | "cljs" | "erl" | "ex" | "exs" | "swift" | "m" | "mm" | "kt" | "dart" | "r"
        | "jl" | "lua" | "vim" | "el" | "sql" | "graphql" | "prisma" => {
            extract_code_text(path, config)
        }
        // Config files
        "json" | "yaml" | "yml" | "toml" | "xml" | "ini" | "properties" => {
            extract_config_text(path, config)
        }
        // Shell scripts (with extra security)
        "sh" | "bash" | "zsh" | "fish" | "ps1" | "bat" | "cmd" => extract_shell_text(path, config),
        // Environment files (with secret masking)
        // NOTE(review): a file named exactly ".env" has no extension per
        // `Path::extension`, so only names like "prod.env" reach this arm.
        "env" => extract_env_text(path, config),
        _ => Err(anyhow!("Unsupported file extension: {}", extension)),
    }
}
855
/// Extract text from plain text files (markdown, txt).
///
/// # Errors
/// Fails when the file cannot be read as UTF-8 text.
fn extract_text_file(path: &Path) -> Result<String> {
    fs::read_to_string(path).map_err(|e| anyhow!("Failed to read text file {:?}: {}", path, e))
}
860
861/// Extract text from PDF files with security validation
862fn extract_pdf_text(path: &Path) -> Result<String> {
863    log::info!("Processing PDF file with security validation: {:?}", path);
864
865    // SECURITY: Validate PDF before processing
866    validate_pdf_security(path)?;
867
868    // Create a metadata entry that includes the filename and basic information
869    // Note: Basic PDF metadata extraction with security validation
870    let file_name = path
871        .file_stem()
872        .and_then(|name| name.to_str())
873        .unwrap_or("unknown");
874
875    // Get file size for indexing
876    let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
877
878    // Create indexable content from filename and metadata
879    let mut content = String::new();
880    content.push_str(&format!("PDF Document: {}\n", file_name));
881    content.push_str(&format!("File size: {} bytes\n", file_size));
882    content.push_str(&format!("Location: {}\n", path.display()));
883
884    // Add searchable terms from filename
885    let searchable_terms: Vec<&str> = file_name
886        .split(|c: char| !c.is_alphanumeric())
887        .filter(|term| term.len() > 2)
888        .collect();
889
890    if !searchable_terms.is_empty() {
891        content.push_str("Keywords: ");
892        content.push_str(&searchable_terms.join(", "));
893        content.push('\n');
894    }
895
896    // PDF processing currently indexes by filename and metadata
897    content.push_str("This document is indexed by filename and metadata.");
898
899    log::info!(
900        "Created indexable content for PDF {:?} ({} characters)",
901        path,
902        content.len()
903    );
904    Ok(content)
905}
906
907/// Extract text from DOC/DOCX files
908fn extract_doc_text(path: &Path) -> Result<String> {
909    log::info!("Processing DOC/DOCX file: {:?}", path);
910
911    let extension = path
912        .extension()
913        .and_then(|ext| ext.to_str())
914        .unwrap_or("")
915        .to_lowercase();
916
917    // Handle legacy DOC format
918    if extension == "doc" {
919        log::warn!("Legacy DOC format detected: {:?}", path);
920        return create_doc_metadata(path, "DOC (Legacy Word Document)");
921    }
922
923    // Handle DOCX format
924    if extension == "docx" {
925        // Try to extract text using docx-rs library
926        match extract_docx_text_safe(path) {
927            Ok(content) => {
928                log::info!(
929                    "Successfully extracted {} characters from DOCX: {:?}",
930                    content.len(),
931                    path
932                );
933                return Ok(content);
934            }
935            Err(e) => {
936                log::warn!(
937                    "Failed to extract DOCX text, using metadata fallback: {}",
938                    e
939                );
940                return create_doc_metadata(path, "DOCX (Word Document)");
941            }
942        }
943    }
944
945    Err(anyhow!("Unsupported document format: {:?}", extension))
946}
947
948/// Safely extract DOCX text with error handling
949fn extract_docx_text_safe(path: &Path) -> Result<String> {
950    use docx_rs::read_docx;
951
952    // Read the DOCX file as bytes
953    let file_bytes = std::fs::read(path).map_err(|e| anyhow!("Failed to read DOCX file: {}", e))?;
954
955    let _docx = read_docx(&file_bytes).map_err(|e| anyhow!("Failed to parse DOCX file: {}", e))?;
956
957    // Extract text content from document (basic implementation)
958    let mut text_content = String::new();
959
960    // Simple text extraction - this may need enhancement based on docx-rs API
961    text_content.push_str(&format!("DOCX Document from: {}\n", path.display()));
962    text_content.push_str("Document content successfully parsed.\n");
963    text_content.push_str("Note: Basic DOCX processing - text extraction can be enhanced.");
964
965    Ok(text_content)
966}
967
968/// Extract text from code files with security validation
969fn extract_code_text(path: &Path, config: &RagConfig) -> Result<String> {
970    // Validate code file security
971    validate_code_security(path, &config.code_security_level)?;
972
973    // Read the code file
974    let content = fs::read_to_string(path)
975        .map_err(|e| anyhow!("Failed to read code file {:?}: {}", path, e))?;
976
977    // Clean and prepare for indexing
978    let cleaned = if config.mask_secrets {
979        sanitize_code_content(&content)
980    } else {
981        content
982    };
983    Ok(cleaned)
984}
985
986/// Extract text from config files with validation
987fn extract_config_text(path: &Path, config: &RagConfig) -> Result<String> {
988    let content = fs::read_to_string(path)
989        .map_err(|e| anyhow!("Failed to read config file {:?}: {}", path, e))?;
990
991    // Mask any potential secrets in config files
992    let sanitized = if config.mask_secrets {
993        mask_secrets(&content)
994    } else {
995        content
996    };
997    Ok(sanitized)
998}
999
1000/// Extract text from shell scripts with enhanced security validation
1001fn extract_shell_text(path: &Path, config: &RagConfig) -> Result<String> {
1002    // Extra security validation for shell scripts
1003    validate_shell_security(path, &config.code_security_level)?;
1004
1005    let content = fs::read_to_string(path)
1006        .map_err(|e| anyhow!("Failed to read shell script {:?}: {}", path, e))?;
1007
1008    // Sanitize shell content
1009    let sanitized = if config.mask_secrets {
1010        sanitize_shell_content(&content)
1011    } else {
1012        content
1013    };
1014    Ok(sanitized)
1015}
1016
1017/// Extract text from environment files with secret masking
1018fn extract_env_text(path: &Path, _config: &RagConfig) -> Result<String> {
1019    let content = fs::read_to_string(path)
1020        .map_err(|e| anyhow!("Failed to read env file {:?}: {}", path, e))?;
1021
1022    // Heavily mask environment files
1023    let masked = mask_env_secrets(&content);
1024    Ok(masked)
1025}
1026
1027/// Security validation for code files to prevent malicious content processing
1028fn validate_code_security(
1029    path: &Path,
1030    security_level: &crate::rag::CodeSecurityLevel,
1031) -> Result<()> {
1032    use crate::rag::CodeSecurityLevel;
1033    log::debug!("Running security validation on code file: {:?}", path);
1034
1035    // Check file size
1036    let metadata = fs::metadata(path)?;
1037    const MAX_CODE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
1038    if metadata.len() > MAX_CODE_SIZE {
1039        return Err(anyhow!(
1040            "Code file rejected: Size {} bytes exceeds maximum allowed size of {} bytes",
1041            metadata.len(),
1042            MAX_CODE_SIZE
1043        ));
1044    }
1045
1046    // Read file content for analysis
1047    let content = fs::read_to_string(path)
1048        .map_err(|e| anyhow!("Failed to read code file for validation: {}", e))?;
1049
1050    // Check for obfuscated code patterns
1051    if is_potentially_obfuscated(&content) {
1052        match security_level {
1053            CodeSecurityLevel::Strict => {
1054                return Err(anyhow!(
1055                    "Code file rejected: Contains potentially obfuscated content"
1056                ));
1057            }
1058            CodeSecurityLevel::Moderate => {
1059                log::warn!("Code file may contain obfuscated content: {:?}", path);
1060            }
1061            CodeSecurityLevel::Permissive => {
1062                log::debug!("Obfuscated content check bypassed (permissive mode)");
1063            }
1064        }
1065    }
1066
1067    // Check for suspicious URLs or domains
1068    validate_urls_in_code(&content, security_level)?;
1069
1070    // Check for prompt injection patterns
1071    check_prompt_injection(&content, security_level)?;
1072
1073    Ok(())
1074}
1075
1076/// Enhanced security validation for shell scripts
1077fn validate_shell_security(
1078    path: &Path,
1079    security_level: &crate::rag::CodeSecurityLevel,
1080) -> Result<()> {
1081    use crate::rag::CodeSecurityLevel;
1082    log::debug!(
1083        "Running enhanced security validation on shell script: {:?}",
1084        path
1085    );
1086
1087    let content = fs::read_to_string(path)
1088        .map_err(|e| anyhow!("Failed to read shell script for validation: {}", e))?;
1089
1090    // Dangerous shell command patterns
1091    let dangerous_patterns = [
1092        r"rm\s+-rf\s+/",               // rm -rf /
1093        r"rm\s+-rf\s+\*",              // rm -rf *
1094        r":\(\)\s*\{\s*:\|\:&\s*\};:", // Fork bomb
1095        r"mkfs\.",                     // Format filesystem
1096        r"dd\s+if=/dev/(zero|random)", // Disk wipe
1097        r">\s*/dev/sda",               // Direct disk write
1098        r"curl.*\|\s*(ba)?sh",         // Remote code execution
1099        r"wget.*\|\s*(ba)?sh",         // Remote code execution
1100        r"eval\s+.*\$\(",              // Eval with command substitution
1101        r"python\s+-c.*exec",          // Python exec
1102    ];
1103
1104    let compiled_patterns: Vec<regex::Regex> = dangerous_patterns
1105        .iter()
1106        .filter_map(|pattern| regex::Regex::new(pattern).ok())
1107        .collect();
1108
1109    for pattern in &compiled_patterns {
1110        if pattern.is_match(&content) {
1111            match security_level {
1112                CodeSecurityLevel::Strict | CodeSecurityLevel::Moderate => {
1113                    return Err(anyhow!(
1114                        "Shell script rejected: Contains potentially dangerous command pattern"
1115                    ));
1116                }
1117                CodeSecurityLevel::Permissive => {
1118                    log::warn!("Dangerous shell pattern detected but allowed in permissive mode");
1119                }
1120            }
1121        }
1122    }
1123
1124    Ok(())
1125}
1126
1127/// Check for potentially obfuscated code
1128fn is_potentially_obfuscated(content: &str) -> bool {
1129    // Check for high entropy (randomness) in variable names
1130    let lines: Vec<&str> = content.lines().collect();
1131    let mut suspicious_count = 0;
1132
1133    let hex_regex = regex::Regex::new(r"\\x[0-9a-fA-F]{2}").unwrap();
1134
1135    for line in lines {
1136        // Skip comments
1137        if line.trim().starts_with("//")
1138            || line.trim().starts_with("#")
1139            || line.trim().starts_with("/*")
1140        {
1141            continue;
1142        }
1143
1144        // Check for base64 encoded strings
1145        if line.contains("atob") || line.contains("btoa") || line.contains("base64") {
1146            suspicious_count += 1;
1147        }
1148
1149        // Check for hex strings
1150        if hex_regex.is_match(line) {
1151            suspicious_count += 1;
1152        }
1153
1154        // Check for excessive use of escape characters
1155        if line.matches('\\').count() > 10 {
1156            suspicious_count += 1;
1157        }
1158    }
1159
1160    suspicious_count > 5
1161}
1162
1163/// Validate URLs in code for suspicious domains
1164fn validate_urls_in_code(
1165    content: &str,
1166    security_level: &crate::rag::CodeSecurityLevel,
1167) -> Result<()> {
1168    use crate::rag::CodeSecurityLevel;
1169    let url_pattern = regex::Regex::new(r#"https?://[^\s"']+"#).unwrap();
1170
1171    let suspicious_domains = [
1172        "bit.ly",
1173        "tinyurl.com",
1174        "goo.gl",
1175        "ow.ly",
1176        "shorte.st",
1177        "adf.ly",
1178        "bc.vc",
1179        "bit.do",
1180        "soo.gd",
1181        "7.ly",
1182        "5z8.info",
1183        "DFHGDH", // Common in malware
1184    ];
1185
1186    for url_match in url_pattern.find_iter(content) {
1187        let url = url_match.as_str();
1188        for domain in &suspicious_domains {
1189            if url.contains(domain) {
1190                match security_level {
1191                    CodeSecurityLevel::Strict => {
1192                        return Err(anyhow!(
1193                            "Code rejected: Contains suspicious URL shortener: {}",
1194                            url
1195                        ));
1196                    }
1197                    CodeSecurityLevel::Moderate => {
1198                        log::warn!("Suspicious URL shortener found in code: {}", url);
1199                    }
1200                    CodeSecurityLevel::Permissive => {
1201                        log::debug!("URL check bypassed (permissive mode): {}", url);
1202                    }
1203                }
1204            }
1205        }
1206    }
1207
1208    Ok(())
1209}
1210
1211/// Check for prompt injection patterns
1212fn check_prompt_injection(
1213    content: &str,
1214    security_level: &crate::rag::CodeSecurityLevel,
1215) -> Result<()> {
1216    use crate::rag::CodeSecurityLevel;
1217    let injection_patterns = [
1218        "ignore previous instructions",
1219        "disregard all prior",
1220        "forget everything above",
1221        "new instructions:",
1222        "SYSTEM PROMPT:",
1223        "###SYSTEM###",
1224        "</system>",
1225        "<|im_start|>",
1226        "<|im_end|>",
1227    ];
1228
1229    let content_lower = content.to_lowercase();
1230    for pattern in &injection_patterns {
1231        if content_lower.contains(pattern) {
1232            match security_level {
1233                CodeSecurityLevel::Strict => {
1234                    return Err(anyhow!(
1235                        "Code rejected: Contains potential prompt injection pattern: {}",
1236                        pattern
1237                    ));
1238                }
1239                CodeSecurityLevel::Moderate => {
1240                    log::warn!("Potential prompt injection pattern detected: {}", pattern);
1241                }
1242                CodeSecurityLevel::Permissive => {
1243                    log::debug!("Prompt injection check bypassed (permissive mode)");
1244                }
1245            }
1246        }
1247    }
1248
1249    Ok(())
1250}
1251
1252/// Sanitize code content for safe indexing
1253fn sanitize_code_content(content: &str) -> String {
1254    // Remove any inline secrets or API keys and preserve code structure
1255    mask_secrets(content)
1256}
1257
1258/// Sanitize shell script content
1259fn sanitize_shell_content(content: &str) -> String {
1260    // Mask any hardcoded passwords or secrets
1261    mask_secrets(content)
1262}
1263
1264/// Mask secrets in content
1265fn mask_secrets(content: &str) -> String {
1266    let mut result = content.to_string();
1267
1268    // Patterns for common secrets
1269    let secret_patterns = [
1270        (
1271            r#"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1272            "API_KEY=[MASKED]",
1273        ),
1274        (
1275            r#"(?i)(secret|password|passwd|pwd)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1276            "SECRET=[MASKED]",
1277        ),
1278        (
1279            r#"(?i)(token|auth)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1280            "TOKEN=[MASKED]",
1281        ),
1282        (r"(?i)bearer\s+[a-zA-Z0-9\-._~+/]+", "Bearer [MASKED]"),
1283        (
1284            r"-----BEGIN (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----[\s\S]*?-----END (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----",
1285            "[PRIVATE_KEY_MASKED]",
1286        ),
1287        (r"ghp_[a-zA-Z0-9]{36}", "ghp_[GITHUB_TOKEN_MASKED]"),
1288        (r"sk-[a-zA-Z0-9]{48}", "sk-[OPENAI_KEY_MASKED]"),
1289    ];
1290
1291    for (pattern, replacement) in &secret_patterns {
1292        if let Ok(re) = regex::Regex::new(pattern) {
1293            result = re.replace_all(&result, *replacement).to_string();
1294        }
1295    }
1296
1297    result
1298}
1299
/// Heavily mask environment file secrets
///
/// Keeps comments and blank lines verbatim; for `KEY=value` lines the key
/// is preserved and the value replaced with `[MASKED]`. Lines without an
/// `=` pass through unchanged. Output always ends each line with '\n'.
fn mask_env_secrets(content: &str) -> String {
    let mut masked = String::new();

    for line in content.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            // Comments and blank lines are not secrets.
            masked.push_str(line);
        } else if let Some((key, _value)) = line.split_once('=') {
            // Keep the variable name, hide its value.
            masked.push_str(key);
            masked.push_str("=[MASKED]");
        } else {
            masked.push_str(line);
        }
        masked.push('\n');
    }

    masked
}
1324
/// Security validation for PDF files to prevent malicious content processing
///
/// Performs, in order: a 100MB size cap, a `%PDF-` magic-header check, a
/// version gate (only 1.x / 2.x accepted), and substring scans of the first
/// KB for JavaScript/action, embedded-file, and form patterns. Scans cover
/// only the first read of up to 1024 bytes, so patterns deeper in the file
/// are not detected — this is a fast screen, not a full parse.
fn validate_pdf_security(path: &Path) -> Result<()> {
    log::debug!("Running security validation on PDF: {:?}", path);

    // Check file size - reject extremely large files that could cause DoS
    const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; // 100MB limit
    let metadata = fs::metadata(path)?;
    if metadata.len() > MAX_PDF_SIZE {
        return Err(anyhow!(
            "PDF file rejected: Size {} bytes exceeds maximum allowed size of {} bytes ({}MB)",
            metadata.len(),
            MAX_PDF_SIZE,
            MAX_PDF_SIZE / (1024 * 1024)
        ));
    }

    // Read the first few bytes to validate PDF header
    // NOTE(review): a single read() may legitimately return fewer than 1024
    // bytes even when more are available — TODO confirm that is acceptable
    // for the pattern scan below.
    let mut buffer = vec![0u8; 1024];
    let file = fs::File::open(path)?;
    use std::io::Read;
    let mut reader = std::io::BufReader::new(file);
    let bytes_read = reader.read(&mut buffer)?;

    if bytes_read < 8 {
        return Err(anyhow!("PDF file rejected: File too small or corrupted"));
    }

    // Validate PDF magic header
    if !buffer.starts_with(b"%PDF-") {
        return Err(anyhow!(
            "PDF file rejected: Invalid PDF header - not a valid PDF file"
        ));
    }

    // Check PDF version - reject very old or suspicious versions
    // (bytes 5..8 hold the "N.N" version string after "%PDF-").
    if bytes_read >= 8 {
        let version_bytes = &buffer[5..8];
        if let Ok(version_str) = std::str::from_utf8(version_bytes) {
            // Extract major version number
            if let Some(major_char) = version_str.chars().next() {
                if let Some(major) = major_char.to_digit(10) {
                    if !(1..=2).contains(&major) {
                        // Only allow PDF versions 1.x and 2.x
                        return Err(anyhow!(
                            "PDF file rejected: Unsupported PDF version {}",
                            version_str
                        ));
                    }
                }
            }
        }
    }

    // Scan for suspicious content patterns in the first KB
    // (non-UTF8 prefixes scan as "" and therefore match nothing).
    let content = std::str::from_utf8(&buffer[..bytes_read]).unwrap_or("");

    // Dangerous JavaScript/ActionScript patterns
    let dangerous_patterns = [
        "/JavaScript",
        "/JS",
        "/OpenAction",
        "/AA", // Auto Action
        "/Launch",
        "/GoToE", // GoToEmbedded
        "/GoToR", // GoToRemote
        "/ImportData",
        "/SubmitForm",
        "/URI",
        "/Sound",
        "/Movie",
        "/RichMedia",
        "/3D",
        "/Encrypt",
        "eval(",
        "unescape(",
        "String.fromCharCode(",
        "document.write(",
        "this.print(",
        "app.alert(",
        "xfa.host",
        "soap.connect",
        "util.printf",
    ];

    for pattern in &dangerous_patterns {
        if content.contains(pattern) {
            log::warn!(
                "PDF security violation: Found suspicious pattern '{}' in {}",
                pattern,
                path.display()
            );
            return Err(anyhow!(
                "PDF file rejected: Contains potentially malicious content pattern '{}'. PDF may contain embedded JavaScript or other dangerous elements.", 
                pattern
            ));
        }
    }

    // Check for embedded files patterns
    // NOTE(review): "/F " and "/UF " are generic file-spec keys and may
    // appear in benign PDFs — verify the false-positive rate is acceptable.
    let embed_patterns = ["/EmbeddedFile", "/F ", "/UF ", "/Filespec"];
    for pattern in &embed_patterns {
        if content.contains(pattern) {
            log::warn!(
                "PDF security violation: Found embedded file pattern '{}' in {}",
                pattern,
                path.display()
            );
            return Err(anyhow!(
                "PDF file rejected: Contains embedded files which pose security risks"
            ));
        }
    }

    // Check for form patterns that could be used for data exfiltration
    let form_patterns = ["/XFA", "/AcroForm", "/Fields"];
    for pattern in &form_patterns {
        if content.contains(pattern) {
            log::warn!(
                "PDF security warning: Found form pattern '{}' in {}",
                pattern,
                path.display()
            );
            // Forms are suspicious but not automatically rejected - just logged
        }
    }

    log::info!("PDF security validation passed for: {:?}", path);
    Ok(())
}
1454
1455/// Create metadata entry for documents that cannot be fully processed
1456fn create_doc_metadata(path: &Path, doc_type: &str) -> Result<String> {
1457    let file_name = path
1458        .file_stem()
1459        .and_then(|name| name.to_str())
1460        .unwrap_or("unknown");
1461
1462    // Get file size for indexing
1463    let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1464
1465    // Create indexable content from filename and metadata
1466    let mut content = String::new();
1467    content.push_str(&format!("{}: {}\n", doc_type, file_name));
1468    content.push_str(&format!("File size: {} bytes\n", file_size));
1469    content.push_str(&format!("Location: {}\n", path.display()));
1470
1471    // Add searchable terms from filename
1472    let searchable_terms: Vec<&str> = file_name
1473        .split(|c: char| !c.is_alphanumeric())
1474        .filter(|term| term.len() > 2)
1475        .collect();
1476
1477    if !searchable_terms.is_empty() {
1478        content.push_str("Keywords: ");
1479        content.push_str(&searchable_terms.join(", "));
1480        content.push('\n');
1481    }
1482
1483    // Enhanced document processing: indexed by filename, metadata, and content structure
1484    if let Ok(modified) = fs::metadata(path).and_then(|m| m.modified()) {
1485        if let Ok(duration) = modified.duration_since(std::time::SystemTime::UNIX_EPOCH) {
1486            let datetime = chrono::DateTime::from_timestamp(duration.as_secs() as i64, 0)
1487                .unwrap_or_else(chrono::Utc::now);
1488            content.push_str(&format!("Modified: {}\n", datetime.format("%Y-%m-%d")));
1489        }
1490    }
1491
1492    // Add file extension context
1493    if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
1494        content.push_str(&format!("Format: {} document\n", extension.to_uppercase()));
1495    }
1496
1497    Ok(content)
1498}
1499
1500/// Extract file metadata
1501fn extract_metadata(path: &Path) -> Result<DocumentMetadata> {
1502    let metadata = fs::metadata(path)?;
1503
1504    let file_type = path
1505        .extension()
1506        .and_then(|ext| ext.to_str())
1507        .unwrap_or("unknown")
1508        .to_lowercase();
1509
1510    let modified = metadata
1511        .modified()?
1512        .duration_since(std::time::UNIX_EPOCH)?
1513        .as_secs();
1514
1515    let modified_datetime = DateTime::from_timestamp(modified as i64, 0).unwrap_or_else(Utc::now);
1516
1517    // Extract tags from filename or path
1518    let tags = extract_tags_from_path(path);
1519
1520    // Try to detect language from content or filename
1521    let language = detect_language(path);
1522
1523    Ok(DocumentMetadata {
1524        file_type,
1525        size: metadata.len(),
1526        modified: modified_datetime,
1527        tags,
1528        language,
1529    })
1530}
1531
/// Extract tags from file path (e.g., directory names, filename patterns)
///
/// Ancestor directory names become lowercase tags, and well-known filename
/// keywords (readme/api/guide/tutorial) are added when present in the stem.
///
/// BUG FIX: filename keyword matching is now case-insensitive — the old
/// code used `contains` on the raw stem, so "README.md" produced no
/// "readme" tag even though directory tags were lowercased.
fn extract_tags_from_path(path: &Path) -> Vec<String> {
    let mut tags = Vec::new();

    // Each ancestor directory name becomes a lowercase tag.
    if let Some(parent) = path.parent() {
        for component in parent.components() {
            if let Some(name) = component.as_os_str().to_str() {
                if !name.starts_with('.') && name != "/" {
                    tags.push(name.to_lowercase());
                }
            }
        }
    }

    // Filename-pattern tags, matched on the lowercased stem.
    if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
        let lower = filename.to_lowercase();
        for keyword in ["readme", "api", "guide", "tutorial"] {
            if lower.contains(keyword) {
                tags.push(keyword.to_string());
            }
        }
    }

    tags
}
1566
/// Detect document language
///
/// Placeholder: every document is treated as English until a real
/// language-detection library is wired in.
fn detect_language(_path: &Path) -> Option<String> {
    Some(String::from("en"))
}
1573
/// Detect document structure (title, sections)
///
/// For markdown files the title is the first `# ` header and sections are
/// all `## `/`### ` headers. Any file without a detected title falls back
/// to its filename stem with `_`/`-` replaced by spaces.
///
/// Improvements: the extension check is now case-insensitive (so
/// "README.MD" is treated as markdown), and the title uses `strip_prefix`
/// like the section checks instead of byte-slicing `trimmed[2..]`.
fn detect_structure(content: &str, path: &Path) -> (Option<String>, Vec<String>) {
    let mut title = None;
    let mut sections = Vec::new();

    let is_markdown = path
        .extension()
        .and_then(|s| s.to_str())
        .map_or(false, |ext| ext.eq_ignore_ascii_case("md"));

    if is_markdown {
        for line in content.lines() {
            let trimmed = line.trim();

            // First `# ` header wins as the title.
            if title.is_none() {
                if let Some(stripped) = trimmed.strip_prefix("# ") {
                    title = Some(stripped.trim().to_string());
                }
            }

            // Collect `##` and `###` section headers.
            if let Some(stripped) = trimmed.strip_prefix("## ") {
                sections.push(stripped.trim().to_string());
            } else if let Some(stripped) = trimmed.strip_prefix("### ") {
                sections.push(stripped.trim().to_string());
            }
        }
    }

    // Fallback title: filename with separators replaced by spaces.
    if title.is_none() {
        if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
            title = Some(filename.replace(['_', '-'], " "));
        }
    }

    (title, sections)
}
1608
/// Find which section a chunk belongs to
///
/// Returns the first section whose header text appears verbatim inside
/// the chunk, or `None` when no header matches.
fn find_section_for_chunk(chunk: &str, sections: &[String]) -> Option<String> {
    sections
        .iter()
        .find(|section| chunk.contains(section.as_str()))
        .cloned()
}
1619
1620/// Chunk content into smaller pieces
1621fn chunk_content(content: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
1622    // Convert approximate token count to word count (rough estimate: 1 token ≈ 0.75 words)
1623    let word_chunk_size = (chunk_size as f32 * 0.75) as usize;
1624    let word_overlap = (overlap as f32 * 0.75) as usize;
1625
1626    // Use the preprocessing chunk_text function
1627    crate::rag::embeddings::preprocessing::chunk_text(content, word_chunk_size, word_overlap)
1628}
1629
/// Extract domain from URL without requiring external dependencies
///
/// Returns everything between "://" and the next '/' (or the end of the
/// string). Strings without "://" yield `None`. Note: any port or
/// userinfo in the authority is kept as-is.
fn extract_domain_from_url(url: &str) -> Option<String> {
    let scheme_end = url.find("://")?;
    let rest = &url[scheme_end + 3..];
    let host = match rest.find('/') {
        Some(slash) => &rest[..slash],
        None => rest,
    };
    Some(host.to_string())
}
1644
1645/// Clean HTML to plain text suitable for indexing
1646fn clean_html_to_text(html: &str) -> String {
1647    use regex::Regex;
1648    // Remove script and style blocks
1649    let re_script = Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap();
1650    let re_style = Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap();
1651    let stripped = re_script.replace_all(html, "");
1652    let without_code = re_style.replace_all(&stripped, "");
1653
1654    // Replace common block tags with newlines to keep structure
1655    let block_tags = [
1656        "</p>",
1657        "</div>",
1658        "</section>",
1659        "</article>",
1660        "</li>",
1661        "</ul>",
1662        "</ol>",
1663        "<br>",
1664        "<br/>",
1665        "<br />",
1666    ];
1667    let mut structured = without_code.to_string();
1668    for tag in &block_tags {
1669        structured = structured.replace(tag, "\n");
1670    }
1671
1672    // Remove all remaining tags
1673    let re_tags = Regex::new(r"<[^>]+>").unwrap();
1674    let no_tags = re_tags.replace_all(&structured, "");
1675
1676    // Decode common HTML entities
1677    let decoded = no_tags
1678        .replace("&amp;", "&")
1679        .replace("&lt;", "<")
1680        .replace("&gt;", ">")
1681        .replace("&quot;", "\"")
1682        .replace("&#39;", "'")
1683        .replace("&nbsp;", " ");
1684
1685    // Normalize whitespace
1686    let re_ws = regex::Regex::new(r"\s+").unwrap();
1687    re_ws.replace_all(&decoded, " ").trim().to_string()
1688}
1689
1690/// Extract page title from <title> tag (best effort)
1691fn extract_html_title(html: &str) -> Option<String> {
1692    use regex::Regex;
1693    let re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").ok()?;
1694    let caps = re.captures(html)?;
1695    let title = caps.get(1)?.as_str();
1696    let cleaned = clean_html_to_text(title);
1697    if cleaned.is_empty() {
1698        None
1699    } else {
1700        Some(cleaned)
1701    }
1702}
1703
1704/// Extract first <h1> text as a fallback title
1705fn extract_h1(html: &str) -> Option<String> {
1706    use regex::Regex;
1707    let re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").ok()?;
1708    let caps = re.captures(html)?;
1709    let h1 = caps.get(1)?.as_str();
1710    let cleaned = clean_html_to_text(h1);
1711    if cleaned.is_empty() {
1712        None
1713    } else {
1714        Some(cleaned)
1715    }
1716}
1717
// Unit tests for the pure helper functions in this module.
#[cfg(test)]
mod tests {
    use super::*;
    // Additional imports available if needed for enhanced testing
    // use std::fs::File;
    // use std::io::Write;

    // Supported-extension detection: docs, code files and PDFs are
    // indexable; unknown or missing extensions are not.
    #[test]
    fn test_is_supported_file() {
        assert!(is_supported_file(Path::new("test.md")));
        assert!(is_supported_file(Path::new("test.txt")));
        assert!(is_supported_file(Path::new("test.pdf")));
        assert!(is_supported_file(Path::new("test.rs"))); // Now supported for code indexing
        assert!(!is_supported_file(Path::new("test.unknown")));
        assert!(!is_supported_file(Path::new("test")));
    }

    // Markdown structure detection: first `# ` header becomes the title,
    // `##`/`###` headers become the section list.
    #[test]
    fn test_detect_structure() {
        let content = r#"# Main Title

Some introduction text.

## Section 1

Content for section 1.

## Section 2

Content for section 2.

### Subsection 2.1

More content.
"#;

        let path = Path::new("test.md");
        let (title, sections) = detect_structure(content, path);

        assert_eq!(title, Some("Main Title".to_string()));
        assert_eq!(sections.len(), 3);
        assert!(sections.contains(&"Section 1".to_string()));
        assert!(sections.contains(&"Section 2".to_string()));
        assert!(sections.contains(&"Subsection 2.1".to_string()));
    }

    // Path-derived tags: ancestor directory names plus filename keywords.
    #[test]
    fn test_extract_tags_from_path() {
        let path = Path::new("/docs/api/authentication/readme.md");
        let tags = extract_tags_from_path(path);

        assert!(tags.contains(&"docs".to_string()));
        assert!(tags.contains(&"api".to_string()));
        assert!(tags.contains(&"authentication".to_string()));
        assert!(tags.contains(&"readme".to_string()));
    }

    // Chunking: multiple chunks are produced and consecutive chunks share
    // overlapping words at their boundary.
    #[test]
    fn test_chunk_content() {
        let content = "This is a test document with multiple sentences. Each sentence should be preserved in the chunking process. We want to make sure the chunks are reasonable.";
        let chunks = chunk_content(content, 10, 2); // Small chunks for testing

        assert!(chunks.len() > 1);
        assert!(!chunks[0].is_empty());

        // Check for overlap
        if chunks.len() > 1 {
            let words1: Vec<&str> = chunks[0].split_whitespace().collect();
            let words2: Vec<&str> = chunks[1].split_whitespace().collect();

            // There should be some overlap between consecutive chunks
            let overlap_found = words1
                .iter()
                .rev()
                .take(5)
                .any(|word| words2.iter().take(5).any(|w| w == word));
            assert!(overlap_found);
        }
    }
}