1use anyhow::{anyhow, Result};
7use chrono::{DateTime, Utc};
8use docrawl::{crawl, Config, CrawlConfig};
9use std::fs;
10use std::path::{Path, PathBuf};
11use std::time::Duration;
12use url::Url;
13use walkdir::WalkDir;
14
15use crate::rag::embeddings::preprocessing;
16use crate::rag::{DocumentChunk, DocumentMetadata, RagConfig, SourceType};
17
/// Builds and maintains the local RAG document index.
///
/// Holds a clone of the effective RAG configuration plus the resolved
/// (tilde-expanded) on-disk location of the index directory.
pub struct Indexer {
    /// Cloned RAG configuration driving chunking and security policy.
    config: RagConfig,
    /// Resolved index directory (created by [`Indexer::new`] if missing).
    index_path: PathBuf,
}
23
24impl Indexer {
25 pub fn new(config: &RagConfig) -> Result<Self> {
27 let index_path = if config.index_path.to_string_lossy().starts_with("~") {
28 let home = std::env::var("HOME")
30 .or_else(|_| std::env::var("USERPROFILE"))
31 .map_err(|_| anyhow!("Cannot determine home directory"))?;
32 let path_str = config.index_path.to_string_lossy();
33 let without_tilde = path_str.strip_prefix("~/").unwrap_or(&path_str[1..]);
34 PathBuf::from(home).join(without_tilde)
35 } else {
36 config.index_path.clone()
37 };
38
39 if !index_path.exists() {
41 fs::create_dir_all(&index_path)
42 .map_err(|e| anyhow!("Failed to create index directory {:?}: {}", index_path, e))?;
43 }
44
45 Ok(Self {
46 config: config.clone(),
47 index_path,
48 })
49 }
50
    /// Returns the resolved on-disk location of the index directory.
    pub fn get_index_path(&self) -> &PathBuf {
        &self.index_path
    }
55
    /// Indexes a single file, delegating to the free function
    /// [`index_document`] with this indexer's configuration.
    pub fn index_document(&self, path: PathBuf) -> Result<Vec<DocumentChunk>> {
        index_document(path, &self.config)
    }
60
61 pub fn index_directory(&self, dir_path: PathBuf) -> Result<Vec<DocumentChunk>> {
63 let documents = find_documents(&dir_path)?;
64 let mut all_chunks = Vec::new();
65
66 for doc_path in documents {
67 match self.index_document(doc_path.clone()) {
68 Ok(mut chunks) => all_chunks.append(&mut chunks),
69 Err(e) => {
70 log::warn!("Failed to index {:?}: {}", doc_path, e);
71 continue;
72 }
73 }
74 }
75
76 log::info!(
77 "Indexed {} chunks from {} directory",
78 all_chunks.len(),
79 dir_path.display()
80 );
81 Ok(all_chunks)
82 }
83
    /// Fetches one URL over HTTP(S) and indexes just that page, without
    /// following any links.
    ///
    /// The page `<title>` (or, failing that, the first `<h1>`) becomes the
    /// chunk title; the HTML is stripped to plain text, chunked, and tagged
    /// with the source domain.
    ///
    /// # Errors
    /// Fails on invalid URLs, non-success HTTP statuses, or pages whose
    /// stripped text is empty.
    pub async fn index_single_url_no_crawl(&self, url: &str) -> Result<Vec<DocumentChunk>> {
        log::info!("Fetching single URL without crawl: {}", url);

        // Validate URL syntax up front; the parsed value itself is unused.
        let _ = Url::parse(url).map_err(|e| anyhow!("Invalid URL '{}': {}", url, e))?;

        let client = reqwest::Client::builder()
            .user_agent("Manx/0.5.0 (Single Page Indexer)")
            .timeout(Duration::from_secs(30))
            .build()?;

        let resp = client.get(url).send().await?;
        if !resp.status().is_success() {
            return Err(anyhow!("Failed to fetch URL {}: {}", url, resp.status()));
        }
        let html = resp.text().await?;

        // Prefer the <title> tag, fall back to the first <h1>.
        let page_title = extract_html_title(&html).or_else(|| extract_h1(&html));

        let text = clean_html_to_text(&html);
        if text.trim().is_empty() {
            return Err(anyhow!("Fetched page contains no indexable text: {}", url));
        }

        let chunks = chunk_content(&text, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

        let mut tags = vec!["documentation".to_string(), "single-page".to_string()];
        if let Some(domain) = extract_domain_from_url(url) {
            tags.push(domain);
        }

        let metadata = DocumentMetadata {
            file_type: "html".to_string(),
            size: text.len() as u64,
            modified: Utc::now(),
            tags,
            // NOTE(review): language is hard-coded to English — confirm
            // whether detection is expected here.
            language: Some("en".to_string()),
        };

        let mut document_chunks = Vec::new();
        for (i, chunk_content) in chunks.into_iter().enumerate() {
            let chunk = DocumentChunk {
                // Chunk ids are "<url>_<index>" — unique within one page.
                id: format!("{}_{}", url, i),
                content: preprocessing::clean_text(&chunk_content),
                source_path: PathBuf::from(url),
                source_type: SourceType::Web,
                title: page_title.clone(),
                section: None,
                chunk_index: i,
                metadata: metadata.clone(),
            };
            document_chunks.push(chunk);
        }

        log::info!(
            "Indexed {} chunks from single URL without crawl: {}",
            document_chunks.len(),
            url
        );
        Ok(document_chunks)
    }
152
    /// Indexes a single URL by delegating to [`Self::index_url_deep`] with
    /// a crawl depth of 0 (i.e. only the given page, no link following).
    #[allow(dead_code)]
    pub async fn index_url(&self, url: String) -> Result<Vec<DocumentChunk>> {
        log::info!("Indexing single URL (no crawling): {}", url);
        self.index_url_deep(url, Some(0), false).await
    }
160
    /// Crawls `url` with the docrawl engine and indexes every markdown page
    /// the crawler writes out.
    ///
    /// Depth selection: an explicit `crawl_depth` wins; otherwise
    /// `crawl_all == true` removes the cap entirely, and the default is a
    /// depth of 3. Pages are written to a unique temp directory which is
    /// removed afterwards (best-effort).
    ///
    /// # Errors
    /// Fails on malformed or non-HTTP(S) URLs, temp-dir creation failure,
    /// or when the crawl itself fails.
    pub async fn index_url_deep(
        &self,
        url: String,
        crawl_depth: Option<u32>,
        crawl_all: bool,
    ) -> Result<Vec<DocumentChunk>> {
        log::info!(
            "Starting docrawl of URL: {} (depth: {:?}, crawl_all: {})",
            url,
            crawl_depth,
            crawl_all
        );

        let parsed_url =
            url::Url::parse(&url).map_err(|e| anyhow!("Invalid URL format '{}': {}", url, e))?;

        // Restrict to web schemes; rejects file://, ftp://, etc.
        match parsed_url.scheme() {
            "http" | "https" => {}
            scheme => {
                return Err(anyhow!(
                    "Unsupported URL scheme '{}'. Only HTTP and HTTPS are allowed.",
                    scheme
                ))
            }
        }

        // Unique temp dir per crawl so concurrent runs cannot collide.
        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&temp_dir)?;

        let base_url = Url::parse(&url)?;

        let config = CrawlConfig {
            base_url,
            output_dir: temp_dir.clone(),
            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
            // Explicit depth > uncapped (crawl_all) > default of 3.
            max_depth: if let Some(depth) = crawl_depth {
                Some(depth as usize)
            } else if crawl_all {
                None
            } else {
                Some(3)
            },
            silence: true,
            rate_limit_per_sec: 10,
            follow_sitemaps: true,
            concurrency: 4,
            timeout: Some(Duration::from_secs(30)),
            resume: false,
            config: Config::default(),
        };

        log::info!("Running docrawl on: {}", url);
        match crawl(config).await {
            Ok(stats) => {
                log::info!(
                    "Docrawl completed successfully, processed {} pages",
                    stats.pages
                );
            }
            Err(e) => {
                // Best-effort cleanup before bailing out.
                let _ = std::fs::remove_dir_all(&temp_dir);
                return Err(anyhow!("Docrawl failed: {}", e));
            }
        }

        let mut all_chunks = Vec::new();
        let markdown_files = self.find_markdown_files(&temp_dir)?;

        log::info!(
            "Processing {} markdown files from docrawl",
            markdown_files.len()
        );

        for (index, md_file) in markdown_files.iter().enumerate() {
            log::debug!(
                "Processing markdown file {}/{}: {}",
                index + 1,
                markdown_files.len(),
                md_file.display()
            );

            // One bad page is logged and skipped, never fatal.
            match self.process_markdown_file(md_file, &url).await {
                Ok(chunks) => {
                    let chunk_count = chunks.len();
                    all_chunks.extend(chunks);
                    log::debug!(
                        "Successfully processed markdown: {} ({} chunks)",
                        md_file.display(),
                        chunk_count
                    );
                }
                Err(e) => {
                    log::warn!("Failed to process markdown '{}': {}", md_file.display(), e);
                }
            }
        }

        // Cleanup failure is non-fatal; the temp dir is simply orphaned.
        if let Err(e) = std::fs::remove_dir_all(&temp_dir) {
            log::warn!("Failed to clean up temporary directory: {}", e);
        }

        log::info!(
            "Successfully indexed {} chunks from {} markdown files via docrawl of: {}",
            all_chunks.len(),
            markdown_files.len(),
            url
        );

        Ok(all_chunks)
    }
281
282 fn find_markdown_files(&self, dir: &Path) -> Result<Vec<PathBuf>> {
284 let mut markdown_files = Vec::new();
285
286 for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
287 let path = entry.path();
288 if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("md") {
289 markdown_files.push(path.to_path_buf());
290 }
291 }
292
293 Ok(markdown_files)
294 }
295
296 pub(crate) async fn process_markdown_file(
298 &self,
299 md_file: &Path,
300 base_url: &str,
301 ) -> Result<Vec<DocumentChunk>> {
302 let content = std::fs::read_to_string(md_file)?;
304
305 if content.trim().is_empty() {
306 return Err(anyhow!(
307 "Markdown file contains no content: {}",
308 md_file.display()
309 ));
310 }
311
312 let metadata = self.create_markdown_metadata(md_file, &content, base_url)?;
314
315 let (title, sections) = detect_structure(&content, md_file);
317
318 let page_url = self.derive_page_url(md_file, base_url);
320
321 let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);
323
324 let mut document_chunks = Vec::new();
326 for (i, chunk_content) in chunks.into_iter().enumerate() {
327 let section = find_section_for_chunk(&chunk_content, §ions);
329
330 let chunk = DocumentChunk {
331 id: format!("{}_{}", page_url, i),
332 content: preprocessing::clean_text(&chunk_content),
333 source_path: PathBuf::from(&page_url),
334 source_type: SourceType::Web,
335 title: title.clone(),
336 section: section.clone(),
337 chunk_index: i,
338 metadata: metadata.clone(),
339 };
340
341 document_chunks.push(chunk);
342 }
343
344 Ok(document_chunks)
345 }
346
347 fn create_markdown_metadata(
349 &self,
350 md_file: &Path,
351 content: &str,
352 base_url: &str,
353 ) -> Result<DocumentMetadata> {
354 let file_metadata = std::fs::metadata(md_file)?;
355 let modified_time = file_metadata.modified()?;
356 let modified_datetime = chrono::DateTime::<chrono::Utc>::from(modified_time);
357
358 let mut tags = extract_tags_from_path(md_file);
360 tags.push("documentation".to_string());
361 tags.push("crawled".to_string());
362
363 if let Some(domain) = extract_domain_from_url(base_url) {
365 tags.push(domain);
366 }
367
368 let language = detect_language(md_file);
370
371 Ok(DocumentMetadata {
372 file_type: "markdown".to_string(),
373 size: content.len() as u64,
374 modified: modified_datetime,
375 tags,
376 language,
377 })
378 }
379
380 fn derive_page_url(&self, md_file: &Path, base_url: &str) -> String {
382 let file_name = md_file
384 .file_stem()
385 .and_then(|s| s.to_str())
386 .unwrap_or("page");
387
388 if base_url.ends_with('/') {
390 format!("{}{}", base_url, file_name)
391 } else {
392 format!("{}/{}", base_url, file_name)
393 }
394 }
395
    /// Shallow-crawls `url`: indexes the page itself plus every same-host
    /// link found on it (one level deep only), capped at `max_pages` total
    /// pages.
    ///
    /// Linked pages are fetched concurrently via a `JoinSet`; individual
    /// fetch failures are silently dropped. Progress goes to stderr.
    pub async fn index_shallow_url(
        &self,
        url: &str,
        max_pages: Option<usize>,
    ) -> Result<Vec<DocumentChunk>> {
        use indicatif::{ProgressBar, ProgressStyle};
        use scraper::{Html, Selector};
        use tokio::task::JoinSet;

        let client = reqwest::Client::builder()
            .user_agent("Manx/0.5.0 (Shallow Crawler)")
            .timeout(Duration::from_secs(30))
            .build()?;

        let resp = client.get(url).send().await?;
        if !resp.status().is_success() {
            return Err(anyhow!("Failed to fetch URL {}: {}", url, resp.status()));
        }
        // Use the post-redirect URL as the canonical base for link resolution.
        let final_url = resp.url().clone();
        let base_html = resp.text().await?;

        eprintln!("\n🌐 Shallow crawl starting: {}", url);

        let mut all_chunks: Vec<DocumentChunk> = Vec::new();
        // Converts one fetched page into chunks; empty pages yield no chunks.
        let make_chunks = |page_url: &str, html: &str| -> Result<Vec<DocumentChunk>> {
            let page_title = extract_html_title(html).or_else(|| extract_h1(html));
            let text = clean_html_to_text(html);
            if text.trim().is_empty() {
                return Ok(vec![]);
            }
            let chunks = chunk_content(&text, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

            let mut tags = vec!["documentation".to_string(), "shallow-crawl".to_string()];
            if let Some(domain) = extract_domain_from_url(page_url) {
                tags.push(domain);
            }
            let metadata = DocumentMetadata {
                file_type: "html".to_string(),
                size: text.len() as u64,
                modified: Utc::now(),
                tags,
                language: Some("en".to_string()),
            };

            let mut docs = Vec::new();
            for (i, chunk_content) in chunks.into_iter().enumerate() {
                docs.push(DocumentChunk {
                    id: format!("{}_{}", page_url, i),
                    content: preprocessing::clean_text(&chunk_content),
                    source_path: PathBuf::from(page_url),
                    source_type: SourceType::Web,
                    title: page_title.clone(),
                    section: None,
                    chunk_index: i,
                    metadata: metadata.clone(),
                });
            }
            Ok(docs)
        };

        eprintln!("🔎 Fetching base page: {}", final_url);
        all_chunks.extend(make_chunks(final_url.as_str(), &base_html)?);

        // Collect candidate links from the base page only (depth 1).
        let doc = Html::parse_document(&base_html);
        let a_sel = Selector::parse("a[href]").unwrap();
        let base_host = final_url.host_str().unwrap_or("");
        // Treat "www.example.com" and "example.com" as the same host.
        let norm = |h: &str| h.strip_prefix("www.").unwrap_or(h).to_string();
        let base_norm = norm(base_host);

        use std::collections::HashSet;
        let mut seen: HashSet<String> = HashSet::new();
        seen.insert(final_url.as_str().to_string());

        // The cap counts the base page too, since it is seeded into `seen`.
        let page_cap = max_pages.unwrap_or(usize::MAX);
        let mut targets: Vec<url::Url> = Vec::new();
        for a in doc.select(&a_sel) {
            if let Some(href) = a.value().attr("href") {
                // join() resolves relative hrefs against the base page.
                if let Ok(abs) = final_url.join(href) {
                    // Same-host links only; host-less URLs are skipped.
                    if let Some(h) = abs.host_str() {
                        if norm(h) != base_norm {
                            continue;
                        }
                    } else {
                        continue;
                    }
                    let s = abs.as_str().to_string();
                    // Deduplicate; insert() returning false means already seen.
                    if !seen.insert(s.clone()) {
                        continue;
                    }
                    targets.push(abs);
                    if seen.len() >= page_cap {
                        break;
                    }
                }
            }
        }

        eprintln!("🔗 Found {} same-host links", targets.len());

        let pb = if !targets.is_empty() {
            let pb = ProgressBar::new(targets.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} pages ({percent}%) | {msg}")
                    .unwrap()
                    .progress_chars("█▉▊▋▌▍▎▏ "),
            );
            pb.set_message("Fetching pages...");
            Some(pb)
        } else {
            None
        };
        // Fetch every target concurrently; each task yields (url, html) on
        // success, or None on any failure.
        let mut set = JoinSet::new();
        let client2 = client.clone();
        for t in targets.into_iter() {
            let client3 = client2.clone();
            set.spawn(async move {
                let r = client3.get(t.clone()).send().await.ok()?;
                if !r.status().is_success() {
                    return None;
                }
                let html = r.text().await.ok()?;
                Some((t.to_string(), html))
            });
        }

        let mut fetched = 0usize;
        while let Some(res) = set.join_next().await {
            if let Ok(Some((page_url, html))) = res {
                if let Ok(mut chunks) = make_chunks(&page_url, &html) {
                    all_chunks.append(&mut chunks);
                }
            }
            // Progress advances per completed task, success or failure.
            fetched += 1;
            if let Some(pb) = &pb {
                pb.set_position(fetched as u64);
            }
        }

        if let Some(pb) = pb {
            pb.finish_with_message("✓ Shallow crawl completed");
        }

        Ok(all_chunks)
    }
551}
552
/// File extensions (leading dot included) the indexer will consider;
/// matched case-insensitively by [`is_supported_file`].
const SUPPORTED_EXTENSIONS: &[&str] = &[
    // Documents
    ".md",
    ".txt",
    ".pdf",
    ".doc",
    ".docx",
    ".rst",
    // Web / frontend source
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".vue",
    ".svelte",
    ".html",
    ".css",
    ".scss",
    ".sass",
    ".less",
    // General-purpose languages
    ".py",
    ".rb",
    ".php",
    ".java",
    ".scala",
    ".kotlin",
    ".groovy",
    // Systems languages
    ".c",
    ".cpp",
    ".cc",
    ".cxx",
    ".h",
    ".hpp",
    ".rs",
    ".go",
    ".zig",
    // Functional languages
    ".ml",
    ".mli",
    ".hs",
    ".elm",
    ".clj",
    ".cljs",
    ".erl",
    ".ex",
    ".exs",
    // Configuration formats
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".xml",
    ".ini",
    ".env",
    ".properties",
    // Shell / scripting
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    // Mobile
    ".swift",
    ".m",
    ".mm",
    ".kt",
    ".dart",
    // Data / query languages
    ".sql",
    ".graphql",
    ".prisma",
    // Scientific and editor scripting
    ".r",
    ".R",
    ".jl",
    ".lua",
    ".vim",
    ".el",
];
637
/// Target chunk size passed to `chunk_content` — units (characters vs.
/// words) are defined by that function; confirm against its implementation.
const DEFAULT_CHUNK_SIZE: usize = 500;

/// Amount of content repeated between consecutive chunks for context
/// overlap (same units as `DEFAULT_CHUNK_SIZE`).
const DEFAULT_CHUNK_OVERLAP: usize = 50;
643
644pub fn find_documents(dir_path: &Path) -> Result<Vec<PathBuf>> {
646 if !dir_path.exists() {
647 return Err(anyhow!("Directory does not exist: {:?}", dir_path));
648 }
649
650 if !dir_path.is_dir() {
651 return Err(anyhow!("Path is not a directory: {:?}", dir_path));
652 }
653
654 let mut documents = Vec::new();
655 let max_depth = 10; let max_file_size = 100 * 1024 * 1024; log::info!("Scanning directory for documents: {:?}", dir_path);
659
660 for entry in WalkDir::new(dir_path)
661 .max_depth(max_depth)
662 .follow_links(false) .into_iter()
664 .filter_map(|e| e.ok())
665 {
667 let path = entry.path();
668
669 if !path.is_file() {
671 continue;
672 }
673
674 if !is_supported_file(path) {
676 continue;
677 }
678
679 if let Ok(metadata) = entry.metadata() {
681 if metadata.len() > max_file_size {
682 log::warn!(
683 "Skipping large file ({}MB): {:?}",
684 metadata.len() / 1024 / 1024,
685 path
686 );
687 continue;
688 }
689 }
690
691 if path
693 .file_name()
694 .and_then(|name| name.to_str())
695 .map(|name| name.starts_with('.'))
696 .unwrap_or(false)
697 {
698 log::debug!("Skipping hidden file: {:?}", path);
699 continue;
700 }
701
702 let path_str = path.to_string_lossy();
704 let skip_patterns = [
705 "/target/",
706 "/.git/",
707 "/node_modules/",
708 "/__pycache__/",
709 "/.cache/",
710 "/dist/",
711 "/build/",
712 ];
713
714 if skip_patterns
715 .iter()
716 .any(|pattern| path_str.contains(pattern))
717 {
718 log::debug!("Skipping file in ignored directory: {:?}", path);
719 continue;
720 }
721
722 documents.push(path.to_path_buf());
723 }
724
725 log::info!(
726 "Found {} indexable documents in {:?} (max depth: {})",
727 documents.len(),
728 dir_path,
729 max_depth
730 );
731
732 if documents.is_empty() {
733 log::warn!(
734 "No supported documents found in {:?}. Supported formats: {:?}",
735 dir_path,
736 SUPPORTED_EXTENSIONS
737 );
738 }
739
740 Ok(documents)
741}
742
743pub fn is_supported_file(path: &Path) -> bool {
745 path.extension()
746 .and_then(|ext| ext.to_str())
747 .map(|ext| SUPPORTED_EXTENSIONS.contains(&format!(".{}", ext.to_lowercase()).as_str()))
748 .unwrap_or(false)
749}
750
/// Indexes one local file into [`DocumentChunk`]s.
///
/// Security policy is applied first: PDFs and recognized code files can be
/// skipped entirely (returning an empty `Vec`, *not* an error) when their
/// processing is disabled in `config`, so directory scans keep going.
///
/// # Errors
/// Fails for missing files, unsupported types, or empty extracted content.
pub fn index_document(path: PathBuf, config: &RagConfig) -> Result<Vec<DocumentChunk>> {
    if !path.exists() {
        return Err(anyhow!("File does not exist: {:?}", path));
    }

    if !is_supported_file(&path) {
        return Err(anyhow!("Unsupported file type: {:?}", path));
    }

    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    // Policy gate: a skipped file is not an error.
    if extension == "pdf" && !config.allow_pdf_processing {
        log::warn!("PDF processing disabled for security. Skipping: {:?}", path);
        return Ok(vec![]);
    }

    // NOTE(review): this gate covers fewer extensions than the code arm of
    // `extract_text` (e.g. .lua, .swift, .dart slip past the
    // allow_code_processing check) — confirm whether that is intended.
    let code_extensions = [
        "js", "jsx", "ts", "tsx", "py", "rb", "php", "java", "scala", "kotlin", "rs", "go", "c",
        "cpp", "sh", "bash", "ps1",
    ];
    if code_extensions.contains(&extension.as_str()) && !config.allow_code_processing {
        log::warn!("Code processing disabled. Skipping: {:?}", path);
        return Ok(vec![]);
    }

    log::info!("Indexing document: {:?}", path);

    let content = extract_text(&path, config)?;
    if content.trim().is_empty() {
        return Err(anyhow!("Document contains no text content: {:?}", path));
    }

    let metadata = extract_metadata(&path)?;

    // Title + section headings drive per-chunk attribution below.
    let (title, sections) = detect_structure(&content, &path);

    let chunks = chunk_content(&content, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);

    let mut document_chunks = Vec::new();
    for (i, chunk_content) in chunks.into_iter().enumerate() {
        let section = find_section_for_chunk(&chunk_content, &sections);

        let chunk = DocumentChunk {
            // Ids are "<path>_<chunk index>" — unique within one file.
            id: format!("{}_{}", path.to_string_lossy(), i),
            content: preprocessing::clean_text(&chunk_content),
            source_path: path.clone(),
            source_type: SourceType::Local,
            title: title.clone(),
            section: section.clone(),
            chunk_index: i,
            metadata: metadata.clone(),
        };

        document_chunks.push(chunk);
    }

    log::info!("Created {} chunks from {:?}", document_chunks.len(), path);
    Ok(document_chunks)
}
823
/// Dispatches text extraction based on the file extension (lower-cased).
///
/// # Errors
/// Returns an error for extensions outside the supported set, or when the
/// chosen extractor fails.
fn extract_text(path: &Path, config: &RagConfig) -> Result<String> {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        // Plain-text documentation formats: read verbatim.
        "md" | "txt" | "rst" => extract_text_file(path),
        // Binary document formats: metadata-oriented extraction.
        "pdf" => extract_pdf_text(path),
        "doc" | "docx" => extract_doc_text(path),
        // Source code: security validation + optional secret masking.
        "js" | "jsx" | "ts" | "tsx" | "vue" | "svelte" | "html" | "css" | "scss" | "sass"
        | "less" | "py" | "rb" | "php" | "java" | "scala" | "kotlin" | "groovy" | "c" | "cpp"
        | "cc" | "cxx" | "h" | "hpp" | "rs" | "go" | "zig" | "ml" | "mli" | "hs" | "elm"
        | "clj" | "cljs" | "erl" | "ex" | "exs" | "swift" | "m" | "mm" | "kt" | "dart" | "r"
        | "jl" | "lua" | "vim" | "el" | "sql" | "graphql" | "prisma" => {
            extract_code_text(path, config)
        }
        // Configuration formats: secrets masked when enabled.
        "json" | "yaml" | "yml" | "toml" | "xml" | "ini" | "properties" => {
            extract_config_text(path, config)
        }
        // Shell scripts: extra dangerous-command validation.
        "sh" | "bash" | "zsh" | "fish" | "ps1" | "bat" | "cmd" => extract_shell_text(path, config),
        // Env files: values are always masked.
        "env" => extract_env_text(path, config),
        _ => Err(anyhow!("Unsupported file extension: {}", extension)),
    }
}
855
/// Reads a plain-text document (md/txt/rst) into a `String`, wrapping I/O
/// failures with the offending path.
fn extract_text_file(path: &Path) -> Result<String> {
    fs::read_to_string(path).map_err(|e| anyhow!("Failed to read text file {:?}: {}", path, e))
}
860
861fn extract_pdf_text(path: &Path) -> Result<String> {
863 log::info!("Processing PDF file with security validation: {:?}", path);
864
865 validate_pdf_security(path)?;
867
868 let file_name = path
871 .file_stem()
872 .and_then(|name| name.to_str())
873 .unwrap_or("unknown");
874
875 let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
877
878 let mut content = String::new();
880 content.push_str(&format!("PDF Document: {}\n", file_name));
881 content.push_str(&format!("File size: {} bytes\n", file_size));
882 content.push_str(&format!("Location: {}\n", path.display()));
883
884 let searchable_terms: Vec<&str> = file_name
886 .split(|c: char| !c.is_alphanumeric())
887 .filter(|term| term.len() > 2)
888 .collect();
889
890 if !searchable_terms.is_empty() {
891 content.push_str("Keywords: ");
892 content.push_str(&searchable_terms.join(", "));
893 content.push('\n');
894 }
895
896 content.push_str("This document is indexed by filename and metadata.");
898
899 log::info!(
900 "Created indexable content for PDF {:?} ({} characters)",
901 path,
902 content.len()
903 );
904 Ok(content)
905}
906
907fn extract_doc_text(path: &Path) -> Result<String> {
909 log::info!("Processing DOC/DOCX file: {:?}", path);
910
911 let extension = path
912 .extension()
913 .and_then(|ext| ext.to_str())
914 .unwrap_or("")
915 .to_lowercase();
916
917 if extension == "doc" {
919 log::warn!("Legacy DOC format detected: {:?}", path);
920 return create_doc_metadata(path, "DOC (Legacy Word Document)");
921 }
922
923 if extension == "docx" {
925 match extract_docx_text_safe(path) {
927 Ok(content) => {
928 log::info!(
929 "Successfully extracted {} characters from DOCX: {:?}",
930 content.len(),
931 path
932 );
933 return Ok(content);
934 }
935 Err(e) => {
936 log::warn!(
937 "Failed to extract DOCX text, using metadata fallback: {}",
938 e
939 );
940 return create_doc_metadata(path, "DOCX (Word Document)");
941 }
942 }
943 }
944
945 Err(anyhow!("Unsupported document format: {:?}", extension))
946}
947
/// Validates that a `.docx` file parses with `docx_rs` and returns a fixed
/// descriptive placeholder.
///
/// NOTE(review): despite the name, no body text is actually extracted —
/// the parsed document is discarded and only the path plus a canned
/// description is returned. Real text extraction is an acknowledged TODO.
fn extract_docx_text_safe(path: &Path) -> Result<String> {
    use docx_rs::read_docx;

    let file_bytes = std::fs::read(path).map_err(|e| anyhow!("Failed to read DOCX file: {}", e))?;

    // Parse purely as validation; the result is intentionally unused.
    let _docx = read_docx(&file_bytes).map_err(|e| anyhow!("Failed to parse DOCX file: {}", e))?;

    let mut text_content = String::new();

    text_content.push_str(&format!("DOCX Document from: {}\n", path.display()));
    text_content.push_str("Document content successfully parsed.\n");
    text_content.push_str("Note: Basic DOCX processing - text extraction can be enhanced.");

    Ok(text_content)
}
967
968fn extract_code_text(path: &Path, config: &RagConfig) -> Result<String> {
970 validate_code_security(path, &config.code_security_level)?;
972
973 let content = fs::read_to_string(path)
975 .map_err(|e| anyhow!("Failed to read code file {:?}: {}", path, e))?;
976
977 let cleaned = if config.mask_secrets {
979 sanitize_code_content(&content)
980 } else {
981 content
982 };
983 Ok(cleaned)
984}
985
986fn extract_config_text(path: &Path, config: &RagConfig) -> Result<String> {
988 let content = fs::read_to_string(path)
989 .map_err(|e| anyhow!("Failed to read config file {:?}: {}", path, e))?;
990
991 let sanitized = if config.mask_secrets {
993 mask_secrets(&content)
994 } else {
995 content
996 };
997 Ok(sanitized)
998}
999
1000fn extract_shell_text(path: &Path, config: &RagConfig) -> Result<String> {
1002 validate_shell_security(path, &config.code_security_level)?;
1004
1005 let content = fs::read_to_string(path)
1006 .map_err(|e| anyhow!("Failed to read shell script {:?}: {}", path, e))?;
1007
1008 let sanitized = if config.mask_secrets {
1010 sanitize_shell_content(&content)
1011 } else {
1012 content
1013 };
1014 Ok(sanitized)
1015}
1016
1017fn extract_env_text(path: &Path, _config: &RagConfig) -> Result<String> {
1019 let content = fs::read_to_string(path)
1020 .map_err(|e| anyhow!("Failed to read env file {:?}: {}", path, e))?;
1021
1022 let masked = mask_env_secrets(&content);
1024 Ok(masked)
1025}
1026
/// Validates a source file before indexing: hard size cap, obfuscation
/// heuristics, suspicious URLs, and prompt-injection phrases.
///
/// Only the size cap is always fatal; the other findings are rejected,
/// warned about, or ignored according to `security_level`.
fn validate_code_security(
    path: &Path,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    log::debug!("Running security validation on code file: {:?}", path);

    let metadata = fs::metadata(path)?;
    // 100 MB hard cap, regardless of security level.
    const MAX_CODE_SIZE: u64 = 100 * 1024 * 1024;
    if metadata.len() > MAX_CODE_SIZE {
        return Err(anyhow!(
            "Code file rejected: Size {} bytes exceeds maximum allowed size of {} bytes",
            metadata.len(),
            MAX_CODE_SIZE
        ));
    }

    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read code file for validation: {}", e))?;

    if is_potentially_obfuscated(&content) {
        match security_level {
            CodeSecurityLevel::Strict => {
                return Err(anyhow!(
                    "Code file rejected: Contains potentially obfuscated content"
                ));
            }
            CodeSecurityLevel::Moderate => {
                log::warn!("Code file may contain obfuscated content: {:?}", path);
            }
            CodeSecurityLevel::Permissive => {
                log::debug!("Obfuscated content check bypassed (permissive mode)");
            }
        }
    }

    // These propagate their own Strict-mode rejections.
    validate_urls_in_code(&content, security_level)?;

    check_prompt_injection(&content, security_level)?;

    Ok(())
}
1075
/// Screens a shell script for well-known destructive or remote-execution
/// one-liners before indexing.
///
/// Unlike the code validator, `Strict` *and* `Moderate` both reject on a
/// match here; only `Permissive` downgrades findings to a warning.
fn validate_shell_security(
    path: &Path,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    log::debug!(
        "Running enhanced security validation on shell script: {:?}",
        path
    );

    let content = fs::read_to_string(path)
        .map_err(|e| anyhow!("Failed to read shell script for validation: {}", e))?;

    let dangerous_patterns = [
        r"rm\s+-rf\s+/",               // recursive delete from root
        r"rm\s+-rf\s+\*",              // recursive delete of everything here
        r":\(\)\s*\{\s*:\|\:&\s*\};:", // fork bomb
        r"mkfs\.",                     // filesystem format
        r"dd\s+if=/dev/(zero|random)", // raw-disk overwrite source
        r">\s*/dev/sda",               // raw-disk overwrite target
        r"curl.*\|\s*(ba)?sh",         // pipe-to-shell download
        r"wget.*\|\s*(ba)?sh",         // pipe-to-shell download
        r"eval\s+.*\$\(",              // eval of command substitution
        r"python\s+-c.*exec",          // inline python exec
    ];

    // Patterns that fail to compile are silently dropped (best-effort).
    let compiled_patterns: Vec<regex::Regex> = dangerous_patterns
        .iter()
        .filter_map(|pattern| regex::Regex::new(pattern).ok())
        .collect();

    for pattern in &compiled_patterns {
        if pattern.is_match(&content) {
            match security_level {
                CodeSecurityLevel::Strict | CodeSecurityLevel::Moderate => {
                    return Err(anyhow!(
                        "Shell script rejected: Contains potentially dangerous command pattern"
                    ));
                }
                CodeSecurityLevel::Permissive => {
                    log::warn!("Dangerous shell pattern detected but allowed in permissive mode");
                }
            }
        }
    }

    Ok(())
}
1126
1127fn is_potentially_obfuscated(content: &str) -> bool {
1129 let lines: Vec<&str> = content.lines().collect();
1131 let mut suspicious_count = 0;
1132
1133 let hex_regex = regex::Regex::new(r"\\x[0-9a-fA-F]{2}").unwrap();
1134
1135 for line in lines {
1136 if line.trim().starts_with("//")
1138 || line.trim().starts_with("#")
1139 || line.trim().starts_with("/*")
1140 {
1141 continue;
1142 }
1143
1144 if line.contains("atob") || line.contains("btoa") || line.contains("base64") {
1146 suspicious_count += 1;
1147 }
1148
1149 if hex_regex.is_match(line) {
1151 suspicious_count += 1;
1152 }
1153
1154 if line.matches('\\').count() > 10 {
1156 suspicious_count += 1;
1157 }
1158 }
1159
1160 suspicious_count > 5
1161}
1162
/// Looks for URL-shortener links in code, a common way to hide malicious
/// destinations, and reacts per the configured security level.
///
/// Matching is plain substring containment on each discovered URL, so a
/// shortener domain appearing anywhere in the URL (including the path)
/// triggers a finding.
fn validate_urls_in_code(
    content: &str,
    security_level: &crate::rag::CodeSecurityLevel,
) -> Result<()> {
    use crate::rag::CodeSecurityLevel;
    let url_pattern = regex::Regex::new(r#"https?://[^\s"']+"#).unwrap();

    let suspicious_domains = [
        "bit.ly",
        "tinyurl.com",
        "goo.gl",
        "ow.ly",
        "shorte.st",
        "adf.ly",
        "bc.vc",
        "bit.do",
        "soo.gd",
        "7.ly",
        "5z8.info",
        // NOTE(review): "DFHGDH" looks like leftover test/placeholder data,
        // not a real shortener domain — confirm and remove.
        "DFHGDH",
    ];

    for url_match in url_pattern.find_iter(content) {
        let url = url_match.as_str();
        for domain in &suspicious_domains {
            if url.contains(domain) {
                match security_level {
                    CodeSecurityLevel::Strict => {
                        return Err(anyhow!(
                            "Code rejected: Contains suspicious URL shortener: {}",
                            url
                        ));
                    }
                    CodeSecurityLevel::Moderate => {
                        log::warn!("Suspicious URL shortener found in code: {}", url);
                    }
                    CodeSecurityLevel::Permissive => {
                        log::debug!("URL check bypassed (permissive mode): {}", url);
                    }
                }
            }
        }
    }

    Ok(())
}
1210
1211fn check_prompt_injection(
1213 content: &str,
1214 security_level: &crate::rag::CodeSecurityLevel,
1215) -> Result<()> {
1216 use crate::rag::CodeSecurityLevel;
1217 let injection_patterns = [
1218 "ignore previous instructions",
1219 "disregard all prior",
1220 "forget everything above",
1221 "new instructions:",
1222 "SYSTEM PROMPT:",
1223 "###SYSTEM###",
1224 "</system>",
1225 "<|im_start|>",
1226 "<|im_end|>",
1227 ];
1228
1229 let content_lower = content.to_lowercase();
1230 for pattern in &injection_patterns {
1231 if content_lower.contains(pattern) {
1232 match security_level {
1233 CodeSecurityLevel::Strict => {
1234 return Err(anyhow!(
1235 "Code rejected: Contains potential prompt injection pattern: {}",
1236 pattern
1237 ));
1238 }
1239 CodeSecurityLevel::Moderate => {
1240 log::warn!("Potential prompt injection pattern detected: {}", pattern);
1241 }
1242 CodeSecurityLevel::Permissive => {
1243 log::debug!("Prompt injection check bypassed (permissive mode)");
1244 }
1245 }
1246 }
1247 }
1248
1249 Ok(())
1250}
1251
/// Masks credential-like strings in source code (delegates to
/// [`mask_secrets`]).
fn sanitize_code_content(content: &str) -> String {
    mask_secrets(content)
}

/// Masks credential-like strings in shell scripts (delegates to
/// [`mask_secrets`]).
fn sanitize_shell_content(content: &str) -> String {
    mask_secrets(content)
}
1263
1264fn mask_secrets(content: &str) -> String {
1266 let mut result = content.to_string();
1267
1268 let secret_patterns = [
1270 (
1271 r#"(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1272 "API_KEY=[MASKED]",
1273 ),
1274 (
1275 r#"(?i)(secret|password|passwd|pwd)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1276 "SECRET=[MASKED]",
1277 ),
1278 (
1279 r#"(?i)(token|auth)\s*[:=]\s*['\"]?([^'\";\s]+)"#,
1280 "TOKEN=[MASKED]",
1281 ),
1282 (r"(?i)bearer\s+[a-zA-Z0-9\-._~+/]+", "Bearer [MASKED]"),
1283 (
1284 r"-----BEGIN (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----[\s\S]*?-----END (RSA |EC |DSA |OPENSSH |)PRIVATE KEY-----",
1285 "[PRIVATE_KEY_MASKED]",
1286 ),
1287 (r"ghp_[a-zA-Z0-9]{36}", "ghp_[GITHUB_TOKEN_MASKED]"),
1288 (r"sk-[a-zA-Z0-9]{48}", "sk-[OPENAI_KEY_MASKED]"),
1289 ];
1290
1291 for (pattern, replacement) in &secret_patterns {
1292 if let Ok(re) = regex::Regex::new(pattern) {
1293 result = re.replace_all(&result, *replacement).to_string();
1294 }
1295 }
1296
1297 result
1298}
1299
/// Rewrites every `KEY=value` line of a dotenv file as `KEY=[MASKED]`.
///
/// Blank lines, `#` comment lines, and lines without an `=` pass through
/// unchanged; every processed line is emitted with a trailing newline.
fn mask_env_secrets(content: &str) -> String {
    let mut masked = String::new();

    for line in content.lines() {
        let trimmed = line.trim();
        let passthrough = trimmed.is_empty() || trimmed.starts_with('#');

        match (passthrough, line.find('=')) {
            (false, Some(eq_pos)) => {
                // Keep everything before the first '=' (including any
                // surrounding whitespace), mask the value.
                masked.push_str(&line[..eq_pos]);
                masked.push_str("=[MASKED]\n");
            }
            _ => {
                masked.push_str(line);
                masked.push('\n');
            }
        }
    }

    masked
}
1324
1325fn validate_pdf_security(path: &Path) -> Result<()> {
1327 log::debug!("Running security validation on PDF: {:?}", path);
1328
1329 const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; let metadata = fs::metadata(path)?;
1332 if metadata.len() > MAX_PDF_SIZE {
1333 return Err(anyhow!(
1334 "PDF file rejected: Size {} bytes exceeds maximum allowed size of {} bytes ({}MB)",
1335 metadata.len(),
1336 MAX_PDF_SIZE,
1337 MAX_PDF_SIZE / (1024 * 1024)
1338 ));
1339 }
1340
1341 let mut buffer = vec![0u8; 1024];
1343 let file = fs::File::open(path)?;
1344 use std::io::Read;
1345 let mut reader = std::io::BufReader::new(file);
1346 let bytes_read = reader.read(&mut buffer)?;
1347
1348 if bytes_read < 8 {
1349 return Err(anyhow!("PDF file rejected: File too small or corrupted"));
1350 }
1351
1352 if !buffer.starts_with(b"%PDF-") {
1354 return Err(anyhow!(
1355 "PDF file rejected: Invalid PDF header - not a valid PDF file"
1356 ));
1357 }
1358
1359 if bytes_read >= 8 {
1361 let version_bytes = &buffer[5..8];
1362 if let Ok(version_str) = std::str::from_utf8(version_bytes) {
1363 if let Some(major_char) = version_str.chars().next() {
1365 if let Some(major) = major_char.to_digit(10) {
1366 if !(1..=2).contains(&major) {
1367 return Err(anyhow!(
1369 "PDF file rejected: Unsupported PDF version {}",
1370 version_str
1371 ));
1372 }
1373 }
1374 }
1375 }
1376 }
1377
1378 let content = std::str::from_utf8(&buffer[..bytes_read]).unwrap_or("");
1380
1381 let dangerous_patterns = [
1383 "/JavaScript",
1384 "/JS",
1385 "/OpenAction",
1386 "/AA", "/Launch",
1388 "/GoToE", "/GoToR", "/ImportData",
1391 "/SubmitForm",
1392 "/URI",
1393 "/Sound",
1394 "/Movie",
1395 "/RichMedia",
1396 "/3D",
1397 "/Encrypt",
1398 "eval(",
1399 "unescape(",
1400 "String.fromCharCode(",
1401 "document.write(",
1402 "this.print(",
1403 "app.alert(",
1404 "xfa.host",
1405 "soap.connect",
1406 "util.printf",
1407 ];
1408
1409 for pattern in &dangerous_patterns {
1410 if content.contains(pattern) {
1411 log::warn!(
1412 "PDF security violation: Found suspicious pattern '{}' in {}",
1413 pattern,
1414 path.display()
1415 );
1416 return Err(anyhow!(
1417 "PDF file rejected: Contains potentially malicious content pattern '{}'. PDF may contain embedded JavaScript or other dangerous elements.",
1418 pattern
1419 ));
1420 }
1421 }
1422
1423 let embed_patterns = ["/EmbeddedFile", "/F ", "/UF ", "/Filespec"];
1425 for pattern in &embed_patterns {
1426 if content.contains(pattern) {
1427 log::warn!(
1428 "PDF security violation: Found embedded file pattern '{}' in {}",
1429 pattern,
1430 path.display()
1431 );
1432 return Err(anyhow!(
1433 "PDF file rejected: Contains embedded files which pose security risks"
1434 ));
1435 }
1436 }
1437
1438 let form_patterns = ["/XFA", "/AcroForm", "/Fields"];
1440 for pattern in &form_patterns {
1441 if content.contains(pattern) {
1442 log::warn!(
1443 "PDF security warning: Found form pattern '{}' in {}",
1444 pattern,
1445 path.display()
1446 );
1447 }
1449 }
1450
1451 log::info!("PDF security validation passed for: {:?}", path);
1452 Ok(())
1453}
1454
1455fn create_doc_metadata(path: &Path, doc_type: &str) -> Result<String> {
1457 let file_name = path
1458 .file_stem()
1459 .and_then(|name| name.to_str())
1460 .unwrap_or("unknown");
1461
1462 let file_size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1464
1465 let mut content = String::new();
1467 content.push_str(&format!("{}: {}\n", doc_type, file_name));
1468 content.push_str(&format!("File size: {} bytes\n", file_size));
1469 content.push_str(&format!("Location: {}\n", path.display()));
1470
1471 let searchable_terms: Vec<&str> = file_name
1473 .split(|c: char| !c.is_alphanumeric())
1474 .filter(|term| term.len() > 2)
1475 .collect();
1476
1477 if !searchable_terms.is_empty() {
1478 content.push_str("Keywords: ");
1479 content.push_str(&searchable_terms.join(", "));
1480 content.push('\n');
1481 }
1482
1483 if let Ok(modified) = fs::metadata(path).and_then(|m| m.modified()) {
1485 if let Ok(duration) = modified.duration_since(std::time::SystemTime::UNIX_EPOCH) {
1486 let datetime = chrono::DateTime::from_timestamp(duration.as_secs() as i64, 0)
1487 .unwrap_or_else(chrono::Utc::now);
1488 content.push_str(&format!("Modified: {}\n", datetime.format("%Y-%m-%d")));
1489 }
1490 }
1491
1492 if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
1494 content.push_str(&format!("Format: {} document\n", extension.to_uppercase()));
1495 }
1496
1497 Ok(content)
1498}
1499
1500fn extract_metadata(path: &Path) -> Result<DocumentMetadata> {
1502 let metadata = fs::metadata(path)?;
1503
1504 let file_type = path
1505 .extension()
1506 .and_then(|ext| ext.to_str())
1507 .unwrap_or("unknown")
1508 .to_lowercase();
1509
1510 let modified = metadata
1511 .modified()?
1512 .duration_since(std::time::UNIX_EPOCH)?
1513 .as_secs();
1514
1515 let modified_datetime = DateTime::from_timestamp(modified as i64, 0).unwrap_or_else(Utc::now);
1516
1517 let tags = extract_tags_from_path(path);
1519
1520 let language = detect_language(path);
1522
1523 Ok(DocumentMetadata {
1524 file_type,
1525 size: metadata.len(),
1526 modified: modified_datetime,
1527 tags,
1528 language,
1529 })
1530}
1531
/// Derives search tags for a document from its filesystem path.
///
/// Every non-hidden directory component becomes a lowercased tag, and a
/// few well-known document keywords (readme, api, guide, tutorial) are
/// added when the file stem contains them. Keyword matching is
/// case-insensitive, so `README.md` is tagged the same as `readme.md`.
fn extract_tags_from_path(path: &Path) -> Vec<String> {
    let mut tags = Vec::new();

    // Directory components carry topical context (e.g. docs/api/...).
    if let Some(parent) = path.parent() {
        for component in parent.components() {
            if let Some(name) = component.as_os_str().to_str() {
                // Skip hidden directories and the root separator.
                if !name.starts_with('.') && name != "/" {
                    tags.push(name.to_lowercase());
                }
            }
        }
    }

    // Well-known document-kind keywords found in the file name.
    if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
        // Lowercase once so the match is case-insensitive — the previous
        // `filename.contains("readme")` missed common spellings like
        // README or Guide while directory tags were already lowercased.
        let lowered = filename.to_lowercase();
        for keyword in &["readme", "api", "guide", "tutorial"] {
            if lowered.contains(*keyword) {
                tags.push((*keyword).to_string());
            }
        }
    }

    tags
}
1566
/// Detects the document language.
///
/// Currently a stub that reports English for every document; `_path` is
/// reserved for future extension- or content-based detection.
fn detect_language(_path: &Path) -> Option<String> {
    // TODO: real detection; assume English for now.
    Some(String::from("en"))
}
1573
/// Extracts a title and section headings from a document.
///
/// For Markdown files (`.md`), the first `# ` heading becomes the title
/// and every `## `/`### ` heading is collected as a section. When no
/// title is detected (non-Markdown or no `# ` heading), the file stem
/// with `_`/`-` replaced by spaces is used as a fallback title.
///
/// Returns `(title, sections)`.
fn detect_structure(content: &str, path: &Path) -> (Option<String>, Vec<String>) {
    let mut title = None;
    let mut sections = Vec::new();

    // Heading syntax is only meaningful in Markdown.
    if path.extension().and_then(|s| s.to_str()) == Some("md") {
        // Iterate lines lazily — no need to collect them into a Vec first.
        for line in content.lines() {
            let trimmed = line.trim();

            // The first level-1 heading wins as the document title.
            // (Use strip_prefix like the section handling below, instead
            // of the previous manual `trimmed[2..]` slice.)
            if title.is_none() {
                if let Some(rest) = trimmed.strip_prefix("# ") {
                    title = Some(rest.trim().to_string());
                }
            }

            // Level-2 and level-3 headings become sections.
            if let Some(stripped) = trimmed.strip_prefix("## ") {
                sections.push(stripped.trim().to_string());
            } else if let Some(stripped) = trimmed.strip_prefix("### ") {
                sections.push(stripped.trim().to_string());
            }
        }
    }

    // Fall back to a humanized file stem when no title was found.
    if title.is_none() {
        if let Some(filename) = path.file_stem().and_then(|s| s.to_str()) {
            title = Some(filename.replace(['_', '-'], " "));
        }
    }

    (title, sections)
}
1608
/// Returns the first section heading that appears verbatim inside the
/// chunk text, or `None` when no section matches.
fn find_section_for_chunk(chunk: &str, sections: &[String]) -> Option<String> {
    sections
        .iter()
        .find(|section| chunk.contains(section.as_str()))
        .cloned()
}
1619
1620fn chunk_content(content: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
1622 let word_chunk_size = (chunk_size as f32 * 0.75) as usize;
1624 let word_overlap = (overlap as f32 * 0.75) as usize;
1625
1626 crate::rag::embeddings::preprocessing::chunk_text(content, word_chunk_size, word_overlap)
1628}
1629
/// Extracts the host portion (including any port) from a URL string.
///
/// Accepts anything of the form `scheme://host[/path][?query][#frag]`
/// and returns the host; returns `None` when no `://` separator is
/// present or the host is empty. Userinfo (`user@host`) and the port
/// are not stripped.
fn extract_domain_from_url(url: &str) -> Option<String> {
    let (_scheme, after_protocol) = url.split_once("://")?;

    // The authority ends at the first path, query, or fragment
    // delimiter. The previous version only handled '/', so URLs like
    // "https://host?q=1" leaked the query string into the domain.
    let end = after_protocol
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_protocol.len());
    let host = &after_protocol[..end];

    if host.is_empty() {
        None // e.g. bare "scheme://" — no meaningful domain
    } else {
        Some(host.to_string())
    }
}
1644
1645fn clean_html_to_text(html: &str) -> String {
1647 use regex::Regex;
1648 let re_script = Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap();
1650 let re_style = Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap();
1651 let stripped = re_script.replace_all(html, "");
1652 let without_code = re_style.replace_all(&stripped, "");
1653
1654 let block_tags = [
1656 "</p>",
1657 "</div>",
1658 "</section>",
1659 "</article>",
1660 "</li>",
1661 "</ul>",
1662 "</ol>",
1663 "<br>",
1664 "<br/>",
1665 "<br />",
1666 ];
1667 let mut structured = without_code.to_string();
1668 for tag in &block_tags {
1669 structured = structured.replace(tag, "\n");
1670 }
1671
1672 let re_tags = Regex::new(r"<[^>]+>").unwrap();
1674 let no_tags = re_tags.replace_all(&structured, "");
1675
1676 let decoded = no_tags
1678 .replace("&", "&")
1679 .replace("<", "<")
1680 .replace(">", ">")
1681 .replace(""", "\"")
1682 .replace("'", "'")
1683 .replace(" ", " ");
1684
1685 let re_ws = regex::Regex::new(r"\s+").unwrap();
1687 re_ws.replace_all(&decoded, " ").trim().to_string()
1688}
1689
1690fn extract_html_title(html: &str) -> Option<String> {
1692 use regex::Regex;
1693 let re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").ok()?;
1694 let caps = re.captures(html)?;
1695 let title = caps.get(1)?.as_str();
1696 let cleaned = clean_html_to_text(title);
1697 if cleaned.is_empty() {
1698 None
1699 } else {
1700 Some(cleaned)
1701 }
1702}
1703
1704fn extract_h1(html: &str) -> Option<String> {
1706 use regex::Regex;
1707 let re = Regex::new(r"(?is)<h1[^>]*>(.*?)</h1>").ok()?;
1708 let caps = re.captures(html)?;
1709 let h1 = caps.get(1)?.as_str();
1710 let cleaned = clean_html_to_text(h1);
1711 if cleaned.is_empty() {
1712 None
1713 } else {
1714 Some(cleaned)
1715 }
1716}
1717
#[cfg(test)]
mod tests {
    use super::*;

    // Extension whitelist: markdown, text, PDF and Rust source files
    // are indexable; unknown or missing extensions are rejected.
    #[test]
    fn test_is_supported_file() {
        assert!(is_supported_file(Path::new("test.md")));
        assert!(is_supported_file(Path::new("test.txt")));
        assert!(is_supported_file(Path::new("test.pdf")));
        assert!(is_supported_file(Path::new("test.rs")));
        assert!(!is_supported_file(Path::new("test.unknown")));
        assert!(!is_supported_file(Path::new("test")));
    }

    // Markdown structure detection: the first `# ` heading becomes the
    // title; every `## ` and `### ` heading is collected as a section.
    #[test]
    fn test_detect_structure() {
        let content = r#"# Main Title

Some introduction text.

## Section 1

Content for section 1.

## Section 2

Content for section 2.

### Subsection 2.1

More content.
"#;

        let path = Path::new("test.md");
        let (title, sections) = detect_structure(content, path);

        assert_eq!(title, Some("Main Title".to_string()));
        assert_eq!(sections.len(), 3);
        assert!(sections.contains(&"Section 1".to_string()));
        assert!(sections.contains(&"Section 2".to_string()));
        assert!(sections.contains(&"Subsection 2.1".to_string()));
    }

    // Path-derived tags: each directory component becomes a tag, and
    // recognized keywords in the file stem (here "readme") are added.
    #[test]
    fn test_extract_tags_from_path() {
        let path = Path::new("/docs/api/authentication/readme.md");
        let tags = extract_tags_from_path(path);

        assert!(tags.contains(&"docs".to_string()));
        assert!(tags.contains(&"api".to_string()));
        assert!(tags.contains(&"authentication".to_string()));
        assert!(tags.contains(&"readme".to_string()));
    }

    // Chunking: a longer text must yield multiple non-empty chunks, and
    // consecutive chunks must share words (the configured overlap).
    #[test]
    fn test_chunk_content() {
        let content = "This is a test document with multiple sentences. Each sentence should be preserved in the chunking process. We want to make sure the chunks are reasonable.";
        let chunks = chunk_content(content, 10, 2);
        assert!(chunks.len() > 1);
        assert!(!chunks[0].is_empty());

        if chunks.len() > 1 {
            let words1: Vec<&str> = chunks[0].split_whitespace().collect();
            let words2: Vec<&str> = chunks[1].split_whitespace().collect();

            // Any of the last 5 words of chunk 0 appearing among the
            // first 5 words of chunk 1 confirms the overlap region.
            let overlap_found = words1
                .iter()
                .rev()
                .take(5)
                .any(|word| words2.iter().take(5).any(|w| w == word));
            assert!(overlap_found);
        }
    }
}