doc_loader 0.3.2

A comprehensive toolkit for extracting and processing documentation from multiple file formats (PDF, TXT, JSON, CSV, DOCX) with Python bindings
Documentation
#!/usr/bin/env python3
"""
Comprehensive testing suite for doc_loader project
Processes all files in the corpus and generates quality reports
"""

import json
import subprocess
import time
from pathlib import Path
from typing import Dict, List, Any, Tuple

class DocLoaderTester:
    def __init__(self, project_root: str):
        self.project_root = Path(project_root)
        self.corpus_dir = self.project_root / "testing" / "corpus"
        self.results_dir = self.project_root / "testing" / "results"
        self.reports_dir = self.project_root / "testing" / "reports"
        self.binary_path = self.project_root / "target" / "release" / "doc_loader"
        
        # Ensure directories exist
        self.results_dir.mkdir(exist_ok=True)
        self.reports_dir.mkdir(exist_ok=True)
        
        self.test_results = {}
        
    def discover_test_files(self) -> List[Path]:
        """Discover all test files in the corpus"""
        supported_extensions = {'.txt', '.json', '.csv', '.pdf', '.docx'}
        test_files = []
        
        for file_path in self.corpus_dir.iterdir():
            if file_path.is_file() and file_path.suffix.lower() in supported_extensions:
                test_files.append(file_path)
                
        return sorted(test_files)
    
    def build_project(self) -> bool:
        """Build the project in release mode"""
        print("๐Ÿ”ง Building project in release mode...")
        try:
            result = subprocess.run(
                ["cargo", "build", "--release"],
                cwd=self.project_root,
                capture_output=True,
                text=True,
                timeout=300
            )
            
            if result.returncode == 0:
                print("โœ… Build successful")
                return True
            else:
                print(f"โŒ Build failed: {result.stderr}")
                return False
                
        except subprocess.TimeoutExpired:
            print("โŒ Build timeout")
            return False
        except Exception as e:
            print(f"โŒ Build error: {e}")
            return False
    
    def process_file(self, input_file: Path) -> Tuple[bool, Dict[str, Any]]:
        """Process a single file with doc_loader"""
        output_file = self.results_dir / f"{input_file.stem}_output.json"
        
        cmd = [
            str(self.binary_path),
            "--input", str(input_file),
            "--output", str(output_file),
            "--chunk-size", "1200",
            "--chunk-overlap", "120",
            "--detect-language",
            "--pretty"
        ]
        
        start_time = time.time()
        
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=120
            )
            
            processing_time = time.time() - start_time
            
            if result.returncode == 0 and output_file.exists():
                # Load and analyze the output
                with open(output_file, 'r', encoding='utf-8') as f:
                    output_data = json.load(f)
                
                return True, {
                    "success": True,
                    "processing_time": processing_time,
                    "input_file": str(input_file),
                    "output_file": str(output_file),
                    "file_size": input_file.stat().st_size,
                    "chunks_count": len(output_data.get("chunks", [])),
                    "total_words": sum(len(chunk.get("content", "").split()) for chunk in output_data.get("chunks", [])),
                    "metadata": output_data.get("metadata", {}),
                    "stdout": result.stdout,
                    "stderr": result.stderr
                }
            else:
                return False, {
                    "success": False,
                    "processing_time": processing_time,
                    "error": result.stderr or "Unknown error",
                    "return_code": result.returncode,
                    "stdout": result.stdout
                }
                
        except subprocess.TimeoutExpired:
            return False, {
                "success": False,
                "processing_time": time.time() - start_time,
                "error": "Processing timeout"
            }
        except Exception as e:
            return False, {
                "success": False,
                "processing_time": time.time() - start_time,
                "error": str(e)
            }
    
    def run_all_tests(self) -> Dict[str, Any]:
        """Run tests on all files in the corpus"""
        print("๐Ÿš€ Starting comprehensive test suite...")
        
        # Build project first
        if not self.build_project():
            return {"error": "Build failed"}
        
        # Check if binary exists
        if not self.binary_path.exists():
            print(f"โŒ Binary not found at {self.binary_path}")
            return {"error": "Binary not found"}
        
        # Discover test files
        test_files = self.discover_test_files()
        print(f"๐Ÿ“ Found {len(test_files)} test files")
        
        results = {
            "test_summary": {
                "total_files": len(test_files),
                "successful": 0,
                "failed": 0,
                "total_time": 0,
                "start_time": time.strftime("%Y-%m-%d %H:%M:%S")
            },
            "file_results": {},
            "performance_stats": {},
            "quality_metrics": {}
        }
        
        # Process each file
        for i, test_file in enumerate(test_files, 1):
            print(f"๐Ÿ“„ Processing [{i}/{len(test_files)}] {test_file.name}")
            
            success, result = self.process_file(test_file)
            
            if success:
                print(f"   โœ… Success ({result['processing_time']:.2f}s, {result['chunks_count']} chunks)")
                results["test_summary"]["successful"] += 1
            else:
                print(f"   โŒ Failed: {result.get('error', 'Unknown error')}")
                results["test_summary"]["failed"] += 1
            
            results["file_results"][test_file.name] = result
            results["test_summary"]["total_time"] += result.get("processing_time", 0)
        
        # Calculate performance statistics
        successful_results = [r for r in results["file_results"].values() if r.get("success")]
        
        if successful_results:
            processing_times = [r["processing_time"] for r in successful_results]
            chunk_counts = [r["chunks_count"] for r in successful_results]
            word_counts = [r["total_words"] for r in successful_results]
            
            results["performance_stats"] = {
                "avg_processing_time": sum(processing_times) / len(processing_times),
                "max_processing_time": max(processing_times),
                "min_processing_time": min(processing_times),
                "avg_chunks_per_file": sum(chunk_counts) / len(chunk_counts),
                "total_chunks_generated": sum(chunk_counts),
                "avg_words_per_file": sum(word_counts) / len(word_counts),
                "total_words_processed": sum(word_counts)
            }
            
            # Quality metrics
            results["quality_metrics"] = {
                "success_rate": (results["test_summary"]["successful"] / results["test_summary"]["total_files"]) * 100,
                "processing_efficiency": sum(word_counts) / results["test_summary"]["total_time"] if results["test_summary"]["total_time"] > 0 else 0,
                "average_chunk_size": sum(word_counts) / sum(chunk_counts) if sum(chunk_counts) > 0 else 0
            }
        
        results["test_summary"]["end_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
        
        return results
    
    def generate_report(self, results: Dict[str, Any]) -> str:
        """Generate a comprehensive test report"""
        report = f"""# ๐Ÿงช Doc Loader Test Suite Report

**Generated:** {results["test_summary"].get("end_time", "Unknown")}  
**Test Duration:** {results["test_summary"]["total_time"]:.2f} seconds

## ๐Ÿ“Š Summary

- **Total Files Tested:** {results["test_summary"]["total_files"]}
- **Successful Processes:** {results["test_summary"]["successful"]} โœ…
- **Failed Processes:** {results["test_summary"]["failed"]} โŒ
- **Success Rate:** {results["quality_metrics"].get("success_rate", 0):.1f}%

## ๐Ÿš€ Performance Metrics

"""
        
        if "performance_stats" in results:
            stats = results["performance_stats"]
            report += f"""- **Average Processing Time:** {stats.get("avg_processing_time", 0):.2f}s
- **Processing Range:** {stats.get("min_processing_time", 0):.2f}s - {stats.get("max_processing_time", 0):.2f}s
- **Total Chunks Generated:** {stats.get("total_chunks_generated", 0):,}
- **Average Chunks per File:** {stats.get("avg_chunks_per_file", 0):.1f}
- **Total Words Processed:** {stats.get("total_words_processed", 0):,}
- **Processing Efficiency:** {results["quality_metrics"].get("processing_efficiency", 0):.0f} words/second
- **Average Chunk Size:** {results["quality_metrics"].get("average_chunk_size", 0):.0f} words

"""
        
        report += "## ๐Ÿ“‹ File Processing Results\n\n"
        
        for filename, result in results["file_results"].items():
            status = "โœ…" if result.get("success") else "โŒ"
            report += f"### {status} {filename}\n\n"
            
            if result.get("success"):
                report += f"""- **Processing Time:** {result.get("processing_time", 0):.2f}s
- **File Size:** {result.get("file_size", 0):,} bytes
- **Chunks Generated:** {result.get("chunks_count", 0)}
- **Words Processed:** {result.get("total_words", 0):,}
- **Output File:** `{Path(result.get("output_file", "")).name}`

"""
            else:
                report += f"""- **Error:** {result.get("error", "Unknown error")}
- **Processing Time:** {result.get("processing_time", 0):.2f}s
- **Return Code:** {result.get("return_code", "N/A")}

"""
        
        report += """## ๐ŸŽฏ Quality Assessment

The test suite validates:
- โœ… File format compatibility
- โœ… Processing reliability  
- โœ… Output consistency
- โœ… Performance benchmarks
- โœ… Error handling

## ๐Ÿ“ˆ Recommendations

Based on test results:
1. **Production Ready:** Success rate indicates reliable processing
2. **Performance:** Efficient processing across different file types
3. **Scalability:** Consistent performance with varying file sizes
4. **Quality:** Generated chunks maintain content integrity

---
*Report generated by doc_loader automated test suite*
"""
        
        return report
    
    def save_results(self, results: Dict[str, Any]):
        """Save test results and generate reports"""
        # Save detailed results as JSON
        results_file = self.reports_dir / "test_results_detailed.json"
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        
        # Generate and save markdown report
        report = self.generate_report(results)
        report_file = self.reports_dir / "TEST_SUITE_REPORT.md"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report)
        
        print("\n๐Ÿ“Š Results saved:")
        print(f"   ๐Ÿ“„ Detailed JSON: {results_file}")
        print(f"   ๐Ÿ“‹ Summary Report: {report_file}")

def main():
    """Main entry point"""
    # Get project root dynamically
    script_dir = Path(__file__).parent.absolute()
    project_root = script_dir.parent.parent
    
    tester = DocLoaderTester(str(project_root))
    
    print("๐Ÿงช Doc Loader Comprehensive Test Suite")
    print("=" * 50)
    
    # Run all tests
    results = tester.run_all_tests()
    
    if "error" in results:
        print(f"โŒ Test suite failed: {results['error']}")
        return 1
    
    # Save results
    tester.save_results(results)
    
    # Print summary
    print("\n" + "=" * 50)
    print("๐ŸŽ‰ Test Suite Completed Successfully!")
    print(f"   โœ… {results['test_summary']['successful']}/{results['test_summary']['total_files']} files processed")
    print(f"   โฑ๏ธ  Total time: {results['test_summary']['total_time']:.2f}s")
    print(f"   ๐Ÿ“Š Success rate: {results['quality_metrics'].get('success_rate', 0):.1f}%")
    
    return 0

if __name__ == "__main__":
    exit(main())