// omnivore-cli 0.2.0
//
// Universal web scraper and code extractor CLI: crawl websites, analyze repositories, build knowledge graphs.
// Documentation
use anyhow::{Context, Result};
use globset::{Glob, GlobSet, GlobSetBuilder};
use ignore::gitignore::{Gitignore, GitignoreBuilder};
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use walkdir::{DirEntry, WalkDir};

/// A file that passed every check in [`FileFilter::filter_files`].
#[derive(Debug, Clone)]
pub struct FilteredFile {
    /// Path of the file exactly as yielded by the directory walk over the
    /// filter's root path.
    pub path: PathBuf,
    /// `path` with the filter's root prefix stripped; when the prefix cannot
    /// be stripped this is the same as `path`.
    pub relative_path: PathBuf,
    /// Length of the file in bytes, read from the walk entry's metadata.
    // NOTE(review): the lint suppression suggests no caller in this crate
    // reads `size` yet — confirm it is still needed.
    #[allow(dead_code)]
    pub size: u64,
}

/// Walks a directory tree and selects the files that pass a configurable set
/// of filters (built-in excludes, `.gitignore` rules, include/exclude globs,
/// a size limit and a binary-file heuristic).
pub struct FileFilter {
    /// Directory (or file) the walk starts from; relative paths are computed
    /// against it.
    root_path: PathBuf,
    /// When `Some`, a file is kept only if its relative path matches the set.
    include_patterns: Option<GlobSet>,
    /// When `Some`, a file is dropped if its relative path matches the set.
    exclude_patterns: Option<GlobSet>,
    /// Whether `.gitignore` files found under the root are honoured
    /// (enabled by `new`, disabled by `ignore_gitignore`).
    use_gitignore: bool,
    /// Whether files that `is_likely_binary` flags are dropped
    /// (off by default, enabled by `exclude_binary_files`).
    exclude_binary: bool,
    /// When `Some`, files whose length in bytes exceeds this value are dropped.
    max_file_size: Option<u64>,
    /// Built-in exclude globs produced by `build_default_excludes`; they are
    /// always applied.
    default_excludes: GlobSet,
}

impl FileFilter {
    pub fn new(root_path: PathBuf) -> Self {
        let default_excludes = build_default_excludes();
        
        Self {
            root_path,
            include_patterns: None,
            exclude_patterns: None,
            use_gitignore: true,
            exclude_binary: false,
            max_file_size: None,
            default_excludes,
        }
    }

    pub fn ignore_gitignore(&mut self) {
        self.use_gitignore = false;
    }

    pub fn set_include_patterns(&mut self, patterns: Vec<String>) -> Result<()> {
        let mut builder = GlobSetBuilder::new();
        for pattern in patterns {
            let glob = Glob::new(&pattern)
                .with_context(|| format!("Invalid include pattern: {}", pattern))?;
            builder.add(glob);
        }
        self.include_patterns = Some(builder.build()?);
        Ok(())
    }

    pub fn set_exclude_patterns(&mut self, patterns: Vec<String>) -> Result<()> {
        let mut builder = GlobSetBuilder::new();
        for pattern in patterns {
            let glob = Glob::new(&pattern)
                .with_context(|| format!("Invalid exclude pattern: {}", pattern))?;
            builder.add(glob);
        }
        self.exclude_patterns = Some(builder.build()?);
        Ok(())
    }

    pub fn exclude_binary_files(&mut self) {
        self.exclude_binary = true;
    }

    pub fn set_max_file_size(&mut self, max_size: u64) {
        self.max_file_size = Some(max_size);
    }

    pub fn filter_files(&self) -> Result<Vec<FilteredFile>> {
        let mut filtered_files = Vec::new();
        let gitignore = if self.use_gitignore {
            Some(self.build_gitignore()?)
        } else {
            None
        };

        let walker = WalkDir::new(&self.root_path)
            .follow_links(false)
            .into_iter()
            .filter_entry(|e| self.should_traverse_dir(e, &gitignore));

        for entry in walker {
            let entry = entry?;
            
            if !entry.file_type().is_file() {
                continue;
            }

            let path = entry.path();
            let relative_path = path
                .strip_prefix(&self.root_path)
                .unwrap_or(path)
                .to_path_buf();

            if !self.should_include_file(&relative_path, &entry, &gitignore)? {
                continue;
            }

            let metadata = entry.metadata()?;
            filtered_files.push(FilteredFile {
                path: path.to_path_buf(),
                relative_path,
                size: metadata.len(),
            });
        }

        Ok(filtered_files)
    }

    fn build_gitignore(&self) -> Result<Gitignore> {
        let mut builder = GitignoreBuilder::new(&self.root_path);
        
        for entry in WalkDir::new(&self.root_path)
            .follow_links(false)
            .into_iter()
            .filter_map(|e| e.ok())
        {
            if entry.file_name() == ".gitignore" {
                let gitignore_path = entry.path();
                let _parent = gitignore_path.parent().unwrap_or(&self.root_path);
                builder.add(gitignore_path);
            }
        }
        
        Ok(builder.build()?)
    }

    fn should_traverse_dir(&self, entry: &DirEntry, gitignore: &Option<Gitignore>) -> bool {
        let path = entry.path();
        let relative_path = path
            .strip_prefix(&self.root_path)
            .unwrap_or(path);

        if self.default_excludes.is_match(relative_path) {
            return false;
        }

        if let Some(ref gi) = gitignore {
            if gi.matched(relative_path, entry.file_type().is_dir()).is_ignore() {
                return false;
            }
        }

        true
    }

    fn should_include_file(
        &self,
        relative_path: &Path,
        entry: &DirEntry,
        gitignore: &Option<Gitignore>,
    ) -> Result<bool> {
        if self.default_excludes.is_match(relative_path) {
            return Ok(false);
        }

        if let Some(ref gi) = gitignore {
            if gi.matched(relative_path, false).is_ignore() {
                return Ok(false);
            }
        }

        if let Some(ref exclude) = self.exclude_patterns {
            if exclude.is_match(relative_path) {
                return Ok(false);
            }
        }

        if let Some(ref include) = self.include_patterns {
            if !include.is_match(relative_path) {
                return Ok(false);
            }
        }

        let metadata = entry.metadata()?;
        if let Some(max_size) = self.max_file_size {
            if metadata.len() > max_size {
                return Ok(false);
            }
        }

        if self.exclude_binary && is_likely_binary(entry.path())? {
            return Ok(false);
        }

        Ok(true)
    }
}

/// Builds the glob set of paths that are always excluded: version-control
/// metadata, dependency and build output directories, Python caches, editor
/// and OS artefacts.
///
/// # Panics
///
/// Panics if the glob set cannot be built from the patterns below.
fn build_default_excludes() -> GlobSet {
    const DEFAULT_EXCLUDE_GLOBS: &[&str] = &[
        // Version-control metadata, at the root and at any depth.
        ".git/**",
        ".svn/**",
        ".hg/**",
        ".bzr/**",
        "**/.git/**",
        "**/.svn/**",
        "**/.hg/**",
        "**/.bzr/**",
        // Dependency and build output directories.
        "**/node_modules/**",
        "**/target/**",
        "**/dist/**",
        "**/build/**",
        // OS artefacts.
        "**/.DS_Store",
        "**/Thumbs.db",
        // Python byte code, caches and tooling state.
        "**/*.pyc",
        "**/__pycache__/**",
        "**/.pytest_cache/**",
        "**/.mypy_cache/**",
        "**/.tox/**",
        "**/.coverage",
        // Editor state and backup files.
        "**/.idea/**",
        "**/.vscode/**",
        "**/*.swp",
        "**/*.swo",
        "**/*~",
    ];

    let mut set_builder = GlobSetBuilder::new();
    // A pattern that fails to parse is skipped rather than reported.
    DEFAULT_EXCLUDE_GLOBS
        .iter()
        .copied()
        .filter_map(|raw| Glob::new(raw).ok())
        .for_each(|glob| {
            set_builder.add(glob);
        });

    set_builder
        .build()
        .expect("Failed to build default excludes")
}

/// Heuristically decides whether `path` refers to a binary (non-text) file.
///
/// The checks run in this order:
/// 1. the extension, compared case-insensitively against a fixed list;
/// 2. a `.min.js` / `.min.css` file-name suffix (minified bundles are grouped
///    with binaries);
/// 3. the first 8192 bytes of the file: any NUL byte marks it as binary.
///
/// Checks 1 and 2 never touch the file system. A file that cannot be opened
/// or read is reported as not binary, so this function currently never
/// returns `Err`.
fn is_likely_binary(path: &Path) -> Result<bool> {
    /// Lower-case extensions (without the dot) that are treated as binary.
    const BINARY_EXTENSIONS: &[&str] = &[
        "exe", "dll", "so", "dylib", "a", "lib", "o", "obj",
        "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp",
        "mp3", "mp4", "avi", "mov", "wmv", "flv", "webm", "m4a", "wav",
        "zip", "tar", "gz", "bz2", "xz", "7z", "rar",
        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        "ttf", "otf", "woff", "woff2", "eot",
        "db", "sqlite", "sqlite3",
        "jar", "war", "ear",
        "pyc", "pyo", "class",
    ];
    /// Compound suffixes; `Path::extension` only yields the part after the
    /// last dot, so these have to be matched against the whole file name.
    const MINIFIED_SUFFIXES: &[&str] = &[".min.js", ".min.css"];
    /// Number of leading bytes inspected for NUL bytes.
    const SAMPLE_LEN: u64 = 8192;

    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase);
    if let Some(ext) = extension.as_deref() {
        if BINARY_EXTENSIONS.contains(&ext) {
            return Ok(true);
        }
    }

    let file_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_ascii_lowercase);
    if let Some(name) = file_name.as_deref() {
        if MINIFIED_SUFFIXES.iter().any(|suffix| name.ends_with(*suffix)) {
            return Ok(true);
        }
    }

    // Read only the sample window instead of loading the whole file.
    let file = match fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return Ok(false),
    };
    let mut limited = std::io::Read::take(file, SAMPLE_LEN);
    let mut sample = Vec::new();
    if std::io::Read::read_to_end(&mut limited, &mut sample).is_err() {
        return Ok(false);
    }

    Ok(sample.contains(&0))
}