//! source-map-php 0.1.3
//!
//! CLI-first PHP code search indexer for Laravel and Hyperf repositories.
mod fallback;
mod phpactor;

use std::collections::HashMap;
use std::fs;
use std::path::Path;

use anyhow::Result;
use regex::Regex;

use crate::Framework;
use crate::composer::ComposerExport;
use crate::models::{SymbolDoc, make_stable_id};
use crate::sanitizer::Sanitizer;

/// A single PHP declaration reported by an extractor (Phpactor or the
/// fallback extractor) before it is enriched into a `SymbolDoc`.
#[derive(Debug, Clone)]
pub struct DeclarationCandidate {
    /// Declaration kind as a string. `"method"` is the only value this file
    /// inspects (it decides how the fully-qualified name is built).
    pub kind: String,
    /// Short (unqualified) name of the declared symbol.
    pub name: String,
    /// Class that owns the declaration, when there is one; used for
    /// `Owner::name` style names of methods.
    pub owner_class: Option<String>,
    /// PHP namespace the declaration lives in, if any.
    pub namespace: Option<String>,
    /// Line the declaration starts on. Used as the lookup key into the
    /// comment map, which is keyed by 1-based line numbers.
    pub line_start: usize,
    /// Line the declaration ends on (presumably 1-based and inclusive like
    /// `line_start` — TODO confirm against the extractors).
    pub line_end: usize,
    /// Declaration signature text, when the extractor provides one.
    pub signature: Option<String>,
    /// Extractor-provided confidence label; copied verbatim into the symbol.
    pub extraction_confidence: String,
    /// Reference count reported by the extractor; copied verbatim into the symbol.
    pub references_count: u32,
}

/// Comment text gathered from the lines directly above one declaration.
#[derive(Debug, Clone, Default)]
struct ParsedComments {
    /// First non-empty, non-tag docblock line that survives sanitizing.
    summary: Option<String>,
    /// Remaining non-tag docblock lines, joined with single spaces.
    description: Option<String>,
    /// Text following each `@param` tag, in source order.
    params: Vec<String>,
    /// Text following the `@return` tag (the last one wins if repeated).
    return_doc: Option<String>,
    /// Text following each `@throws` tag, in source order.
    throws_docs: Vec<String>,
    /// `//` and `#` comment lines found above the declaration, in source order.
    inline_comments: Vec<String>,
}

pub fn extract_symbols(
    repo: &Path,
    repo_name: &str,
    framework: Framework,
    files: &[crate::scanner::ScannedFile],
    packages: &ComposerExport,
    sanitizer: &Sanitizer,
) -> Result<Vec<SymbolDoc>> {
    let mut phpactor = phpactor::PhpactorExtractor::connect(repo).ok();
    let mut symbols = Vec::new();

    for file in files {
        if !file.relative_path.to_string_lossy().ends_with(".php") {
            continue;
        }

        let contents = fs::read_to_string(&file.absolute_path)?;
        let declarations = if let Some(client) = phpactor.as_mut() {
            client
                .extract_candidates(&file.absolute_path, &contents)
                .unwrap_or_else(|_| fallback::extract_candidates(&contents))
        } else {
            fallback::extract_candidates(&contents)
        };
        let comment_map = collect_comments(&contents, sanitizer);
        let package = packages.package_for_path(&file.absolute_path);
        let is_test =
            file.relative_path.starts_with("tests") || file.relative_path.starts_with("test");
        let path_str = file.relative_path.to_string_lossy().into_owned();
        let abs_str = file.absolute_path.to_string_lossy().into_owned();

        for declaration in declarations {
            let fqn = build_fqn(&declaration);
            let stable_key = format!("{}|{}|{}", repo_name, declaration.kind, fqn);
            let comments = comment_map
                .get(&declaration.line_start)
                .cloned()
                .unwrap_or_default();

            symbols.push(SymbolDoc {
                id: make_stable_id(&[
                    repo_name,
                    &declaration.kind,
                    &fqn,
                    &path_str,
                    &declaration.line_start.to_string(),
                ]),
                stable_key,
                repo: repo_name.to_string(),
                framework: framework.as_str().to_string(),
                kind: declaration.kind.clone(),
                short_name: declaration.name.clone(),
                fqn,
                owner_class: declaration.owner_class.clone(),
                namespace: declaration.namespace.clone(),
                signature: declaration.signature.clone(),
                doc_summary: comments.summary.clone(),
                doc_description: comments.description.clone(),
                param_docs: comments.params.clone(),
                return_doc: comments.return_doc.clone(),
                throws_docs: comments.throws_docs.clone(),
                magic_methods: Vec::new(),
                magic_properties: Vec::new(),
                inline_rule_comments: comments.inline_comments.clone(),
                comment_keywords: keywordize(
                    comments
                        .summary
                        .iter()
                        .chain(comments.inline_comments.iter())
                        .map(String::as_str)
                        .collect::<Vec<_>>()
                        .join(" ")
                        .as_str(),
                ),
                symbol_tokens: keywordize(&declaration.name),
                framework_tags: vec![framework.as_str().to_string()],
                risk_tags: infer_risk_tags(&path_str, comments.summary.as_deref()),
                route_ids: Vec::new(),
                related_symbols: Vec::new(),
                related_tests: Vec::new(),
                related_tests_count: 0,
                validation_commands: Vec::new(),
                missing_test_warning: None,
                package_name: package.name.clone(),
                package_type: package.package_type.clone(),
                package_version: package.version.clone(),
                package_keywords: package.keywords.clone(),
                is_vendor: !package.is_root,
                is_project_code: package.is_root,
                is_test,
                autoloadable: true,
                extraction_confidence: declaration.extraction_confidence.clone(),
                references_count: declaration.references_count,
                path: path_str.clone(),
                absolute_path: abs_str.clone(),
                line_start: declaration.line_start,
                line_end: declaration.line_end,
            });
        }
    }

    Ok(symbols)
}

/// Runs the fallback extractor (the private `fallback` module) on `contents`
/// and returns the declaration candidates it finds, without involving Phpactor.
pub fn fallback_candidates(contents: &str) -> Vec<DeclarationCandidate> {
    fallback::extract_candidates(contents)
}

/// Builds the fully-qualified name of a declaration.
///
/// Methods that have an owner class are rendered as `Owner::name`; every
/// other declaration uses its bare name. When a namespace is present it is
/// prepended with a `\` separator.
fn build_fqn(declaration: &DeclarationCandidate) -> String {
    let name = declaration.name.as_str();
    // The owner class only takes part in the name for methods.
    let method_owner = declaration
        .owner_class
        .as_deref()
        .filter(|_| declaration.kind == "method");

    match (declaration.namespace.as_deref(), method_owner) {
        (Some(ns), Some(class)) => format!("{ns}\\{class}::{name}"),
        (Some(ns), None) => format!("{ns}\\{name}"),
        (None, Some(class)) => format!("{class}::{name}"),
        (None, None) => name.to_owned(),
    }
}

fn collect_comments(contents: &str, sanitizer: &Sanitizer) -> HashMap<usize, ParsedComments> {
    let mut map = HashMap::new();
    let lines: Vec<_> = contents.lines().collect();
    let decl_re = Regex::new(r"^\s*(?:final\s+|abstract\s+)?(?:class|interface|trait|enum|function|public\s+function|protected\s+function|private\s+function)").unwrap();
    let param_re = Regex::new(r"@param\s+(.+)").unwrap();
    let return_re = Regex::new(r"@return\s+(.+)").unwrap();
    let throws_re = Regex::new(r"@throws\s+(.+)").unwrap();

    for (idx, line) in lines.iter().enumerate() {
        if !decl_re.is_match(line) {
            continue;
        }
        let mut cursor = idx as isize - 1;
        let mut doc_lines = Vec::new();
        let mut inline_comments = Vec::new();
        while cursor >= 0 {
            let candidate = lines[cursor as usize].trim();
            if candidate.starts_with("//") || candidate.starts_with('#') {
                if let Some(value) =
                    sanitizer.sanitize_text(candidate.trim_start_matches(&['/', '#'][..]).trim())
                {
                    inline_comments.push(value);
                }
                cursor -= 1;
                continue;
            }
            if candidate.ends_with("*/")
                || candidate.starts_with('*')
                || candidate.starts_with("/**")
            {
                doc_lines.push(candidate.to_string());
                cursor -= 1;
                continue;
            }
            break;
        }
        doc_lines.reverse();
        inline_comments.reverse();

        let mut parsed = ParsedComments::default();
        let mut description_lines = Vec::new();
        for raw in doc_lines {
            let cleaned = raw
                .trim_start_matches("/**")
                .trim_start_matches("/*")
                .trim_start_matches('*')
                .trim_end_matches("*/")
                .trim();
            if cleaned.is_empty() {
                continue;
            }
            if let Some(param) = param_re
                .captures(cleaned)
                .and_then(|caps| caps.get(1).map(|item| item.as_str()))
                .and_then(|value| sanitizer.sanitize_text(value))
            {
                parsed.params.push(param);
                continue;
            }
            if let Some(return_doc) = return_re
                .captures(cleaned)
                .and_then(|caps| caps.get(1).map(|item| item.as_str()))
                .and_then(|value| sanitizer.sanitize_text(value))
            {
                parsed.return_doc = Some(return_doc);
                continue;
            }
            if let Some(throws_doc) = throws_re
                .captures(cleaned)
                .and_then(|caps| caps.get(1).map(|item| item.as_str()))
                .and_then(|value| sanitizer.sanitize_text(value))
            {
                parsed.throws_docs.push(throws_doc);
                continue;
            }
            if parsed.summary.is_none() {
                parsed.summary = sanitizer.sanitize_text(cleaned);
            } else if let Some(line) = sanitizer.sanitize_text(cleaned) {
                description_lines.push(line);
            }
        }
        parsed.description = if description_lines.is_empty() {
            None
        } else {
            Some(description_lines.join(" "))
        };
        parsed.inline_comments = inline_comments;

        map.insert(idx + 1, parsed);
    }

    map
}

/// Splits `text` into lowercase keyword tokens.
///
/// A token is a maximal run of ASCII letters, ASCII digits, `_` and `\`;
/// every other character acts as a separator. Tokens of two bytes or fewer
/// are discarded.
fn keywordize(text: &str) -> Vec<String> {
    let is_token_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '\\');

    let mut keywords = Vec::new();
    for piece in text.split(|c: char| !is_token_char(c)) {
        if piece.len() <= 2 {
            continue;
        }
        keywords.push(piece.to_ascii_lowercase());
    }
    keywords
}

/// Derives risk tags from a file path and an optional doc summary.
///
/// The path and summary are joined and lowercased (ASCII only), then checked
/// for a fixed list of substrings; each hit contributes its tag. The result
/// is sorted and free of duplicates.
fn infer_risk_tags(path: &str, summary: Option<&str>) -> Vec<String> {
    // (substring to look for, tag to emit)
    const RULES: [(&str, &str); 5] = [
        ("policy", "risk:access-control"),
        ("auth", "risk:access-control"),
        ("consent", "risk:patient-consent"),
        ("audit", "risk:audit-trail"),
        ("patient", "risk:patient-data"),
    ];

    let haystack = format!("{path} {}", summary.unwrap_or_default()).to_ascii_lowercase();

    let mut tags: Vec<String> = RULES
        .iter()
        .filter(|&&(needle, _)| haystack.contains(needle))
        .map(|&(_, tag)| tag.to_string())
        .collect();
    tags.sort();
    tags.dedup();
    tags
}

// End-to-end test for symbol extraction against a throwaway repository on disk.
#[cfg(test)]
mod tests {
    use std::fs;

    use tempfile::tempdir;

    use crate::Framework;
    use crate::composer::export_packages;
    use crate::config::IndexerConfig;
    use crate::sanitizer::Sanitizer;
    use crate::scanner::scan_repo;

    use super::extract_symbols;

    /// Scans a one-class repository and checks that both the class and its
    /// method are extracted, with the method carrying its docblock summary.
    #[test]
    fn extracts_symbols_with_docblocks() {
        // Minimal repository: a composer.json plus one namespaced class with
        // a documented public method.
        let dir = tempdir().unwrap();
        fs::create_dir_all(dir.path().join("app")).unwrap();
        fs::write(dir.path().join("composer.json"), r#"{"name":"acme/app"}"#).unwrap();
        fs::write(
            dir.path().join("app/ConsentService.php"),
            r#"<?php
namespace App\Services;

class ConsentService {
    /**
     * Sign consent.
     * @param string $patientId patient id
     * @return bool
     */
    public function sign(string $patientId): bool
    {
        return true;
    }
}
"#,
        )
        .unwrap();

        // Run the same scan -> package export -> extraction pipeline with
        // default configuration and sanitizer.
        let files = scan_repo(dir.path(), &IndexerConfig::default().paths).unwrap();
        let packages = export_packages(dir.path()).unwrap();
        let symbols = extract_symbols(
            dir.path(),
            "acme/app",
            Framework::Laravel,
            &files,
            &packages,
            &Sanitizer::default(),
        )
        .unwrap();

        // The class itself is reported under its namespaced name.
        assert!(
            symbols
                .iter()
                .any(|symbol| symbol.fqn == "App\\Services\\ConsentService")
        );
        // The method is reported as `Class::method` and keeps the first
        // docblock line as its summary.
        assert!(
            symbols
                .iter()
                .any(|symbol| symbol.fqn == "App\\Services\\ConsentService::sign"
                    && symbol.doc_summary.as_deref() == Some("Sign consent."))
        );
    }
}