Skip to main content

source_map_php/extract/
mod.rs

1mod fallback;
2mod phpactor;
3
4use std::collections::HashMap;
5use std::fs;
6use std::path::Path;
7
8use anyhow::Result;
9use regex::Regex;
10
11use crate::Framework;
12use crate::composer::ComposerExport;
13use crate::models::{SymbolDoc, make_stable_id};
14use crate::sanitizer::Sanitizer;
15
/// One declaration reported by an extractor (phpactor-backed or fallback) for a single PHP file.
///
/// `extract_symbols` turns each candidate into a `SymbolDoc`, copying most fields across
/// unchanged.
#[derive(Clone, Debug)]
pub struct DeclarationCandidate {
    /// Declaration kind label. `"method"` is the only value this module inspects (when building
    /// the fully qualified name); the other values are defined by the extractors.
    pub kind: String,
    /// Short, unqualified name of the declaration.
    pub name: String,
    /// Owning class, if any. Only used for the fully qualified name of `"method"` candidates.
    pub owner_class: Option<String>,
    /// Namespace used as the prefix of the fully qualified name, if any.
    pub namespace: Option<String>,
    /// Line on which the declaration starts. Used as the lookup key into the comment map, which
    /// is keyed by 1-based line numbers.
    pub line_start: usize,
    /// Line on which the declaration ends.
    pub line_end: usize,
    /// Declaration signature text, when the extractor provides one.
    pub signature: Option<String>,
    /// Extractor-supplied confidence label, copied verbatim into the resulting symbol.
    pub extraction_confidence: String,
    /// Extractor-supplied reference count, copied verbatim into the resulting symbol.
    pub references_count: u32,
}
28
/// Comment text gathered from the lines directly above one declaration.
#[derive(Clone, Debug, Default)]
struct ParsedComments {
    /// First free-text docblock line that survived sanitizing.
    summary: Option<String>,
    /// Remaining free-text docblock lines, joined with single spaces.
    description: Option<String>,
    /// Text following each `@param` tag, in source order.
    params: Vec<String>,
    /// Text following the `@return` tag (the last one wins if several are present).
    return_doc: Option<String>,
    /// Text following each `@throws` tag, in source order.
    throws_docs: Vec<String>,
    /// `//` and `#` comment lines found above the declaration, in source order.
    inline_comments: Vec<String>,
}
38
39pub fn extract_symbols(
40    repo: &Path,
41    repo_name: &str,
42    framework: Framework,
43    files: &[crate::scanner::ScannedFile],
44    packages: &ComposerExport,
45    sanitizer: &Sanitizer,
46) -> Result<Vec<SymbolDoc>> {
47    let mut phpactor = phpactor::PhpactorExtractor::connect(repo).ok();
48    let mut symbols = Vec::new();
49
50    for file in files {
51        if !file.relative_path.to_string_lossy().ends_with(".php") {
52            continue;
53        }
54
55        let contents = fs::read_to_string(&file.absolute_path)?;
56        let declarations = if let Some(client) = phpactor.as_mut() {
57            client
58                .extract_candidates(&file.absolute_path, &contents)
59                .unwrap_or_else(|_| fallback::extract_candidates(&contents))
60        } else {
61            fallback::extract_candidates(&contents)
62        };
63        let comment_map = collect_comments(&contents, sanitizer);
64        let package = packages.package_for_path(&file.absolute_path);
65        let is_test =
66            file.relative_path.starts_with("tests") || file.relative_path.starts_with("test");
67        let path_str = file.relative_path.to_string_lossy().into_owned();
68        let abs_str = file.absolute_path.to_string_lossy().into_owned();
69
70        for declaration in declarations {
71            let fqn = build_fqn(&declaration);
72            let stable_key = format!("{}|{}|{}", repo_name, declaration.kind, fqn);
73            let comments = comment_map
74                .get(&declaration.line_start)
75                .cloned()
76                .unwrap_or_default();
77
78            symbols.push(SymbolDoc {
79                id: make_stable_id(&[
80                    repo_name,
81                    &declaration.kind,
82                    &fqn,
83                    &path_str,
84                    &declaration.line_start.to_string(),
85                ]),
86                stable_key,
87                repo: repo_name.to_string(),
88                framework: framework.as_str().to_string(),
89                kind: declaration.kind.clone(),
90                short_name: declaration.name.clone(),
91                fqn,
92                owner_class: declaration.owner_class.clone(),
93                namespace: declaration.namespace.clone(),
94                signature: declaration.signature.clone(),
95                doc_summary: comments.summary.clone(),
96                doc_description: comments.description.clone(),
97                param_docs: comments.params.clone(),
98                return_doc: comments.return_doc.clone(),
99                throws_docs: comments.throws_docs.clone(),
100                magic_methods: Vec::new(),
101                magic_properties: Vec::new(),
102                inline_rule_comments: comments.inline_comments.clone(),
103                comment_keywords: keywordize(
104                    comments
105                        .summary
106                        .iter()
107                        .chain(comments.inline_comments.iter())
108                        .map(String::as_str)
109                        .collect::<Vec<_>>()
110                        .join(" ")
111                        .as_str(),
112                ),
113                symbol_tokens: keywordize(&declaration.name),
114                framework_tags: vec![framework.as_str().to_string()],
115                risk_tags: infer_risk_tags(&path_str, comments.summary.as_deref()),
116                route_ids: Vec::new(),
117                related_symbols: Vec::new(),
118                related_tests: Vec::new(),
119                related_tests_count: 0,
120                validation_commands: Vec::new(),
121                missing_test_warning: None,
122                package_name: package.name.clone(),
123                package_type: package.package_type.clone(),
124                package_version: package.version.clone(),
125                package_keywords: package.keywords.clone(),
126                is_vendor: !package.is_root,
127                is_project_code: package.is_root,
128                is_test,
129                autoloadable: true,
130                extraction_confidence: declaration.extraction_confidence.clone(),
131                references_count: declaration.references_count,
132                path: path_str.clone(),
133                absolute_path: abs_str.clone(),
134                line_start: declaration.line_start,
135                line_end: declaration.line_end,
136            });
137        }
138    }
139
140    Ok(symbols)
141}
142
143pub fn fallback_candidates(contents: &str) -> Vec<DeclarationCandidate> {
144    fallback::extract_candidates(contents)
145}
146
147fn build_fqn(declaration: &DeclarationCandidate) -> String {
148    match (&declaration.namespace, &declaration.owner_class) {
149        (Some(namespace), Some(owner)) if declaration.kind == "method" => {
150            format!("{namespace}\\{owner}::{}", declaration.name)
151        }
152        (Some(namespace), _) => format!("{namespace}\\{}", declaration.name),
153        (None, Some(owner)) if declaration.kind == "method" => {
154            format!("{owner}::{}", declaration.name)
155        }
156        _ => declaration.name.clone(),
157    }
158}
159
160fn collect_comments(contents: &str, sanitizer: &Sanitizer) -> HashMap<usize, ParsedComments> {
161    let mut map = HashMap::new();
162    let lines: Vec<_> = contents.lines().collect();
163    let decl_re = Regex::new(r"^\s*(?:final\s+|abstract\s+)?(?:class|interface|trait|enum|function|public\s+function|protected\s+function|private\s+function)").unwrap();
164    let param_re = Regex::new(r"@param\s+(.+)").unwrap();
165    let return_re = Regex::new(r"@return\s+(.+)").unwrap();
166    let throws_re = Regex::new(r"@throws\s+(.+)").unwrap();
167
168    for (idx, line) in lines.iter().enumerate() {
169        if !decl_re.is_match(line) {
170            continue;
171        }
172        let mut cursor = idx as isize - 1;
173        let mut doc_lines = Vec::new();
174        let mut inline_comments = Vec::new();
175        while cursor >= 0 {
176            let candidate = lines[cursor as usize].trim();
177            if candidate.starts_with("//") || candidate.starts_with('#') {
178                if let Some(value) =
179                    sanitizer.sanitize_text(candidate.trim_start_matches(&['/', '#'][..]).trim())
180                {
181                    inline_comments.push(value);
182                }
183                cursor -= 1;
184                continue;
185            }
186            if candidate.ends_with("*/")
187                || candidate.starts_with('*')
188                || candidate.starts_with("/**")
189            {
190                doc_lines.push(candidate.to_string());
191                cursor -= 1;
192                continue;
193            }
194            break;
195        }
196        doc_lines.reverse();
197        inline_comments.reverse();
198
199        let mut parsed = ParsedComments::default();
200        let mut description_lines = Vec::new();
201        for raw in doc_lines {
202            let cleaned = raw
203                .trim_start_matches("/**")
204                .trim_start_matches("/*")
205                .trim_start_matches('*')
206                .trim_end_matches("*/")
207                .trim();
208            if cleaned.is_empty() {
209                continue;
210            }
211            if let Some(param) = param_re
212                .captures(cleaned)
213                .and_then(|caps| caps.get(1).map(|item| item.as_str()))
214                .and_then(|value| sanitizer.sanitize_text(value))
215            {
216                parsed.params.push(param);
217                continue;
218            }
219            if let Some(return_doc) = return_re
220                .captures(cleaned)
221                .and_then(|caps| caps.get(1).map(|item| item.as_str()))
222                .and_then(|value| sanitizer.sanitize_text(value))
223            {
224                parsed.return_doc = Some(return_doc);
225                continue;
226            }
227            if let Some(throws_doc) = throws_re
228                .captures(cleaned)
229                .and_then(|caps| caps.get(1).map(|item| item.as_str()))
230                .and_then(|value| sanitizer.sanitize_text(value))
231            {
232                parsed.throws_docs.push(throws_doc);
233                continue;
234            }
235            if parsed.summary.is_none() {
236                parsed.summary = sanitizer.sanitize_text(cleaned);
237            } else if let Some(line) = sanitizer.sanitize_text(cleaned) {
238                description_lines.push(line);
239            }
240        }
241        parsed.description = if description_lines.is_empty() {
242            None
243        } else {
244            Some(description_lines.join(" "))
245        };
246        parsed.inline_comments = inline_comments;
247
248        map.insert(idx + 1, parsed);
249    }
250
251    map
252}
253
/// Splits `text` into lowercase keywords.
///
/// A keyword is a maximal run of ASCII letters, ASCII digits, `_` and `\`; every other
/// character (including non-ASCII letters) acts as a separator. Runs shorter than three bytes
/// are discarded.
fn keywordize(text: &str) -> Vec<String> {
    let is_keyword_char = |c: char| c.is_ascii_alphanumeric() || c == '_' || c == '\\';

    let mut keywords = Vec::new();
    for piece in text.split(|c: char| !is_keyword_char(c)) {
        if piece.len() >= 3 {
            keywords.push(piece.to_ascii_lowercase());
        }
    }
    keywords
}
260
/// Derives risk tags from a file path and an optional doc summary.
///
/// Path and summary are lowercased (ASCII only) and searched for a fixed set of substrings;
/// each hit contributes its tag. The result is sorted and free of duplicates.
fn infer_risk_tags(path: &str, summary: Option<&str>) -> Vec<String> {
    /// Substring to look for, paired with the tag it yields.
    const RULES: [(&str, &str); 5] = [
        ("policy", "risk:access-control"),
        ("auth", "risk:access-control"),
        ("consent", "risk:patient-consent"),
        ("audit", "risk:audit-trail"),
        ("patient", "risk:patient-data"),
    ];

    let haystack = format!("{path} {}", summary.unwrap_or_default()).to_ascii_lowercase();

    let mut tags: Vec<String> = RULES
        .iter()
        .filter(|(needle, _)| haystack.contains(*needle))
        .map(|(_, tag)| (*tag).to_owned())
        .collect();
    // Several needles share a tag, so duplicates are possible before this point.
    tags.sort();
    tags.dedup();
    tags
}
279
#[cfg(test)]
mod tests {
    use std::fs;

    use tempfile::tempdir;

    use crate::Framework;
    use crate::composer::export_packages;
    use crate::config::IndexerConfig;
    use crate::sanitizer::Sanitizer;
    use crate::scanner::scan_repo;

    use super::extract_symbols;

    /// Namespaced PHP class with a single docblock-documented method.
    const CONSENT_SERVICE_SOURCE: &str = r#"<?php
namespace App\Services;

class ConsentService {
    /**
     * Sign consent.
     * @param string $patientId patient id
     * @return bool
     */
    public function sign(string $patientId): bool
    {
        return true;
    }
}
"#;

    /// End-to-end check: scanning a temporary repository yields both the class and its method,
    /// and the method carries the summary line of its docblock.
    #[test]
    fn extracts_symbols_with_docblocks() {
        let workspace = tempdir().unwrap();
        let root = workspace.path();
        fs::create_dir_all(root.join("app")).unwrap();
        fs::write(root.join("composer.json"), r#"{"name":"acme/app"}"#).unwrap();
        fs::write(root.join("app/ConsentService.php"), CONSENT_SERVICE_SOURCE).unwrap();

        let config = IndexerConfig::default();
        let files = scan_repo(root, &config.paths).unwrap();
        let packages = export_packages(root).unwrap();
        let sanitizer = Sanitizer::default();
        let symbols = extract_symbols(
            root,
            "acme/app",
            Framework::Laravel,
            &files,
            &packages,
            &sanitizer,
        )
        .unwrap();

        let has_class = symbols
            .iter()
            .any(|symbol| symbol.fqn == "App\\Services\\ConsentService");
        assert!(has_class);

        let has_documented_method = symbols.iter().any(|symbol| {
            symbol.fqn == "App\\Services\\ConsentService::sign"
                && symbol.doc_summary.as_deref() == Some("Sign consent.")
        });
        assert!(has_documented_method);
    }
}