Skip to main content

provenant/parsers/
buck.rs

1//! Buck BUILD and METADATA.bzl parsers
2//!
3//! Extracts package metadata from Buck build system files using Starlark (Python-like) syntax.
4//!
5//! ## Features
6//! - **BuckBuildParser**: Parses BUCK files with multiple package support
7//! - **BuckMetadataBzlParser**: Parses METADATA.bzl dictionary assignments with package_url support
8//!
9//! ## Usage
10//! - `BuckBuildParser::extract_packages()` - Returns ALL packages from BUCK file
11//! - `BuckMetadataBzlParser::extract_first_package()` - Returns single package from METADATA.bzl
12//!
13//! ## Reference
14//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py`
15//! - BuckPackageHandler (lines 310-325)
16//! - BuckMetadataBzlHandler (lines 328-432)
17
18use std::collections::HashMap;
19use std::path::Path;
20
21use crate::parser_warn as warn;
22use packageurl::PackageUrl;
23use starlark_syntax::syntax::ast;
24use starlark_syntax::syntax::module::AstModuleFields;
25use starlark_syntax::syntax::{AstModule, Dialect};
26
27use crate::models::{DatasourceId, PackageData, PackageType, Party, Sha1Digest};
28
29use super::PackageParser;
30
/// Argument list of a Starlark call expression, using the payload-free AST flavour.
type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;

/// Borrowed view of one Starlark call expression, split into callee and arguments.
struct StarlarkCall<'a> {
    /// Expression in callee position (the thing being called).
    func: &'a ast::AstExpr,
    /// Arguments supplied to the call.
    args: &'a StarlarkCallArgs,
}
37
38/// Parser for Buck BUCK files (build rules)
39pub struct BuckBuildParser;
40
41impl PackageParser for BuckBuildParser {
42    const PACKAGE_TYPE: PackageType = PackageType::Buck;
43
44    fn is_match(path: &Path) -> bool {
45        path.file_name()
46            .and_then(|name| name.to_str())
47            .is_some_and(|name| name == "BUCK")
48    }
49
50    fn extract_packages(path: &Path) -> Vec<PackageData> {
51        match parse_buck_build(path) {
52            Ok(packages) if !packages.is_empty() => packages,
53            Ok(_) => vec![fallback_package_data(path)],
54            Err(e) => {
55                warn!("Failed to parse Buck BUCK file {:?}: {}", path, e);
56                vec![fallback_package_data(path)]
57            }
58        }
59    }
60}
61
62/// Parser for Buck METADATA.bzl files (metadata dictionaries)
63pub struct BuckMetadataBzlParser;
64
65impl PackageParser for BuckMetadataBzlParser {
66    const PACKAGE_TYPE: PackageType = PackageType::Buck;
67
68    fn is_match(path: &Path) -> bool {
69        path.file_name()
70            .and_then(|name| name.to_str())
71            .is_some_and(|name| name == "METADATA.bzl")
72    }
73
74    fn extract_packages(path: &Path) -> Vec<PackageData> {
75        vec![match parse_metadata_bzl(path) {
76            Ok(pkg) => pkg,
77            Err(e) => {
78                warn!("Failed to parse Buck METADATA.bzl {:?}: {}", path, e);
79                PackageData {
80                    package_type: Some(Self::PACKAGE_TYPE),
81                    datasource_id: Some(DatasourceId::BuckMetadata),
82                    ..Default::default()
83                }
84            }
85        }]
86    }
87}
88
89/// Parse a Buck BUCK file (same logic as Bazel BUILD)
90fn parse_buck_build(path: &Path) -> Result<Vec<PackageData>, String> {
91    let content =
92        std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
93    let module = parse_starlark_module("<BUCK>", content)?;
94
95    let mut packages = Vec::new();
96
97    for statement in top_level_statements(&module) {
98        if let Some(package_data) = extract_build_package_from_statement(statement) {
99            packages.push(package_data);
100        }
101    }
102
103    Ok(packages)
104}
105
106/// Parse a Buck METADATA.bzl file
107fn parse_metadata_bzl(path: &Path) -> Result<PackageData, String> {
108    let content =
109        std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
110    let module = parse_starlark_module("<METADATA.bzl>", content)?;
111
112    // Look for METADATA = {...} assignment
113    for statement in top_level_statements(&module) {
114        if let Some(dict) = extract_metadata_assignment_dict(statement) {
115            return Ok(extract_metadata_dict(dict));
116        }
117    }
118
119    // No METADATA found
120    Ok(PackageData {
121        package_type: Some(BuckMetadataBzlParser::PACKAGE_TYPE),
122        datasource_id: Some(DatasourceId::BuckMetadata),
123        ..Default::default()
124    })
125}
126
127fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
128    let dialect = Dialect {
129        enable_top_level_stmt: true,
130        ..Dialect::Standard
131    };
132    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
133}
134
135fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
136    match &module.statement().node {
137        ast::StmtP::Statements(statements) => statements,
138        _ => std::slice::from_ref(module.statement()),
139    }
140}
141
142fn extract_metadata_assignment_dict(
143    statement: &ast::AstStmt,
144) -> Option<&[(ast::AstExpr, ast::AstExpr)]> {
145    let ast::StmtP::Assign(assign) = &statement.node else {
146        return None;
147    };
148    let ast::AssignTargetP::Identifier(target) = &assign.lhs.node else {
149        return None;
150    };
151    if target.node.ident != "METADATA" {
152        return None;
153    }
154    match &assign.rhs.node {
155        ast::ExprP::Dict(items) => Some(items.as_slice()),
156        _ => None,
157    }
158}
159
160/// Extract metadata from a dictionary AST node
161fn extract_metadata_dict(dict: &[(ast::AstExpr, ast::AstExpr)]) -> PackageData {
162    let mut fields: HashMap<String, MetadataValue> = HashMap::new();
163
164    for (key, value) in dict {
165        let Some(key_name) = expr_as_string(key) else {
166            continue;
167        };
168        let Some(metadata_value) = metadata_value_from_expr(value) else {
169            continue;
170        };
171
172        fields.insert(key_name, metadata_value);
173    }
174
175    build_package_from_metadata(fields)
176}
177
178fn get_metadata_string(fields: &HashMap<String, MetadataValue>, keys: &[&str]) -> Option<String> {
179    keys.iter().find_map(|key| match fields.get(*key) {
180        Some(MetadataValue::String(value)) => Some(value.clone()),
181        _ => None,
182    })
183}
184
185fn get_metadata_list(
186    fields: &HashMap<String, MetadataValue>,
187    keys: &[&str],
188) -> Option<Vec<String>> {
189    keys.iter().find_map(|key| match fields.get(*key) {
190        Some(MetadataValue::List(values)) => Some(values.clone()),
191        _ => None,
192    })
193}
194
/// Metadata value types: the two shapes of dictionary value this parser keeps.
enum MetadataValue {
    /// A single string literal.
    String(String),
    /// The string-literal items of a list or tuple.
    List(Vec<String>),
}
200
201fn split_buck_license_values(values: &[String]) -> (Vec<String>, Vec<String>) {
202    let mut statements = Vec::new();
203    let mut references = Vec::new();
204
205    for value in values {
206        if is_probable_local_license_reference(value) {
207            references.push(value.clone());
208        } else {
209            statements.push(value.clone());
210        }
211    }
212
213    (statements, references)
214}
215
/// Heuristically decide whether a `licenses` entry names a local license file
/// rather than a license identifier or expression.
///
/// The comparison is ASCII case-insensitive and ignores surrounding
/// whitespace. A value is treated as a file reference when it
/// - contains a path separator (`/` or `\`),
/// - starts with a conventional license file name (`LICENSE`, `LICENCE`,
///   `COPYING`, `NOTICE`, `COPYRIGHT`), or
/// - ends with a common text-file extension (`.txt`, `.md`, `.rst`, `.html`).
///
/// SPDX user-defined identifiers (`LicenseRef-...`) also begin with
/// "license", but they are license expressions, not files. They are therefore
/// exempt from the file-name prefix rule; the separator and extension rules
/// still apply to them.
///
/// Returns `false` for empty or whitespace-only input.
fn is_probable_local_license_reference(value: &str) -> bool {
    const FILE_NAME_PREFIXES: [&str; 5] = ["license", "licence", "copying", "notice", "copyright"];
    const FILE_SUFFIXES: [&str; 4] = [".txt", ".md", ".rst", ".html"];
    const SPDX_LICENSE_REF_PREFIX: &str = "licenseref-";

    let lower = value.trim().to_ascii_lowercase();
    if lower.is_empty() {
        return false;
    }

    let has_path_separator = lower.contains('/') || lower.contains('\\');
    let has_file_suffix = FILE_SUFFIXES
        .iter()
        .any(|suffix| lower.ends_with(suffix));
    // `LicenseRef-foo` would otherwise match the "license" prefix below.
    let is_spdx_license_ref = lower.starts_with(SPDX_LICENSE_REF_PREFIX);
    let has_file_name_prefix = !is_spdx_license_ref
        && FILE_NAME_PREFIXES
            .iter()
            .any(|prefix| lower.starts_with(prefix));

    has_path_separator || has_file_name_prefix || has_file_suffix
}
235
236fn insert_license_reference_extra_data(
237    extra_data: &mut HashMap<String, serde_json::Value>,
238    references: &[String],
239) {
240    match references {
241        [] => {}
242        [reference] => {
243            extra_data.insert(
244                "license_file".to_string(),
245                serde_json::Value::String(reference.clone()),
246            );
247        }
248        _ => {
249            extra_data.insert(
250                "license_files".to_string(),
251                serde_json::Value::Array(
252                    references
253                        .iter()
254                        .cloned()
255                        .map(serde_json::Value::String)
256                        .collect(),
257                ),
258            );
259        }
260    }
261}
262
263/// Build PackageData from extracted metadata fields
264fn build_package_from_metadata(fields: HashMap<String, MetadataValue>) -> PackageData {
265    let mut pkg = PackageData {
266        package_type: Some(BuckMetadataBzlParser::PACKAGE_TYPE),
267        datasource_id: Some(DatasourceId::BuckMetadata),
268        ..Default::default()
269    };
270    let mut license_references = Vec::new();
271
272    // Extract name
273    if let Some(name) = get_metadata_string(&fields, &["name"]) {
274        pkg.name = Some(name);
275    }
276
277    // Extract version
278    if let Some(version) = get_metadata_string(&fields, &["version"]) {
279        pkg.version = Some(version);
280    }
281
282    // Extract namespace from explicit metadata when present.
283    if let Some(namespace) = get_metadata_string(&fields, &["namespace"]) {
284        pkg.namespace = Some(namespace);
285    }
286
287    // Extract package type from canonical or legacy ecosystem fields.
288    // Intentionally ignore `upstream_type`: it does not describe the purl package type.
289    if let Some(ecosystem) = get_metadata_string(&fields, &["ecosystem", "type", "package_type"])
290        && let Ok(package_type) = ecosystem.parse::<PackageType>()
291    {
292        pkg.package_type = Some(package_type);
293    }
294
295    // Extract licenses (licenses or license_expression)
296    if let Some(licenses) = get_metadata_list(&fields, &["licenses"]) {
297        let (license_statements, references) = split_buck_license_values(&licenses);
298        license_references = references;
299        let extracted_license_statement = if !license_statements.is_empty() {
300            Some(license_statements.join(", "))
301        } else if !license_references.is_empty() {
302            Some(license_references.join(", "))
303        } else {
304            None
305        };
306        pkg.extracted_license_statement = extracted_license_statement;
307    } else if let Some(license_expression) = get_metadata_string(&fields, &["license_expression"]) {
308        pkg.extracted_license_statement = Some(license_expression);
309    }
310
311    if let Some(copyright) = get_metadata_list(&fields, &["copyrights"]) {
312        if !copyright.is_empty() {
313            pkg.copyright = Some(copyright.join("\n"));
314        }
315    } else if let Some(copyright) = get_metadata_string(&fields, &["copyright"]) {
316        pkg.copyright = Some(copyright);
317    }
318
319    // Extract homepage (upstream_address, upstream_url, or homepage_url)
320    if let Some(homepage_url) = get_metadata_string(
321        &fields,
322        &["upstream_address", "upstream_url", "homepage_url"],
323    ) {
324        pkg.homepage_url = Some(homepage_url);
325    }
326
327    // Extract download_url
328    if let Some(download_url) = get_metadata_string(&fields, &["download_url"]) {
329        pkg.download_url = Some(download_url);
330    }
331
332    // Extract vcs_url
333    if let Some(vcs_url) = get_metadata_string(&fields, &["vcs_url"]) {
334        pkg.vcs_url = Some(vcs_url);
335    }
336
337    // Extract sha1 (download_archive_sha1)
338    if let Some(sha1) = get_metadata_string(&fields, &["download_archive_sha1"]) {
339        pkg.sha1 = Sha1Digest::from_hex(&sha1).ok();
340    }
341
342    // Extract maintainers
343    if let Some(maintainers) = get_metadata_list(&fields, &["maintainers"]) {
344        pkg.parties.extend(maintainers.iter().map(|name| Party {
345            r#type: Some("organization".to_string()),
346            name: Some(name.clone()),
347            role: Some("maintainer".to_string()),
348            email: None,
349            url: None,
350            organization: None,
351            organization_url: None,
352            timezone: None,
353        }));
354    }
355
356    if let Some(vendor) = get_metadata_string(&fields, &["vendor", "publisher"]) {
357        pkg.parties.push(Party {
358            r#type: None,
359            name: Some(vendor),
360            role: Some("publisher".to_string()),
361            email: None,
362            url: None,
363            organization: None,
364            organization_url: None,
365            timezone: None,
366        });
367    }
368
369    // Extract extra_data fields
370    let mut extra_data = HashMap::new();
371    if let Some(vcs_commit_hash) = get_metadata_string(&fields, &["vcs_commit_hash"]) {
372        extra_data.insert(
373            "vcs_commit_hash".to_string(),
374            serde_json::Value::String(vcs_commit_hash),
375        );
376    }
377    if let Some(upstream_hash) =
378        get_metadata_string(&fields, &["upstream_hash", "upstream_commit_hash"])
379    {
380        extra_data.insert(
381            "upstream_hash".to_string(),
382            serde_json::Value::String(upstream_hash),
383        );
384    }
385    if let Some(upstream_branch) = get_metadata_string(&fields, &["upstream_branch"]) {
386        extra_data.insert(
387            "upstream_branch".to_string(),
388            serde_json::Value::String(upstream_branch),
389        );
390    }
391    insert_license_reference_extra_data(&mut extra_data, &license_references);
392    if !extra_data.is_empty() {
393        pkg.extra_data = Some(extra_data);
394    }
395
396    // Parse package_url if present and update package fields
397    if let Some(purl_str) = get_metadata_string(&fields, &["package_url"])
398        && let Ok(purl) = purl_str.parse::<PackageUrl>()
399    {
400        pkg.purl = Some(purl.to_string());
401
402        // Override package fields with purl data
403        if let Ok(package_type) = purl.ty().parse::<PackageType>() {
404            pkg.package_type = Some(package_type);
405        }
406        if let Some(ns) = purl.namespace() {
407            pkg.namespace = Some(ns.to_string());
408        }
409        pkg.name = Some(purl.name().to_string());
410        if let Some(ver) = purl.version() {
411            pkg.version = Some(ver.to_string());
412        }
413        // Qualifiers
414        if !purl.qualifiers().is_empty() {
415            let quals: HashMap<String, String> = purl
416                .qualifiers()
417                .iter()
418                .map(|(k, v)| (k.to_string(), v.to_string()))
419                .collect();
420            pkg.qualifiers = Some(quals);
421        }
422        // Subpath
423        if let Some(sp) = purl.subpath() {
424            pkg.subpath = Some(sp.to_string());
425        }
426    }
427
428    pkg
429}
430
431fn metadata_value_from_expr(expr: &ast::AstExpr) -> Option<MetadataValue> {
432    if let Some(string) = expr_as_string(expr) {
433        return Some(MetadataValue::String(string));
434    }
435
436    let items = match &expr.node {
437        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
438        _ => return None,
439    };
440    let values: Vec<_> = items.iter().filter_map(expr_as_string).collect();
441    (!values.is_empty()).then_some(MetadataValue::List(values))
442}
443
444/// Extract package data from a single AST statement (for BUCK files)
445fn extract_build_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
446    let call = extract_call(statement)?;
447    let rule_name = match &call.func.node {
448        ast::ExprP::Identifier(identifier) => identifier.node.ident.as_str(),
449        _ => return None,
450    };
451
452    if !check_rule_name_ending(rule_name) {
453        return None;
454    }
455
456    let name = extract_named_kwarg_string(&call, "name");
457    let licenses = extract_named_kwarg_string_list(&call, "licenses");
458
459    let package_name = name?;
460    let (license_statements, license_references) = licenses
461        .as_deref()
462        .map(split_buck_license_values)
463        .unwrap_or_default();
464    let extracted_license_statement = if !license_statements.is_empty() {
465        Some(license_statements.join(", "))
466    } else if !license_references.is_empty() {
467        Some(license_references.join(", "))
468    } else {
469        None
470    };
471    let mut extra_data = HashMap::new();
472    insert_license_reference_extra_data(&mut extra_data, &license_references);
473
474    Some(PackageData {
475        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
476        name: Some(package_name),
477        extracted_license_statement,
478        extra_data: (!extra_data.is_empty()).then_some(extra_data),
479        datasource_id: Some(DatasourceId::BuckFile),
480        ..Default::default()
481    })
482}
483
484fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
485    match &statement.node {
486        ast::StmtP::Expression(expr) => extract_call_expr(expr),
487        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
488        _ => None,
489    }
490}
491
492fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
493    match &expr.node {
494        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
495        _ => None,
496    }
497}
498
499fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
500    call.args
501        .args
502        .iter()
503        .find_map(|argument| match &argument.node {
504            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
505            _ => None,
506        })
507}
508
509fn extract_named_kwarg_string(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
510    extract_named_kwarg(call, key).and_then(expr_as_string)
511}
512
513fn extract_named_kwarg_string_list(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
514    let expr = extract_named_kwarg(call, key)?;
515    let items = match &expr.node {
516        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
517        _ => return None,
518    };
519    let values: Vec<_> = items.iter().filter_map(expr_as_string).collect();
520    (!values.is_empty()).then_some(values)
521}
522
523fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
524    match &expr.node {
525        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
526        _ => None,
527    }
528}
529
/// Check if rule name ends with "binary" or "library".
fn check_rule_name_ending(rule_name: &str) -> bool {
    ["binary", "library"]
        .iter()
        .any(|suffix| rule_name.ends_with(suffix))
}
534
535/// Create fallback package data using parent directory name
536fn fallback_package_data(path: &Path) -> PackageData {
537    let name = path
538        .parent()
539        .and_then(|p| p.file_name())
540        .and_then(|n| n.to_str())
541        .map(|s| s.to_string());
542
543    PackageData {
544        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
545        name,
546        datasource_id: Some(DatasourceId::BuckFile),
547        ..Default::default()
548    }
549}
550
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// `BUCK` matches by exact, case-sensitive file name, in any directory.
    #[test]
    fn test_buck_build_is_match() {
        for accepted in ["BUCK", "path/to/BUCK"] {
            assert!(BuckBuildParser::is_match(&PathBuf::from(accepted)));
        }
        for rejected in ["BUILD", "buck"] {
            assert!(!BuckBuildParser::is_match(&PathBuf::from(rejected)));
        }
    }

    /// `METADATA.bzl` matches by exact, case-sensitive file name, in any directory.
    #[test]
    fn test_metadata_bzl_is_match() {
        for accepted in ["METADATA.bzl", "path/to/METADATA.bzl"] {
            assert!(BuckMetadataBzlParser::is_match(&PathBuf::from(accepted)));
        }
        for rejected in ["metadata.bzl", "METADATA"] {
            assert!(!BuckMetadataBzlParser::is_match(&PathBuf::from(rejected)));
        }
    }

    /// Only rule names ending in `binary` or `library` are accepted.
    #[test]
    fn test_check_rule_name_ending() {
        for accepted in ["android_binary", "android_library", "java_binary"] {
            assert!(check_rule_name_ending(accepted));
        }
        assert!(!check_rule_name_ending("filegroup"));
    }
}
586
// Registers this module with the crate's parser registry, covering both the
// `BUCK` and `METADATA.bzl` file patterns handled above.
// NOTE(review): the meaning of each positional argument (in particular the
// empty string) is defined by `register_parser!`, which is not visible here —
// confirm against the macro definition.
crate::register_parser!(
    "Buck build file and METADATA.bzl",
    &["**/BUCK", "**/METADATA.bzl"],
    "buck",
    "",
    Some("https://buck.build/"),
);