// Source: provenant/parsers/buck.rs

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0

//! Buck `BUCK` and `METADATA.bzl` parsers
//!
//! Extracts package metadata from Buck build system files using Starlark (Python-like) syntax.
//!
//! ## Features
//! - **BuckBuildParser**: Parses BUCK files with multiple package support
//! - **BuckMetadataBzlParser**: Parses METADATA.bzl dictionary assignments with package_url support
//!
//! ## Usage
//! - `BuckBuildParser::extract_packages()` - Returns ALL packages from BUCK file
//! - `BuckMetadataBzlParser::extract_first_package()` - Returns single package from METADATA.bzl
//!
//! ## Reference
//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py`
//! - BuckPackageHandler (lines 310-325)
//! - BuckMetadataBzlHandler (lines 328-432)
20
use std::collections::HashMap;
use std::path::Path;

use crate::parser_warn as warn;
use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
use packageurl::PackageUrl;
use starlark_syntax::syntax::ast;
use starlark_syntax::syntax::module::AstModuleFields;
use starlark_syntax::syntax::{AstModule, Dialect};

use crate::models::{DatasourceId, PackageData, PackageType, Party, Sha1Digest};

use super::PackageParser;
use super::metadata::ParserMetadata;
35
36type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;
37
38struct StarlarkCall<'a> {
39    func: &'a ast::AstExpr,
40    args: &'a StarlarkCallArgs,
41}
42
43/// Parser for Buck BUCK files (build rules)
44pub struct BuckBuildParser;
45
46impl PackageParser for BuckBuildParser {
47    const PACKAGE_TYPE: PackageType = PackageType::Buck;
48
49    fn metadata() -> Vec<ParserMetadata> {
50        vec![ParserMetadata {
51            description: "Buck build file and METADATA.bzl",
52            file_patterns: &["**/BUCK", "**/METADATA.bzl"],
53            package_type: "buck",
54            primary_language: "",
55            documentation_url: Some("https://buck.build/"),
56        }]
57    }
58
59    fn is_match(path: &Path) -> bool {
60        path.file_name()
61            .and_then(|name| name.to_str())
62            .is_some_and(|name| name == "BUCK")
63    }
64
65    fn extract_packages(path: &Path) -> Vec<PackageData> {
66        match parse_buck_build(path) {
67            Ok(packages) if !packages.is_empty() => packages,
68            Ok(_) => vec![fallback_package_data(path)],
69            Err(e) => {
70                warn!("Failed to parse Buck BUCK file {:?}: {}", path, e);
71                vec![fallback_package_data(path)]
72            }
73        }
74    }
75}
76
77/// Parser for Buck METADATA.bzl files (metadata dictionaries)
78pub struct BuckMetadataBzlParser;
79
80impl PackageParser for BuckMetadataBzlParser {
81    const PACKAGE_TYPE: PackageType = PackageType::Buck;
82
83    fn is_match(path: &Path) -> bool {
84        path.file_name()
85            .and_then(|name| name.to_str())
86            .is_some_and(|name| name == "METADATA.bzl")
87    }
88
89    fn extract_packages(path: &Path) -> Vec<PackageData> {
90        vec![match parse_metadata_bzl(path) {
91            Ok(pkg) => pkg,
92            Err(e) => {
93                warn!("Failed to parse Buck METADATA.bzl {:?}: {}", path, e);
94                PackageData {
95                    datasource_id: Some(DatasourceId::BuckMetadata),
96                    ..Default::default()
97                }
98            }
99        }]
100    }
101}
102
103/// Parse a Buck BUCK file (same logic as Bazel BUILD)
104fn parse_buck_build(path: &Path) -> Result<Vec<PackageData>, String> {
105    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
106    let module = parse_starlark_module("<BUCK>", content)?;
107
108    let mut packages = Vec::new();
109
110    for statement in top_level_statements(&module)
111        .iter()
112        .take(MAX_ITERATION_COUNT)
113    {
114        if let Some(package_data) = extract_build_package_from_statement(statement) {
115            packages.push(package_data);
116        }
117    }
118
119    Ok(packages)
120}
121
122/// Parse a Buck METADATA.bzl file
123fn parse_metadata_bzl(path: &Path) -> Result<PackageData, String> {
124    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
125    let module = parse_starlark_module("<METADATA.bzl>", content)?;
126
127    for statement in top_level_statements(&module)
128        .iter()
129        .take(MAX_ITERATION_COUNT)
130    {
131        if let Some(dict) = extract_metadata_assignment_dict(statement) {
132            return Ok(extract_metadata_dict(dict));
133        }
134    }
135
136    // No METADATA found
137    Ok(PackageData {
138        datasource_id: Some(DatasourceId::BuckMetadata),
139        ..Default::default()
140    })
141}
142
143fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
144    let content = preprocess_starlark_content(&content);
145    let dialect = Dialect {
146        enable_top_level_stmt: true,
147        ..Dialect::Standard
148    };
149    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
150}
151
/// Normalise `@oss-disable` / `@oss-enable` guarded lines before parsing.
///
/// - A comment line containing `@oss-disable` is dropped, and its indentation
///   is remembered for the line that follows.
/// - A line containing `# @oss-enable` keeps only the code before the marker.
///   If it directly follows a dropped `@oss-disable` line, the code is
///   re-indented to that line's indentation, so the enabled alternative takes
///   the place of the disabled one. Marker-only lines are dropped.
/// - Every other line is copied through unchanged.
///
/// The result ends with a newline only if `content` did.
fn preprocess_starlark_content(content: &str) -> String {
    const OSS_DISABLE_MARKER: &str = "@oss-disable";
    const OSS_ENABLE_MARKER: &str = "# @oss-enable";

    let mut normalized = String::with_capacity(content.len());
    // Indentation of the most recent dropped `@oss-disable` line, if the next
    // line is still eligible to replace it.
    let mut pending_oss_disable_indent: Option<&str> = None;

    for raw_line in content.lines() {
        let trimmed_start = raw_line.trim_start();
        let indent = &raw_line[..raw_line.len() - trimmed_start.len()];

        if trimmed_start.starts_with('#') && trimmed_start.contains(OSS_DISABLE_MARKER) {
            pending_oss_disable_indent = Some(indent);
            continue;
        }

        if let Some((before_marker, _)) = raw_line.split_once(OSS_ENABLE_MARKER) {
            let code = before_marker.trim_end();
            if !code.is_empty() {
                match pending_oss_disable_indent.take() {
                    Some(disabled_indent) => {
                        normalized.push_str(disabled_indent);
                        normalized.push_str(code.trim_start());
                    }
                    None => normalized.push_str(code),
                }
                normalized.push('\n');
            }
            continue;
        }

        pending_oss_disable_indent = None;
        normalized.push_str(raw_line);
        normalized.push('\n');
    }

    // `lines()` discards the information about a final newline; restore the
    // original's trailing-newline state.
    if !content.ends_with('\n') && normalized.ends_with('\n') {
        normalized.pop();
    }

    normalized
}
191
192fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
193    match &module.statement().node {
194        ast::StmtP::Statements(statements) => statements,
195        _ => std::slice::from_ref(module.statement()),
196    }
197}
198
199fn extract_metadata_assignment_dict(
200    statement: &ast::AstStmt,
201) -> Option<&[(ast::AstExpr, ast::AstExpr)]> {
202    let ast::StmtP::Assign(assign) = &statement.node else {
203        return None;
204    };
205    let ast::AssignTargetP::Identifier(target) = &assign.lhs.node else {
206        return None;
207    };
208    if target.node.ident != "METADATA" {
209        return None;
210    }
211    match &assign.rhs.node {
212        ast::ExprP::Dict(items) => Some(items.as_slice()),
213        _ => None,
214    }
215}
216
217/// Extract metadata from a dictionary AST node
218fn extract_metadata_dict(dict: &[(ast::AstExpr, ast::AstExpr)]) -> PackageData {
219    let mut fields: HashMap<String, MetadataValue> = HashMap::new();
220
221    for (key, value) in dict.iter().take(MAX_ITERATION_COUNT) {
222        let Some(key_name) = expr_as_string(key) else {
223            continue;
224        };
225        let Some(metadata_value) = metadata_value_from_expr(value) else {
226            continue;
227        };
228
229        fields.insert(key_name, metadata_value);
230    }
231
232    build_package_from_metadata(fields)
233}
234
235fn get_metadata_string(fields: &HashMap<String, MetadataValue>, keys: &[&str]) -> Option<String> {
236    keys.iter().find_map(|key| match fields.get(*key) {
237        Some(MetadataValue::String(value)) => Some(value.clone()),
238        _ => None,
239    })
240}
241
242fn get_metadata_list(
243    fields: &HashMap<String, MetadataValue>,
244    keys: &[&str],
245) -> Option<Vec<String>> {
246    keys.iter().find_map(|key| match fields.get(*key) {
247        Some(MetadataValue::List(values)) => Some(values.clone()),
248        _ => None,
249    })
250}
251
/// Value of a `METADATA` dictionary entry that the parser understands.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MetadataValue {
    /// A single string literal.
    String(String),
    /// The string-literal items of a list or tuple.
    List(Vec<String>),
}
257
258fn split_buck_license_values(values: &[String]) -> (Vec<String>, Vec<String>) {
259    let mut statements = Vec::new();
260    let mut references = Vec::new();
261
262    for value in values {
263        if is_probable_local_license_reference(value) {
264            references.push(value.clone());
265        } else {
266            statements.push(value.clone());
267        }
268    }
269
270    (statements, references)
271}
272
/// Heuristically decides whether a license value names a file rather than a license.
///
/// After trimming, a non-empty value counts as a file reference when,
/// compared ASCII-case-insensitively, it contains a path separator (`/` or
/// `\`), starts with a conventional license file stem, or ends with a common
/// text/document extension.
fn is_probable_local_license_reference(value: &str) -> bool {
    const FILE_STEM_PREFIXES: [&str; 5] = ["license", "licence", "copying", "notice", "copyright"];
    const FILE_EXTENSIONS: [&str; 4] = [".txt", ".md", ".rst", ".html"];

    let trimmed = value.trim();
    if trimmed.is_empty() {
        return false;
    }

    let lower = trimmed.to_ascii_lowercase();
    lower.contains(['/', '\\'])
        || FILE_STEM_PREFIXES
            .iter()
            .any(|prefix| lower.starts_with(*prefix))
        || FILE_EXTENSIONS
            .iter()
            .any(|extension| lower.ends_with(*extension))
}
292
293fn insert_license_reference_extra_data(
294    extra_data: &mut HashMap<String, serde_json::Value>,
295    references: &[String],
296) {
297    match references {
298        [] => {}
299        [reference] => {
300            extra_data.insert(
301                "license_file".to_string(),
302                serde_json::Value::String(reference.clone()),
303            );
304        }
305        _ => {
306            extra_data.insert(
307                "license_files".to_string(),
308                serde_json::Value::Array(
309                    references
310                        .iter()
311                        .cloned()
312                        .map(serde_json::Value::String)
313                        .collect(),
314                ),
315            );
316        }
317    }
318}
319
320/// Build PackageData from extracted metadata fields
321fn build_package_from_metadata(fields: HashMap<String, MetadataValue>) -> PackageData {
322    let mut pkg = PackageData {
323        datasource_id: Some(DatasourceId::BuckMetadata),
324        ..Default::default()
325    };
326    let mut license_references = Vec::new();
327
328    // Extract name
329    if let Some(name) = get_metadata_string(&fields, &["name"]) {
330        pkg.name = Some(truncate_field(name));
331    }
332
333    // Extract version
334    if let Some(version) = get_metadata_string(&fields, &["version"]) {
335        pkg.version = Some(truncate_field(version));
336    }
337
338    // Extract namespace from explicit metadata when present.
339    if let Some(namespace) = get_metadata_string(&fields, &["namespace"]) {
340        pkg.namespace = Some(truncate_field(namespace));
341    }
342
343    // Extract package type from canonical or legacy ecosystem fields.
344    // Intentionally ignore `upstream_type`: it does not describe the purl package type.
345    if let Some(ecosystem) = get_metadata_string(&fields, &["ecosystem", "type", "package_type"])
346        && let Ok(package_type) = ecosystem.parse::<PackageType>()
347    {
348        pkg.package_type = Some(package_type);
349    }
350
351    // Extract licenses (licenses or license_expression)
352    if let Some(licenses) = get_metadata_list(&fields, &["licenses"]) {
353        let (license_statements, references) = split_buck_license_values(&licenses);
354        license_references = references;
355        let extracted_license_statement = if !license_statements.is_empty() {
356            Some(license_statements.join(", "))
357        } else if !license_references.is_empty() {
358            Some(license_references.join(", "))
359        } else {
360            None
361        };
362        pkg.extracted_license_statement = extracted_license_statement.map(truncate_field);
363    } else if let Some(license_expression) = get_metadata_string(&fields, &["license_expression"]) {
364        pkg.extracted_license_statement = Some(truncate_field(license_expression));
365    }
366
367    if let Some(copyright) = get_metadata_list(&fields, &["copyrights"]) {
368        if !copyright.is_empty() {
369            pkg.copyright = Some(truncate_field(copyright.join("\n")));
370        }
371    } else if let Some(copyright) = get_metadata_string(&fields, &["copyright"]) {
372        pkg.copyright = Some(truncate_field(copyright));
373    }
374
375    // Extract homepage (upstream_address, upstream_url, or homepage_url)
376    if let Some(homepage_url) = get_metadata_string(
377        &fields,
378        &["upstream_address", "upstream_url", "homepage_url"],
379    ) {
380        pkg.homepage_url = Some(truncate_field(homepage_url));
381    }
382
383    // Extract download_url
384    if let Some(download_url) = get_metadata_string(&fields, &["download_url"]) {
385        pkg.download_url = Some(truncate_field(download_url));
386    }
387
388    // Extract vcs_url
389    if let Some(vcs_url) = get_metadata_string(&fields, &["vcs_url"]) {
390        pkg.vcs_url = Some(truncate_field(vcs_url));
391    }
392
393    // Extract sha1 (download_archive_sha1)
394    if let Some(sha1) = get_metadata_string(&fields, &["download_archive_sha1"]) {
395        pkg.sha1 = Sha1Digest::from_hex(&sha1).ok();
396    }
397
398    // Extract maintainers
399    if let Some(maintainers) = get_metadata_list(&fields, &["maintainers"]) {
400        pkg.parties.extend(maintainers.iter().map(|name| Party {
401            r#type: Some("organization".to_string()),
402            name: Some(name.clone()),
403            role: Some("maintainer".to_string()),
404            email: None,
405            url: None,
406            organization: None,
407            organization_url: None,
408            timezone: None,
409        }));
410    }
411
412    if let Some(vendor) = get_metadata_string(&fields, &["vendor", "publisher"]) {
413        pkg.parties.push(Party {
414            r#type: None,
415            name: Some(vendor),
416            role: Some("publisher".to_string()),
417            email: None,
418            url: None,
419            organization: None,
420            organization_url: None,
421            timezone: None,
422        });
423    }
424
425    // Extract extra_data fields
426    let mut extra_data = HashMap::new();
427    if let Some(vcs_commit_hash) = get_metadata_string(&fields, &["vcs_commit_hash"]) {
428        extra_data.insert(
429            "vcs_commit_hash".to_string(),
430            serde_json::Value::String(vcs_commit_hash),
431        );
432    }
433    if let Some(upstream_hash) =
434        get_metadata_string(&fields, &["upstream_hash", "upstream_commit_hash"])
435    {
436        extra_data.insert(
437            "upstream_hash".to_string(),
438            serde_json::Value::String(upstream_hash),
439        );
440    }
441    if let Some(upstream_branch) = get_metadata_string(&fields, &["upstream_branch"]) {
442        extra_data.insert(
443            "upstream_branch".to_string(),
444            serde_json::Value::String(upstream_branch),
445        );
446    }
447    insert_license_reference_extra_data(&mut extra_data, &license_references);
448    if !extra_data.is_empty() {
449        pkg.extra_data = Some(extra_data);
450    }
451
452    // Parse package_url if present and update package fields
453    if let Some(purl_str) = get_metadata_string(&fields, &["package_url"])
454        && let Ok(purl) = purl_str.parse::<PackageUrl>()
455    {
456        pkg.purl = Some(truncate_field(purl.to_string()));
457
458        if let Ok(package_type) = purl.ty().parse::<PackageType>() {
459            pkg.package_type = Some(package_type);
460        }
461        if let Some(ns) = purl.namespace() {
462            pkg.namespace = Some(truncate_field(ns.to_string()));
463        }
464        pkg.name = Some(truncate_field(purl.name().to_string()));
465        if let Some(ver) = purl.version() {
466            pkg.version = Some(truncate_field(ver.to_string()));
467        }
468        // Qualifiers
469        if !purl.qualifiers().is_empty() {
470            let quals: HashMap<String, String> = purl
471                .qualifiers()
472                .iter()
473                .map(|(k, v)| (k.to_string(), v.to_string()))
474                .collect();
475            pkg.qualifiers = Some(quals);
476        }
477        // Subpath
478        if let Some(sp) = purl.subpath() {
479            pkg.subpath = Some(sp.to_string());
480        }
481    }
482
483    pkg
484}
485
486fn metadata_value_from_expr(expr: &ast::AstExpr) -> Option<MetadataValue> {
487    if let Some(string) = expr_as_string(expr) {
488        return Some(MetadataValue::String(string));
489    }
490
491    let items = match &expr.node {
492        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
493        _ => return None,
494    };
495    let values: Vec<_> = items
496        .iter()
497        .take(MAX_ITERATION_COUNT)
498        .filter_map(expr_as_string)
499        .collect();
500    (!values.is_empty()).then_some(MetadataValue::List(values))
501}
502
503/// Extract package data from a single AST statement (for BUCK files)
504fn extract_build_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
505    let call = extract_call(statement)?;
506    let rule_name = match &call.func.node {
507        ast::ExprP::Identifier(identifier) => identifier.node.ident.as_str(),
508        _ => return None,
509    };
510
511    if !check_rule_name_ending(rule_name) {
512        return None;
513    }
514
515    let name = extract_named_kwarg_string(&call, "name");
516    let licenses = extract_named_kwarg_string_list(&call, "licenses");
517
518    let package_name = name?;
519    let (license_statements, license_references) = licenses
520        .as_deref()
521        .map(split_buck_license_values)
522        .unwrap_or_default();
523    let extracted_license_statement = if !license_statements.is_empty() {
524        Some(truncate_field(license_statements.join(", ")))
525    } else if !license_references.is_empty() {
526        Some(truncate_field(license_references.join(", ")))
527    } else {
528        None
529    };
530    let mut extra_data = HashMap::new();
531    insert_license_reference_extra_data(&mut extra_data, &license_references);
532
533    Some(PackageData {
534        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
535        name: Some(truncate_field(package_name)),
536        extracted_license_statement,
537        extra_data: (!extra_data.is_empty()).then_some(extra_data),
538        datasource_id: Some(DatasourceId::BuckFile),
539        ..Default::default()
540    })
541}
542
543fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
544    match &statement.node {
545        ast::StmtP::Expression(expr) => extract_call_expr(expr),
546        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
547        _ => None,
548    }
549}
550
551fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
552    match &expr.node {
553        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
554        _ => None,
555    }
556}
557
558fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
559    call.args
560        .args
561        .iter()
562        .find_map(|argument| match &argument.node {
563            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
564            _ => None,
565        })
566}
567
568fn extract_named_kwarg_string(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
569    extract_named_kwarg(call, key).and_then(expr_as_string)
570}
571
572fn extract_named_kwarg_string_list(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
573    let expr = extract_named_kwarg(call, key)?;
574    let items = match &expr.node {
575        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
576        _ => return None,
577    };
578    let values: Vec<_> = items
579        .iter()
580        .take(MAX_ITERATION_COUNT)
581        .filter_map(expr_as_string)
582        .collect();
583    (!values.is_empty()).then_some(values)
584}
585
586fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
587    match &expr.node {
588        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
589        _ => None,
590    }
591}
592
/// Check if rule name ends with "binary" or "library" (case-sensitive).
fn check_rule_name_ending(rule_name: &str) -> bool {
    const PACKAGE_RULE_SUFFIXES: [&str; 2] = ["binary", "library"];

    PACKAGE_RULE_SUFFIXES
        .iter()
        .any(|suffix| rule_name.ends_with(*suffix))
}
597
598/// Create fallback package data using parent directory name
599fn fallback_package_data(path: &Path) -> PackageData {
600    let name = path
601        .parent()
602        .and_then(|p| p.file_name())
603        .and_then(|n| n.to_str())
604        .map(|s| s.to_string());
605
606    PackageData {
607        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
608        name,
609        datasource_id: Some(DatasourceId::BuckFile),
610        ..Default::default()
611    }
612}
613
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Only the exact, case-sensitive file name `BUCK` is accepted.
    #[test]
    fn test_buck_build_is_match() {
        assert!(BuckBuildParser::is_match(&PathBuf::from("BUCK")));
        assert!(BuckBuildParser::is_match(&PathBuf::from("path/to/BUCK")));
        assert!(!BuckBuildParser::is_match(&PathBuf::from("BUILD")));
        assert!(!BuckBuildParser::is_match(&PathBuf::from("buck")));
    }

    /// Only the exact, case-sensitive file name `METADATA.bzl` is accepted.
    #[test]
    fn test_metadata_bzl_is_match() {
        assert!(BuckMetadataBzlParser::is_match(&PathBuf::from(
            "METADATA.bzl"
        )));
        assert!(BuckMetadataBzlParser::is_match(&PathBuf::from(
            "path/to/METADATA.bzl"
        )));
        assert!(!BuckMetadataBzlParser::is_match(&PathBuf::from(
            "metadata.bzl"
        )));
        assert!(!BuckMetadataBzlParser::is_match(&PathBuf::from("METADATA")));
    }

    #[test]
    fn test_check_rule_name_ending() {
        assert!(check_rule_name_ending("android_binary"));
        assert!(check_rule_name_ending("android_library"));
        assert!(check_rule_name_ending("java_binary"));
        assert!(!check_rule_name_ending("filegroup"));
    }

    /// Disabled lines are dropped and enabled alternatives take over their indentation.
    #[test]
    fn test_preprocess_starlark_content_handles_oss_guarded_alternatives() {
        let content = r#"# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_unittest.bzl", "rust_unittest")
prelude = native

# @oss-disable: rust_unittest(
    rust_test( # @oss-enable
        name = "test",
    )

platform_utils = None # @oss-enable
"#;

        let normalized = preprocess_starlark_content(content);

        assert!(!normalized.contains("@oss-disable"));
        assert!(!normalized.contains("@oss-enable"));
        assert!(normalized.contains("rust_test("));
        assert!(normalized.contains("platform_utils = None"));
        assert!(!normalized.contains("    rust_test("));
    }

    /// A BUCK file with OSS-guarded rules parses, and only the library rule yields a package.
    #[test]
    fn test_parse_buck_build_with_oss_guarded_rule() {
        let content = r#"# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_library.bzl", "rust_library")
# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_unittest.bzl", "rust_unittest")

oncall("build_infra")

rust_library(
    name = "library",
    srcs = ["src/lib.rs"],
)

# @oss-disable: rust_unittest(
    rust_test( # @oss-enable
    name = "test",
    srcs = ["tests/test.rs"],
)
"#;

        let temp_dir = tempfile::tempdir().unwrap();
        let buck_path = temp_dir.path().join("BUCK");
        std::fs::write(&buck_path, content).unwrap();

        let packages = parse_buck_build(&buck_path).expect("BUCK file should parse");

        assert_eq!(packages.len(), 1);
        assert_eq!(packages[0].package_type, Some(PackageType::Buck));
        assert_eq!(packages[0].name.as_deref(), Some("library"));
    }
}