Skip to main content

provenant/parsers/
buck.rs

1//! Buck BUILD and METADATA.bzl parsers
2//!
3//! Extracts package metadata from Buck build system files using Starlark (Python-like) syntax.
4//!
5//! ## Features
6//! - **BuckBuildParser**: Parses BUCK files with multiple package support
7//! - **BuckMetadataBzlParser**: Parses METADATA.bzl dictionary assignments with package_url support
8//!
9//! ## Usage
10//! - `BuckBuildParser::extract_packages()` - Returns ALL packages from BUCK file
11//! - `BuckMetadataBzlParser::extract_first_package()` - Returns single package from METADATA.bzl
12//!
13//! ## Reference
14//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py`
15//! - BuckPackageHandler (lines 310-325)
16//! - BuckMetadataBzlHandler (lines 328-432)
17
18use std::collections::HashMap;
19use std::path::Path;
20
21use crate::parser_warn as warn;
22use packageurl::PackageUrl;
23use starlark_syntax::syntax::ast;
24use starlark_syntax::syntax::module::AstModuleFields;
25use starlark_syntax::syntax::{AstModule, Dialect};
26
27use crate::models::{DatasourceId, PackageData, PackageType, Party, Sha1Digest};
28
29use super::PackageParser;
30
/// Argument list of a Starlark call expression, specialised to the payload-free AST.
type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;

/// Borrowed view of a single Starlark call expression.
struct StarlarkCall<'a> {
    /// Expression being called; BUCK rule detection reads an identifier name from it.
    func: &'a ast::AstExpr,
    /// Arguments of the call; searched for named (keyword) arguments.
    args: &'a StarlarkCallArgs,
}
37
38/// Parser for Buck BUCK files (build rules)
39pub struct BuckBuildParser;
40
41impl PackageParser for BuckBuildParser {
42    const PACKAGE_TYPE: PackageType = PackageType::Buck;
43
44    fn is_match(path: &Path) -> bool {
45        path.file_name()
46            .and_then(|name| name.to_str())
47            .is_some_and(|name| name == "BUCK")
48    }
49
50    fn extract_packages(path: &Path) -> Vec<PackageData> {
51        match parse_buck_build(path) {
52            Ok(packages) if !packages.is_empty() => packages,
53            Ok(_) => vec![fallback_package_data(path)],
54            Err(e) => {
55                warn!("Failed to parse Buck BUCK file {:?}: {}", path, e);
56                vec![fallback_package_data(path)]
57            }
58        }
59    }
60}
61
62/// Parser for Buck METADATA.bzl files (metadata dictionaries)
63pub struct BuckMetadataBzlParser;
64
65impl PackageParser for BuckMetadataBzlParser {
66    const PACKAGE_TYPE: PackageType = PackageType::Buck;
67
68    fn is_match(path: &Path) -> bool {
69        path.file_name()
70            .and_then(|name| name.to_str())
71            .is_some_and(|name| name == "METADATA.bzl")
72    }
73
74    fn extract_packages(path: &Path) -> Vec<PackageData> {
75        vec![match parse_metadata_bzl(path) {
76            Ok(pkg) => pkg,
77            Err(e) => {
78                warn!("Failed to parse Buck METADATA.bzl {:?}: {}", path, e);
79                PackageData {
80                    package_type: Some(Self::PACKAGE_TYPE),
81                    datasource_id: Some(DatasourceId::BuckMetadata),
82                    ..Default::default()
83                }
84            }
85        }]
86    }
87}
88
89/// Parse a Buck BUCK file (same logic as Bazel BUILD)
90fn parse_buck_build(path: &Path) -> Result<Vec<PackageData>, String> {
91    let content =
92        std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
93    let module = parse_starlark_module("<BUCK>", content)?;
94
95    let mut packages = Vec::new();
96
97    for statement in top_level_statements(&module) {
98        if let Some(package_data) = extract_build_package_from_statement(statement) {
99            packages.push(package_data);
100        }
101    }
102
103    Ok(packages)
104}
105
106/// Parse a Buck METADATA.bzl file
107fn parse_metadata_bzl(path: &Path) -> Result<PackageData, String> {
108    let content =
109        std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
110    let module = parse_starlark_module("<METADATA.bzl>", content)?;
111
112    // Look for METADATA = {...} assignment
113    for statement in top_level_statements(&module) {
114        if let Some(dict) = extract_metadata_assignment_dict(statement) {
115            return Ok(extract_metadata_dict(dict));
116        }
117    }
118
119    // No METADATA found
120    Ok(PackageData {
121        package_type: Some(BuckMetadataBzlParser::PACKAGE_TYPE),
122        datasource_id: Some(DatasourceId::BuckMetadata),
123        ..Default::default()
124    })
125}
126
127fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
128    let content = preprocess_starlark_content(&content);
129    let dialect = Dialect {
130        enable_top_level_stmt: true,
131        ..Dialect::Standard
132    };
133    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
134}
135
/// Rewrites `@oss-disable` / `@oss-enable` guarded lines into plain Starlark.
///
/// Line by line:
/// - a comment line containing `@oss-disable` is dropped, and its indentation
///   is remembered for the line that follows;
/// - a line containing `# @oss-enable` keeps only the code before the marker
///   (the line is dropped when nothing but whitespace precedes it); if it
///   directly follows a dropped `@oss-disable` line it is re-indented to that
///   line's indentation;
/// - every other line is copied unchanged and clears the remembered indentation.
///
/// Line endings are emitted as `\n`; a trailing newline is not added to input
/// that did not end with one.
fn preprocess_starlark_content(content: &str) -> String {
    let mut out = String::with_capacity(content.len());
    let mut carried_indent: Option<String> = None;

    for line in content.lines() {
        let body = line.trim_start();
        let (leading, _) = line.split_at(line.len() - body.len());

        let is_disable_comment = body.starts_with('#') && body.contains("@oss-disable");
        if is_disable_comment {
            carried_indent = Some(leading.to_owned());
            continue;
        }

        match line.split_once("# @oss-enable") {
            Some((before_marker, _)) => {
                let kept = before_marker.trim_end();
                if kept.is_empty() {
                    continue;
                }
                match carried_indent.take() {
                    // Replacement for a disabled line: adopt that line's indentation.
                    Some(indent) => {
                        out.push_str(&indent);
                        out.push_str(kept.trim_start());
                    }
                    None => out.push_str(kept),
                }
            }
            None => {
                carried_indent = None;
                out.push_str(line);
            }
        }
        out.push('\n');
    }

    let source_ends_with_newline = content.ends_with('\n');
    if !source_ends_with_newline && out.ends_with('\n') {
        out.pop();
    }

    out
}
175
176fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
177    match &module.statement().node {
178        ast::StmtP::Statements(statements) => statements,
179        _ => std::slice::from_ref(module.statement()),
180    }
181}
182
183fn extract_metadata_assignment_dict(
184    statement: &ast::AstStmt,
185) -> Option<&[(ast::AstExpr, ast::AstExpr)]> {
186    let ast::StmtP::Assign(assign) = &statement.node else {
187        return None;
188    };
189    let ast::AssignTargetP::Identifier(target) = &assign.lhs.node else {
190        return None;
191    };
192    if target.node.ident != "METADATA" {
193        return None;
194    }
195    match &assign.rhs.node {
196        ast::ExprP::Dict(items) => Some(items.as_slice()),
197        _ => None,
198    }
199}
200
201/// Extract metadata from a dictionary AST node
202fn extract_metadata_dict(dict: &[(ast::AstExpr, ast::AstExpr)]) -> PackageData {
203    let mut fields: HashMap<String, MetadataValue> = HashMap::new();
204
205    for (key, value) in dict {
206        let Some(key_name) = expr_as_string(key) else {
207            continue;
208        };
209        let Some(metadata_value) = metadata_value_from_expr(value) else {
210            continue;
211        };
212
213        fields.insert(key_name, metadata_value);
214    }
215
216    build_package_from_metadata(fields)
217}
218
219fn get_metadata_string(fields: &HashMap<String, MetadataValue>, keys: &[&str]) -> Option<String> {
220    keys.iter().find_map(|key| match fields.get(*key) {
221        Some(MetadataValue::String(value)) => Some(value.clone()),
222        _ => None,
223    })
224}
225
226fn get_metadata_list(
227    fields: &HashMap<String, MetadataValue>,
228    keys: &[&str],
229) -> Option<Vec<String>> {
230    keys.iter().find_map(|key| match fields.get(*key) {
231        Some(MetadataValue::List(values)) => Some(values.clone()),
232        _ => None,
233    })
234}
235
/// Value of a `METADATA` dictionary entry, restricted to the shapes the
/// parser understands.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MetadataValue {
    /// A single string literal.
    String(String),
    /// The string literals of a list or tuple.
    List(Vec<String>),
}
241
242fn split_buck_license_values(values: &[String]) -> (Vec<String>, Vec<String>) {
243    let mut statements = Vec::new();
244    let mut references = Vec::new();
245
246    for value in values {
247        if is_probable_local_license_reference(value) {
248            references.push(value.clone());
249        } else {
250            statements.push(value.clone());
251        }
252    }
253
254    (statements, references)
255}
256
/// Heuristically decides whether a license value names a local license file
/// rather than a license identifier or expression.
///
/// After trimming and ASCII-lowercasing, a value is treated as a file
/// reference when it
/// - contains a path separator (`/` or `\`), or
/// - ends with a common text-file extension (`.txt`, `.md`, `.rst`, `.html`), or
/// - starts with a conventional license file name (`license`, `licence`,
///   `copying`, `notice`, `copyright`).
///
/// SPDX user-defined identifiers (`LicenseRef-...`) begin with `license` but
/// are license identifiers, so the file-name prefix rule does not apply to
/// them; the separator and extension rules still do. Empty or
/// whitespace-only values are never references.
fn is_probable_local_license_reference(value: &str) -> bool {
    const FILE_NAME_PREFIXES: [&str; 5] = ["license", "licence", "copying", "notice", "copyright"];
    const FILE_EXTENSIONS: [&str; 4] = [".txt", ".md", ".rst", ".html"];
    const SPDX_LICENSE_REF_PREFIX: &str = "licenseref-";

    let lower = value.trim().to_ascii_lowercase();
    if lower.is_empty() {
        return false;
    }

    if lower.contains('/') || lower.contains('\\') {
        return true;
    }
    if FILE_EXTENSIONS.iter().any(|extension| lower.ends_with(*extension)) {
        return true;
    }
    // `LicenseRef-...` is SPDX identifier syntax, not a `LICENSE*` file name.
    if lower.starts_with(SPDX_LICENSE_REF_PREFIX) {
        return false;
    }

    FILE_NAME_PREFIXES.iter().any(|prefix| lower.starts_with(*prefix))
}
276
277fn insert_license_reference_extra_data(
278    extra_data: &mut HashMap<String, serde_json::Value>,
279    references: &[String],
280) {
281    match references {
282        [] => {}
283        [reference] => {
284            extra_data.insert(
285                "license_file".to_string(),
286                serde_json::Value::String(reference.clone()),
287            );
288        }
289        _ => {
290            extra_data.insert(
291                "license_files".to_string(),
292                serde_json::Value::Array(
293                    references
294                        .iter()
295                        .cloned()
296                        .map(serde_json::Value::String)
297                        .collect(),
298                ),
299            );
300        }
301    }
302}
303
304/// Build PackageData from extracted metadata fields
305fn build_package_from_metadata(fields: HashMap<String, MetadataValue>) -> PackageData {
306    let mut pkg = PackageData {
307        package_type: Some(BuckMetadataBzlParser::PACKAGE_TYPE),
308        datasource_id: Some(DatasourceId::BuckMetadata),
309        ..Default::default()
310    };
311    let mut license_references = Vec::new();
312
313    // Extract name
314    if let Some(name) = get_metadata_string(&fields, &["name"]) {
315        pkg.name = Some(name);
316    }
317
318    // Extract version
319    if let Some(version) = get_metadata_string(&fields, &["version"]) {
320        pkg.version = Some(version);
321    }
322
323    // Extract namespace from explicit metadata when present.
324    if let Some(namespace) = get_metadata_string(&fields, &["namespace"]) {
325        pkg.namespace = Some(namespace);
326    }
327
328    // Extract package type from canonical or legacy ecosystem fields.
329    // Intentionally ignore `upstream_type`: it does not describe the purl package type.
330    if let Some(ecosystem) = get_metadata_string(&fields, &["ecosystem", "type", "package_type"])
331        && let Ok(package_type) = ecosystem.parse::<PackageType>()
332    {
333        pkg.package_type = Some(package_type);
334    }
335
336    // Extract licenses (licenses or license_expression)
337    if let Some(licenses) = get_metadata_list(&fields, &["licenses"]) {
338        let (license_statements, references) = split_buck_license_values(&licenses);
339        license_references = references;
340        let extracted_license_statement = if !license_statements.is_empty() {
341            Some(license_statements.join(", "))
342        } else if !license_references.is_empty() {
343            Some(license_references.join(", "))
344        } else {
345            None
346        };
347        pkg.extracted_license_statement = extracted_license_statement;
348    } else if let Some(license_expression) = get_metadata_string(&fields, &["license_expression"]) {
349        pkg.extracted_license_statement = Some(license_expression);
350    }
351
352    if let Some(copyright) = get_metadata_list(&fields, &["copyrights"]) {
353        if !copyright.is_empty() {
354            pkg.copyright = Some(copyright.join("\n"));
355        }
356    } else if let Some(copyright) = get_metadata_string(&fields, &["copyright"]) {
357        pkg.copyright = Some(copyright);
358    }
359
360    // Extract homepage (upstream_address, upstream_url, or homepage_url)
361    if let Some(homepage_url) = get_metadata_string(
362        &fields,
363        &["upstream_address", "upstream_url", "homepage_url"],
364    ) {
365        pkg.homepage_url = Some(homepage_url);
366    }
367
368    // Extract download_url
369    if let Some(download_url) = get_metadata_string(&fields, &["download_url"]) {
370        pkg.download_url = Some(download_url);
371    }
372
373    // Extract vcs_url
374    if let Some(vcs_url) = get_metadata_string(&fields, &["vcs_url"]) {
375        pkg.vcs_url = Some(vcs_url);
376    }
377
378    // Extract sha1 (download_archive_sha1)
379    if let Some(sha1) = get_metadata_string(&fields, &["download_archive_sha1"]) {
380        pkg.sha1 = Sha1Digest::from_hex(&sha1).ok();
381    }
382
383    // Extract maintainers
384    if let Some(maintainers) = get_metadata_list(&fields, &["maintainers"]) {
385        pkg.parties.extend(maintainers.iter().map(|name| Party {
386            r#type: Some("organization".to_string()),
387            name: Some(name.clone()),
388            role: Some("maintainer".to_string()),
389            email: None,
390            url: None,
391            organization: None,
392            organization_url: None,
393            timezone: None,
394        }));
395    }
396
397    if let Some(vendor) = get_metadata_string(&fields, &["vendor", "publisher"]) {
398        pkg.parties.push(Party {
399            r#type: None,
400            name: Some(vendor),
401            role: Some("publisher".to_string()),
402            email: None,
403            url: None,
404            organization: None,
405            organization_url: None,
406            timezone: None,
407        });
408    }
409
410    // Extract extra_data fields
411    let mut extra_data = HashMap::new();
412    if let Some(vcs_commit_hash) = get_metadata_string(&fields, &["vcs_commit_hash"]) {
413        extra_data.insert(
414            "vcs_commit_hash".to_string(),
415            serde_json::Value::String(vcs_commit_hash),
416        );
417    }
418    if let Some(upstream_hash) =
419        get_metadata_string(&fields, &["upstream_hash", "upstream_commit_hash"])
420    {
421        extra_data.insert(
422            "upstream_hash".to_string(),
423            serde_json::Value::String(upstream_hash),
424        );
425    }
426    if let Some(upstream_branch) = get_metadata_string(&fields, &["upstream_branch"]) {
427        extra_data.insert(
428            "upstream_branch".to_string(),
429            serde_json::Value::String(upstream_branch),
430        );
431    }
432    insert_license_reference_extra_data(&mut extra_data, &license_references);
433    if !extra_data.is_empty() {
434        pkg.extra_data = Some(extra_data);
435    }
436
437    // Parse package_url if present and update package fields
438    if let Some(purl_str) = get_metadata_string(&fields, &["package_url"])
439        && let Ok(purl) = purl_str.parse::<PackageUrl>()
440    {
441        pkg.purl = Some(purl.to_string());
442
443        // Override package fields with purl data
444        if let Ok(package_type) = purl.ty().parse::<PackageType>() {
445            pkg.package_type = Some(package_type);
446        }
447        if let Some(ns) = purl.namespace() {
448            pkg.namespace = Some(ns.to_string());
449        }
450        pkg.name = Some(purl.name().to_string());
451        if let Some(ver) = purl.version() {
452            pkg.version = Some(ver.to_string());
453        }
454        // Qualifiers
455        if !purl.qualifiers().is_empty() {
456            let quals: HashMap<String, String> = purl
457                .qualifiers()
458                .iter()
459                .map(|(k, v)| (k.to_string(), v.to_string()))
460                .collect();
461            pkg.qualifiers = Some(quals);
462        }
463        // Subpath
464        if let Some(sp) = purl.subpath() {
465            pkg.subpath = Some(sp.to_string());
466        }
467    }
468
469    pkg
470}
471
472fn metadata_value_from_expr(expr: &ast::AstExpr) -> Option<MetadataValue> {
473    if let Some(string) = expr_as_string(expr) {
474        return Some(MetadataValue::String(string));
475    }
476
477    let items = match &expr.node {
478        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
479        _ => return None,
480    };
481    let values: Vec<_> = items.iter().filter_map(expr_as_string).collect();
482    (!values.is_empty()).then_some(MetadataValue::List(values))
483}
484
485/// Extract package data from a single AST statement (for BUCK files)
486fn extract_build_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
487    let call = extract_call(statement)?;
488    let rule_name = match &call.func.node {
489        ast::ExprP::Identifier(identifier) => identifier.node.ident.as_str(),
490        _ => return None,
491    };
492
493    if !check_rule_name_ending(rule_name) {
494        return None;
495    }
496
497    let name = extract_named_kwarg_string(&call, "name");
498    let licenses = extract_named_kwarg_string_list(&call, "licenses");
499
500    let package_name = name?;
501    let (license_statements, license_references) = licenses
502        .as_deref()
503        .map(split_buck_license_values)
504        .unwrap_or_default();
505    let extracted_license_statement = if !license_statements.is_empty() {
506        Some(license_statements.join(", "))
507    } else if !license_references.is_empty() {
508        Some(license_references.join(", "))
509    } else {
510        None
511    };
512    let mut extra_data = HashMap::new();
513    insert_license_reference_extra_data(&mut extra_data, &license_references);
514
515    Some(PackageData {
516        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
517        name: Some(package_name),
518        extracted_license_statement,
519        extra_data: (!extra_data.is_empty()).then_some(extra_data),
520        datasource_id: Some(DatasourceId::BuckFile),
521        ..Default::default()
522    })
523}
524
525fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
526    match &statement.node {
527        ast::StmtP::Expression(expr) => extract_call_expr(expr),
528        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
529        _ => None,
530    }
531}
532
533fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
534    match &expr.node {
535        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
536        _ => None,
537    }
538}
539
540fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
541    call.args
542        .args
543        .iter()
544        .find_map(|argument| match &argument.node {
545            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
546            _ => None,
547        })
548}
549
550fn extract_named_kwarg_string(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
551    extract_named_kwarg(call, key).and_then(expr_as_string)
552}
553
554fn extract_named_kwarg_string_list(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
555    let expr = extract_named_kwarg(call, key)?;
556    let items = match &expr.node {
557        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
558        _ => return None,
559    };
560    let values: Vec<_> = items.iter().filter_map(expr_as_string).collect();
561    (!values.is_empty()).then_some(values)
562}
563
564fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
565    match &expr.node {
566        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
567        _ => None,
568    }
569}
570
/// Reports whether a rule name ends with `binary` or `library`.
fn check_rule_name_ending(rule_name: &str) -> bool {
    const PACKAGE_RULE_SUFFIXES: [&str; 2] = ["binary", "library"];
    PACKAGE_RULE_SUFFIXES
        .iter()
        .any(|suffix| rule_name.ends_with(*suffix))
}
575
576/// Create fallback package data using parent directory name
577fn fallback_package_data(path: &Path) -> PackageData {
578    let name = path
579        .parent()
580        .and_then(|p| p.file_name())
581        .and_then(|n| n.to_str())
582        .map(|s| s.to_string());
583
584    PackageData {
585        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
586        name,
587        datasource_id: Some(DatasourceId::BuckFile),
588        ..Default::default()
589    }
590}
591
/// Unit tests for file matching, rule-name filtering and `@oss-*` preprocessing.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// `BUCK` is matched by exact, case-sensitive file name, with or without a directory.
    #[test]
    fn test_buck_build_is_match() {
        assert!(BuckBuildParser::is_match(&PathBuf::from("BUCK")));
        assert!(BuckBuildParser::is_match(&PathBuf::from("path/to/BUCK")));
        assert!(!BuckBuildParser::is_match(&PathBuf::from("BUILD")));
        assert!(!BuckBuildParser::is_match(&PathBuf::from("buck")));
    }

    /// `METADATA.bzl` is matched by exact, case-sensitive file name including the extension.
    #[test]
    fn test_metadata_bzl_is_match() {
        assert!(BuckMetadataBzlParser::is_match(&PathBuf::from(
            "METADATA.bzl"
        )));
        assert!(BuckMetadataBzlParser::is_match(&PathBuf::from(
            "path/to/METADATA.bzl"
        )));
        assert!(!BuckMetadataBzlParser::is_match(&PathBuf::from(
            "metadata.bzl"
        )));
        assert!(!BuckMetadataBzlParser::is_match(&PathBuf::from("METADATA")));
    }

    /// Only rule names ending in `binary` or `library` are accepted.
    #[test]
    fn test_check_rule_name_ending() {
        assert!(check_rule_name_ending("android_binary"));
        assert!(check_rule_name_ending("android_library"));
        assert!(check_rule_name_ending("java_binary"));
        assert!(!check_rule_name_ending("filegroup"));
    }

    /// Disabled lines are dropped, enable markers are stripped, and an enabled
    /// replacement line takes the indentation of the disabled line before it.
    #[test]
    fn test_preprocess_starlark_content_handles_oss_guarded_alternatives() {
        let content = r#"# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_unittest.bzl", "rust_unittest")
prelude = native

# @oss-disable: rust_unittest(
    rust_test( # @oss-enable
        name = "test",
    )

platform_utils = None # @oss-enable
"#;

        let normalized = preprocess_starlark_content(content);

        assert!(!normalized.contains("@oss-disable"));
        assert!(!normalized.contains("@oss-enable"));
        assert!(normalized.contains("rust_test("));
        assert!(normalized.contains("platform_utils = None"));
        // The replacement line must have lost its original four-space indent.
        assert!(!normalized.contains("    rust_test("));
    }

    /// A BUCK file using `@oss-*` guards parses end to end; only the
    /// `rust_library` rule yields a package because `rust_test` does not end
    /// in `binary` or `library`.
    #[test]
    fn test_parse_buck_build_with_oss_guarded_rule() {
        let content = r#"# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_library.bzl", "rust_library")
# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_unittest.bzl", "rust_unittest")

oncall("build_infra")

rust_library(
    name = "library",
    srcs = ["src/lib.rs"],
)

# @oss-disable: rust_unittest(
    rust_test( # @oss-enable
    name = "test",
    srcs = ["tests/test.rs"],
)
"#;

        let temp_dir = tempfile::tempdir().unwrap();
        let buck_path = temp_dir.path().join("BUCK");
        std::fs::write(&buck_path, content).unwrap();

        let packages = parse_buck_build(&buck_path).expect("BUCK file should parse");

        assert_eq!(packages.len(), 1);
        assert_eq!(packages[0].package_type, Some(PackageType::Buck));
        assert_eq!(packages[0].name.as_deref(), Some("library"));
    }
}
679
// Invokes the crate's `register_parser!` macro for this module's two parsers.
// NOTE(review): the positional arguments appear to be a description, the path
// globs handled, the package type, a primary language (empty here) and a
// documentation URL — confirm against the macro definition.
crate::register_parser!(
    "Buck build file and METADATA.bzl",
    &["**/BUCK", "**/METADATA.bzl"],
    "buck",
    "",
    Some("https://buck.build/"),
);