Skip to main content

provenant/parsers/
buck.rs

1//! Buck BUILD and METADATA.bzl parsers
2//!
3//! Extracts package metadata from Buck build system files using Starlark (Python-like) syntax.
4//!
5//! ## Features
6//! - **BuckBuildParser**: Parses BUCK files with multiple package support
7//! - **BuckMetadataBzlParser**: Parses METADATA.bzl dictionary assignments with package_url support
8//!
9//! ## Usage
10//! - `BuckBuildParser::extract_packages()` - Returns ALL packages from BUCK file
11//! - `BuckMetadataBzlParser::extract_first_package()` - Returns single package from METADATA.bzl
12//!
13//! ## Reference
14//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py`
15//! - BuckPackageHandler (lines 310-325)
16//! - BuckMetadataBzlHandler (lines 328-432)
17
18use std::collections::HashMap;
19use std::path::Path;
20
21use crate::parser_warn as warn;
22use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
23use packageurl::PackageUrl;
24use starlark_syntax::syntax::ast;
25use starlark_syntax::syntax::module::AstModuleFields;
26use starlark_syntax::syntax::{AstModule, Dialect};
27
28use crate::models::{DatasourceId, PackageData, PackageType, Party, Sha1Digest};
29
30use super::PackageParser;
31
32type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;
33
34struct StarlarkCall<'a> {
35    func: &'a ast::AstExpr,
36    args: &'a StarlarkCallArgs,
37}
38
39/// Parser for Buck BUCK files (build rules)
40pub struct BuckBuildParser;
41
42impl PackageParser for BuckBuildParser {
43    const PACKAGE_TYPE: PackageType = PackageType::Buck;
44
45    fn is_match(path: &Path) -> bool {
46        path.file_name()
47            .and_then(|name| name.to_str())
48            .is_some_and(|name| name == "BUCK")
49    }
50
51    fn extract_packages(path: &Path) -> Vec<PackageData> {
52        match parse_buck_build(path) {
53            Ok(packages) if !packages.is_empty() => packages,
54            Ok(_) => vec![fallback_package_data(path)],
55            Err(e) => {
56                warn!("Failed to parse Buck BUCK file {:?}: {}", path, e);
57                vec![fallback_package_data(path)]
58            }
59        }
60    }
61}
62
63/// Parser for Buck METADATA.bzl files (metadata dictionaries)
64pub struct BuckMetadataBzlParser;
65
66impl PackageParser for BuckMetadataBzlParser {
67    const PACKAGE_TYPE: PackageType = PackageType::Buck;
68
69    fn is_match(path: &Path) -> bool {
70        path.file_name()
71            .and_then(|name| name.to_str())
72            .is_some_and(|name| name == "METADATA.bzl")
73    }
74
75    fn extract_packages(path: &Path) -> Vec<PackageData> {
76        vec![match parse_metadata_bzl(path) {
77            Ok(pkg) => pkg,
78            Err(e) => {
79                warn!("Failed to parse Buck METADATA.bzl {:?}: {}", path, e);
80                PackageData {
81                    package_type: Some(Self::PACKAGE_TYPE),
82                    datasource_id: Some(DatasourceId::BuckMetadata),
83                    ..Default::default()
84                }
85            }
86        }]
87    }
88}
89
90/// Parse a Buck BUCK file (same logic as Bazel BUILD)
91fn parse_buck_build(path: &Path) -> Result<Vec<PackageData>, String> {
92    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
93    let module = parse_starlark_module("<BUCK>", content)?;
94
95    let mut packages = Vec::new();
96
97    for statement in top_level_statements(&module)
98        .iter()
99        .take(MAX_ITERATION_COUNT)
100    {
101        if let Some(package_data) = extract_build_package_from_statement(statement) {
102            packages.push(package_data);
103        }
104    }
105
106    Ok(packages)
107}
108
109/// Parse a Buck METADATA.bzl file
110fn parse_metadata_bzl(path: &Path) -> Result<PackageData, String> {
111    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
112    let module = parse_starlark_module("<METADATA.bzl>", content)?;
113
114    for statement in top_level_statements(&module)
115        .iter()
116        .take(MAX_ITERATION_COUNT)
117    {
118        if let Some(dict) = extract_metadata_assignment_dict(statement) {
119            return Ok(extract_metadata_dict(dict));
120        }
121    }
122
123    // No METADATA found
124    Ok(PackageData {
125        package_type: Some(BuckMetadataBzlParser::PACKAGE_TYPE),
126        datasource_id: Some(DatasourceId::BuckMetadata),
127        ..Default::default()
128    })
129}
130
131fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
132    let content = preprocess_starlark_content(&content);
133    let dialect = Dialect {
134        enable_top_level_stmt: true,
135        ..Dialect::Standard
136    };
137    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
138}
139
/// Rewrites `@oss-disable` / `@oss-enable` guarded lines into plain Starlark.
///
/// - A comment line containing `@oss-disable` is dropped, and its indentation is
///   remembered for the line that immediately follows.
/// - A line containing `# @oss-enable` keeps only the code before the marker. If
///   it directly follows a dropped `@oss-disable` comment, the code is re-indented
///   to that comment's indentation. Lines with no code before the marker are dropped.
/// - Every other line is copied unchanged and clears the remembered indentation.
///
/// Lines are joined with `\n`; a trailing newline is emitted only if the input had one.
fn preprocess_starlark_content(content: &str) -> String {
    const DISABLE_TAG: &str = "@oss-disable";
    const ENABLE_MARKER: &str = "# @oss-enable";

    let mut output = String::with_capacity(content.len());
    // Indentation of the `@oss-disable` comment seen on the previous line, if any.
    let mut disabled_indent: Option<&str> = None;

    for line in content.lines() {
        let body = line.trim_start();
        let (indent, _) = line.split_at(line.len() - body.len());

        if body.starts_with('#') && body.contains(DISABLE_TAG) {
            disabled_indent = Some(indent);
            continue;
        }

        let Some((before_marker, _)) = line.split_once(ENABLE_MARKER) else {
            // Ordinary line: copy it verbatim and forget any pending replacement.
            disabled_indent = None;
            output.push_str(line);
            output.push('\n');
            continue;
        };

        let code = before_marker.trim_end();
        if code.is_empty() {
            // Marker-only line: nothing to emit, pending indentation is left untouched.
            continue;
        }

        match disabled_indent.take() {
            Some(replacement_indent) => {
                output.push_str(replacement_indent);
                output.push_str(code.trim_start());
            }
            None => output.push_str(code),
        }
        output.push('\n');
    }

    // Mirror the input: do not invent a trailing newline that was not there.
    if !content.ends_with('\n') && output.ends_with('\n') {
        output.pop();
    }

    output
}
179
180fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
181    match &module.statement().node {
182        ast::StmtP::Statements(statements) => statements,
183        _ => std::slice::from_ref(module.statement()),
184    }
185}
186
187fn extract_metadata_assignment_dict(
188    statement: &ast::AstStmt,
189) -> Option<&[(ast::AstExpr, ast::AstExpr)]> {
190    let ast::StmtP::Assign(assign) = &statement.node else {
191        return None;
192    };
193    let ast::AssignTargetP::Identifier(target) = &assign.lhs.node else {
194        return None;
195    };
196    if target.node.ident != "METADATA" {
197        return None;
198    }
199    match &assign.rhs.node {
200        ast::ExprP::Dict(items) => Some(items.as_slice()),
201        _ => None,
202    }
203}
204
205/// Extract metadata from a dictionary AST node
206fn extract_metadata_dict(dict: &[(ast::AstExpr, ast::AstExpr)]) -> PackageData {
207    let mut fields: HashMap<String, MetadataValue> = HashMap::new();
208
209    for (key, value) in dict.iter().take(MAX_ITERATION_COUNT) {
210        let Some(key_name) = expr_as_string(key) else {
211            continue;
212        };
213        let Some(metadata_value) = metadata_value_from_expr(value) else {
214            continue;
215        };
216
217        fields.insert(key_name, metadata_value);
218    }
219
220    build_package_from_metadata(fields)
221}
222
223fn get_metadata_string(fields: &HashMap<String, MetadataValue>, keys: &[&str]) -> Option<String> {
224    keys.iter().find_map(|key| match fields.get(*key) {
225        Some(MetadataValue::String(value)) => Some(value.clone()),
226        _ => None,
227    })
228}
229
230fn get_metadata_list(
231    fields: &HashMap<String, MetadataValue>,
232    keys: &[&str],
233) -> Option<Vec<String>> {
234    keys.iter().find_map(|key| match fields.get(*key) {
235        Some(MetadataValue::List(values)) => Some(values.clone()),
236        _ => None,
237    })
238}
239
/// Value shapes kept from a `METADATA` dictionary entry.
enum MetadataValue {
    /// A single string literal.
    String(String),
    /// The string literals collected from a list or tuple.
    List(Vec<String>),
}
245
246fn split_buck_license_values(values: &[String]) -> (Vec<String>, Vec<String>) {
247    let mut statements = Vec::new();
248    let mut references = Vec::new();
249
250    for value in values {
251        if is_probable_local_license_reference(value) {
252            references.push(value.clone());
253        } else {
254            statements.push(value.clone());
255        }
256    }
257
258    (statements, references)
259}
260
/// Heuristically decides whether a license value names a local file rather than
/// a license identifier.
///
/// After trimming and ASCII-lowercasing, a non-empty value matches when it
/// contains a path separator (`/` or `\`), starts with one of the well-known
/// license file stems, or ends with one of the listed document extensions.
fn is_probable_local_license_reference(value: &str) -> bool {
    const FILE_STEMS: [&str; 5] = ["license", "licence", "copying", "notice", "copyright"];
    const DOC_EXTENSIONS: [&str; 4] = [".txt", ".md", ".rst", ".html"];

    let candidate = value.trim().to_ascii_lowercase();
    if candidate.is_empty() {
        return false;
    }

    candidate.contains(['/', '\\'])
        || FILE_STEMS.iter().any(|&stem| candidate.starts_with(stem))
        || DOC_EXTENSIONS
            .iter()
            .any(|&extension| candidate.ends_with(extension))
}
280
281fn insert_license_reference_extra_data(
282    extra_data: &mut HashMap<String, serde_json::Value>,
283    references: &[String],
284) {
285    match references {
286        [] => {}
287        [reference] => {
288            extra_data.insert(
289                "license_file".to_string(),
290                serde_json::Value::String(reference.clone()),
291            );
292        }
293        _ => {
294            extra_data.insert(
295                "license_files".to_string(),
296                serde_json::Value::Array(
297                    references
298                        .iter()
299                        .cloned()
300                        .map(serde_json::Value::String)
301                        .collect(),
302                ),
303            );
304        }
305    }
306}
307
308/// Build PackageData from extracted metadata fields
309fn build_package_from_metadata(fields: HashMap<String, MetadataValue>) -> PackageData {
310    let mut pkg = PackageData {
311        package_type: Some(BuckMetadataBzlParser::PACKAGE_TYPE),
312        datasource_id: Some(DatasourceId::BuckMetadata),
313        ..Default::default()
314    };
315    let mut license_references = Vec::new();
316
317    // Extract name
318    if let Some(name) = get_metadata_string(&fields, &["name"]) {
319        pkg.name = Some(truncate_field(name));
320    }
321
322    // Extract version
323    if let Some(version) = get_metadata_string(&fields, &["version"]) {
324        pkg.version = Some(truncate_field(version));
325    }
326
327    // Extract namespace from explicit metadata when present.
328    if let Some(namespace) = get_metadata_string(&fields, &["namespace"]) {
329        pkg.namespace = Some(truncate_field(namespace));
330    }
331
332    // Extract package type from canonical or legacy ecosystem fields.
333    // Intentionally ignore `upstream_type`: it does not describe the purl package type.
334    if let Some(ecosystem) = get_metadata_string(&fields, &["ecosystem", "type", "package_type"])
335        && let Ok(package_type) = ecosystem.parse::<PackageType>()
336    {
337        pkg.package_type = Some(package_type);
338    }
339
340    // Extract licenses (licenses or license_expression)
341    if let Some(licenses) = get_metadata_list(&fields, &["licenses"]) {
342        let (license_statements, references) = split_buck_license_values(&licenses);
343        license_references = references;
344        let extracted_license_statement = if !license_statements.is_empty() {
345            Some(license_statements.join(", "))
346        } else if !license_references.is_empty() {
347            Some(license_references.join(", "))
348        } else {
349            None
350        };
351        pkg.extracted_license_statement = extracted_license_statement.map(truncate_field);
352    } else if let Some(license_expression) = get_metadata_string(&fields, &["license_expression"]) {
353        pkg.extracted_license_statement = Some(truncate_field(license_expression));
354    }
355
356    if let Some(copyright) = get_metadata_list(&fields, &["copyrights"]) {
357        if !copyright.is_empty() {
358            pkg.copyright = Some(truncate_field(copyright.join("\n")));
359        }
360    } else if let Some(copyright) = get_metadata_string(&fields, &["copyright"]) {
361        pkg.copyright = Some(truncate_field(copyright));
362    }
363
364    // Extract homepage (upstream_address, upstream_url, or homepage_url)
365    if let Some(homepage_url) = get_metadata_string(
366        &fields,
367        &["upstream_address", "upstream_url", "homepage_url"],
368    ) {
369        pkg.homepage_url = Some(truncate_field(homepage_url));
370    }
371
372    // Extract download_url
373    if let Some(download_url) = get_metadata_string(&fields, &["download_url"]) {
374        pkg.download_url = Some(truncate_field(download_url));
375    }
376
377    // Extract vcs_url
378    if let Some(vcs_url) = get_metadata_string(&fields, &["vcs_url"]) {
379        pkg.vcs_url = Some(truncate_field(vcs_url));
380    }
381
382    // Extract sha1 (download_archive_sha1)
383    if let Some(sha1) = get_metadata_string(&fields, &["download_archive_sha1"]) {
384        pkg.sha1 = Sha1Digest::from_hex(&sha1).ok();
385    }
386
387    // Extract maintainers
388    if let Some(maintainers) = get_metadata_list(&fields, &["maintainers"]) {
389        pkg.parties.extend(maintainers.iter().map(|name| Party {
390            r#type: Some("organization".to_string()),
391            name: Some(name.clone()),
392            role: Some("maintainer".to_string()),
393            email: None,
394            url: None,
395            organization: None,
396            organization_url: None,
397            timezone: None,
398        }));
399    }
400
401    if let Some(vendor) = get_metadata_string(&fields, &["vendor", "publisher"]) {
402        pkg.parties.push(Party {
403            r#type: None,
404            name: Some(vendor),
405            role: Some("publisher".to_string()),
406            email: None,
407            url: None,
408            organization: None,
409            organization_url: None,
410            timezone: None,
411        });
412    }
413
414    // Extract extra_data fields
415    let mut extra_data = HashMap::new();
416    if let Some(vcs_commit_hash) = get_metadata_string(&fields, &["vcs_commit_hash"]) {
417        extra_data.insert(
418            "vcs_commit_hash".to_string(),
419            serde_json::Value::String(vcs_commit_hash),
420        );
421    }
422    if let Some(upstream_hash) =
423        get_metadata_string(&fields, &["upstream_hash", "upstream_commit_hash"])
424    {
425        extra_data.insert(
426            "upstream_hash".to_string(),
427            serde_json::Value::String(upstream_hash),
428        );
429    }
430    if let Some(upstream_branch) = get_metadata_string(&fields, &["upstream_branch"]) {
431        extra_data.insert(
432            "upstream_branch".to_string(),
433            serde_json::Value::String(upstream_branch),
434        );
435    }
436    insert_license_reference_extra_data(&mut extra_data, &license_references);
437    if !extra_data.is_empty() {
438        pkg.extra_data = Some(extra_data);
439    }
440
441    // Parse package_url if present and update package fields
442    if let Some(purl_str) = get_metadata_string(&fields, &["package_url"])
443        && let Ok(purl) = purl_str.parse::<PackageUrl>()
444    {
445        pkg.purl = Some(truncate_field(purl.to_string()));
446
447        if let Ok(package_type) = purl.ty().parse::<PackageType>() {
448            pkg.package_type = Some(package_type);
449        }
450        if let Some(ns) = purl.namespace() {
451            pkg.namespace = Some(truncate_field(ns.to_string()));
452        }
453        pkg.name = Some(truncate_field(purl.name().to_string()));
454        if let Some(ver) = purl.version() {
455            pkg.version = Some(truncate_field(ver.to_string()));
456        }
457        // Qualifiers
458        if !purl.qualifiers().is_empty() {
459            let quals: HashMap<String, String> = purl
460                .qualifiers()
461                .iter()
462                .map(|(k, v)| (k.to_string(), v.to_string()))
463                .collect();
464            pkg.qualifiers = Some(quals);
465        }
466        // Subpath
467        if let Some(sp) = purl.subpath() {
468            pkg.subpath = Some(sp.to_string());
469        }
470    }
471
472    pkg
473}
474
475fn metadata_value_from_expr(expr: &ast::AstExpr) -> Option<MetadataValue> {
476    if let Some(string) = expr_as_string(expr) {
477        return Some(MetadataValue::String(string));
478    }
479
480    let items = match &expr.node {
481        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
482        _ => return None,
483    };
484    let values: Vec<_> = items
485        .iter()
486        .take(MAX_ITERATION_COUNT)
487        .filter_map(expr_as_string)
488        .collect();
489    (!values.is_empty()).then_some(MetadataValue::List(values))
490}
491
492/// Extract package data from a single AST statement (for BUCK files)
493fn extract_build_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
494    let call = extract_call(statement)?;
495    let rule_name = match &call.func.node {
496        ast::ExprP::Identifier(identifier) => identifier.node.ident.as_str(),
497        _ => return None,
498    };
499
500    if !check_rule_name_ending(rule_name) {
501        return None;
502    }
503
504    let name = extract_named_kwarg_string(&call, "name");
505    let licenses = extract_named_kwarg_string_list(&call, "licenses");
506
507    let package_name = name?;
508    let (license_statements, license_references) = licenses
509        .as_deref()
510        .map(split_buck_license_values)
511        .unwrap_or_default();
512    let extracted_license_statement = if !license_statements.is_empty() {
513        Some(truncate_field(license_statements.join(", ")))
514    } else if !license_references.is_empty() {
515        Some(truncate_field(license_references.join(", ")))
516    } else {
517        None
518    };
519    let mut extra_data = HashMap::new();
520    insert_license_reference_extra_data(&mut extra_data, &license_references);
521
522    Some(PackageData {
523        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
524        name: Some(truncate_field(package_name)),
525        extracted_license_statement,
526        extra_data: (!extra_data.is_empty()).then_some(extra_data),
527        datasource_id: Some(DatasourceId::BuckFile),
528        ..Default::default()
529    })
530}
531
532fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
533    match &statement.node {
534        ast::StmtP::Expression(expr) => extract_call_expr(expr),
535        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
536        _ => None,
537    }
538}
539
540fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
541    match &expr.node {
542        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
543        _ => None,
544    }
545}
546
547fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
548    call.args
549        .args
550        .iter()
551        .find_map(|argument| match &argument.node {
552            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
553            _ => None,
554        })
555}
556
557fn extract_named_kwarg_string(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
558    extract_named_kwarg(call, key).and_then(expr_as_string)
559}
560
561fn extract_named_kwarg_string_list(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
562    let expr = extract_named_kwarg(call, key)?;
563    let items = match &expr.node {
564        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
565        _ => return None,
566    };
567    let values: Vec<_> = items
568        .iter()
569        .take(MAX_ITERATION_COUNT)
570        .filter_map(expr_as_string)
571        .collect();
572    (!values.is_empty()).then_some(values)
573}
574
575fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
576    match &expr.node {
577        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
578        _ => None,
579    }
580}
581
/// Check if rule name ends with "binary" or "library".
fn check_rule_name_ending(rule_name: &str) -> bool {
    const PACKAGE_RULE_SUFFIXES: [&str; 2] = ["binary", "library"];

    PACKAGE_RULE_SUFFIXES
        .iter()
        .any(|&suffix| rule_name.ends_with(suffix))
}
586
587/// Create fallback package data using parent directory name
588fn fallback_package_data(path: &Path) -> PackageData {
589    let name = path
590        .parent()
591        .and_then(|p| p.file_name())
592        .and_then(|n| n.to_str())
593        .map(|s| s.to_string());
594
595    PackageData {
596        package_type: Some(BuckBuildParser::PACKAGE_TYPE),
597        name,
598        datasource_id: Some(DatasourceId::BuckFile),
599        ..Default::default()
600    }
601}
602
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Only a final path component spelled exactly `BUCK` is accepted.
    #[test]
    fn test_buck_build_is_match() {
        for accepted in ["BUCK", "path/to/BUCK"] {
            assert!(BuckBuildParser::is_match(Path::new(accepted)));
        }
        for rejected in ["BUILD", "buck"] {
            assert!(!BuckBuildParser::is_match(Path::new(rejected)));
        }
    }

    /// Only a final path component spelled exactly `METADATA.bzl` is accepted.
    #[test]
    fn test_metadata_bzl_is_match() {
        for accepted in ["METADATA.bzl", "path/to/METADATA.bzl"] {
            assert!(BuckMetadataBzlParser::is_match(Path::new(accepted)));
        }
        for rejected in ["metadata.bzl", "METADATA"] {
            assert!(!BuckMetadataBzlParser::is_match(Path::new(rejected)));
        }
    }

    #[test]
    fn test_check_rule_name_ending() {
        for package_rule in ["android_binary", "android_library", "java_binary"] {
            assert!(check_rule_name_ending(package_rule));
        }
        assert!(!check_rule_name_ending("filegroup"));
    }

    /// Guard markers are stripped and the enabled line takes the disabled line's indent.
    #[test]
    fn test_preprocess_starlark_content_handles_oss_guarded_alternatives() {
        let content = r#"# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_unittest.bzl", "rust_unittest")
prelude = native

# @oss-disable: rust_unittest(
    rust_test( # @oss-enable
        name = "test",
    )

platform_utils = None # @oss-enable
"#;

        let normalized = preprocess_starlark_content(content);

        for removed in ["@oss-disable", "@oss-enable", "    rust_test("] {
            assert!(!normalized.contains(removed));
        }
        for kept in ["rust_test(", "platform_utils = None"] {
            assert!(normalized.contains(kept));
        }
    }

    /// A BUCK file using guarded rules still parses; only the library rule is a package.
    #[test]
    fn test_parse_buck_build_with_oss_guarded_rule() {
        let content = r#"# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_library.bzl", "rust_library")
# @oss-disable[end= ]: load("@fbsource//tools/build_defs:rust_unittest.bzl", "rust_unittest")

oncall("build_infra")

rust_library(
    name = "library",
    srcs = ["src/lib.rs"],
)

# @oss-disable: rust_unittest(
    rust_test( # @oss-enable
    name = "test",
    srcs = ["tests/test.rs"],
)
"#;

        let workspace = tempfile::tempdir().unwrap();
        let buck_file = workspace.path().join("BUCK");
        std::fs::write(&buck_file, content).unwrap();

        let packages = parse_buck_build(&buck_file).expect("BUCK file should parse");

        assert_eq!(packages.len(), 1);
        let package = &packages[0];
        assert_eq!(package.package_type, Some(PackageType::Buck));
        assert_eq!(package.name.as_deref(), Some("library"));
    }
}
690
691crate::register_parser!(
692    "Buck build file and METADATA.bzl",
693    &["**/BUCK", "**/METADATA.bzl"],
694    "buck",
695    "",
696    Some("https://buck.build/"),
697);