Skip to main content

provenant/parsers/
bazel.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
//! Bazel BUILD and MODULE.bazel file parsers
//!
//! Extracts package metadata from Bazel BUILD and MODULE.bazel files, which are written in Starlark (Python-like) syntax.
7//!
8//! ## Features
9//! - Parses Starlark syntax using starlark_syntax
10//! - Extracts build rules ending with "binary" or "library" (e.g., cc_binary, cc_library)
11//! - Extracts name and licenses fields from rule arguments
12//! - Falls back to parent directory name if no rules found
13//! - **Supports multiple packages**: `extract_packages()` returns all rules (100% parity)
14//!
15//! ## Usage
16//! - `extract_first_package()` - Returns first package (convenience method)
17//! - `extract_packages()` - Returns ALL packages (recommended for BUILD files)
18//!
19//! ## Reference
20//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py` (BazelBuildHandler)
21
22use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
23use crate::parsers::utils::{MAX_ITERATION_COUNT, RecursionGuard, truncate_field};
24use packageurl::PackageUrl;
25use serde_json::{Map as JsonMap, Value as JsonValue};
26use std::path::Path;
27
28use crate::parser_warn as warn;
29use starlark_syntax::syntax::ast;
30use starlark_syntax::syntax::module::AstModuleFields;
31use starlark_syntax::syntax::{AstModule, Dialect};
32
33use super::PackageParser;
34
/// Argument list of a Starlark call expression in an AST that carries no payload.
type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;
/// `extra_data` key under which `set_scancode_simple_top_level` stores its boolean flag.
const SCANCODE_SIMPLE_TOP_LEVEL_KEY: &str = "scancode_simple_top_level";
37
/// Borrowed view of a Starlark call expression.
struct StarlarkCall<'a> {
    /// Expression in callee position (the thing being called).
    func: &'a ast::AstExpr,
    /// Arguments passed to the call.
    args: &'a StarlarkCallArgs,
}
42
43pub struct BazelBuildParser;
44
45impl PackageParser for BazelBuildParser {
46    const PACKAGE_TYPE: PackageType = PackageType::Bazel;
47
48    fn is_match(path: &Path) -> bool {
49        path.file_name()
50            .and_then(|name| name.to_str())
51            .is_some_and(|name| name == "BUILD")
52    }
53
54    fn extract_packages(path: &Path) -> Vec<PackageData> {
55        match parse_bazel_build(path) {
56            Ok(packages) if !packages.is_empty() => packages,
57            Ok(_) => vec![fallback_package_data(path)],
58            Err(e) => {
59                warn!("Failed to parse Bazel BUILD file {:?}: {}", path, e);
60                vec![fallback_package_data(path)]
61            }
62        }
63    }
64}
65
66/// Parse a Bazel BUILD file and extract all package data
67fn parse_bazel_build(path: &Path) -> Result<Vec<PackageData>, String> {
68    let content =
69        crate::parsers::utils::read_file_to_string(path, None).map_err(|e| e.to_string())?;
70    let module = parse_starlark_module("<BUILD>", content)?;
71    let scancode_simple_top_level = is_scancode_simple_top_level_module(&module);
72
73    let mut packages = Vec::new();
74
75    for statement in top_level_statements(&module)
76        .iter()
77        .take(MAX_ITERATION_COUNT)
78    {
79        if let Some(mut package_data) = extract_package_from_statement(statement) {
80            set_scancode_simple_top_level(&mut package_data, scancode_simple_top_level);
81            packages.push(package_data);
82        }
83    }
84
85    Ok(packages)
86}
87
88/// Extract package data from a single AST statement
89fn extract_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
90    let call = extract_call(statement)?;
91    let rule_name = extract_call_name(&call)?;
92
93    if !check_rule_name_ending(rule_name) {
94        return None;
95    }
96
97    let name = extract_string_kwarg(&call, "name")?;
98    let licenses = extract_string_list_kwarg(&call, "licenses");
99    let purl = build_bazel_purl(&name, None).map(truncate_field);
100
101    Some(PackageData {
102        package_type: Some(BazelBuildParser::PACKAGE_TYPE),
103        name: Some(truncate_field(name)),
104        extracted_license_statement: licenses.map(|licenses| truncate_field(licenses.join(", "))),
105        datasource_id: Some(DatasourceId::BazelBuild),
106        purl,
107        ..Default::default()
108    })
109}
110
/// Returns `true` when `rule_name` ends with `binary` or `library` (case-sensitive).
fn check_rule_name_ending(rule_name: &str) -> bool {
    ["binary", "library"]
        .iter()
        .any(|&suffix| rule_name.ends_with(suffix))
}
115
116/// Create fallback package data using parent directory name
117fn fallback_package_data(path: &Path) -> PackageData {
118    let name = path
119        .parent()
120        .and_then(|p| p.file_name())
121        .and_then(|n| n.to_str())
122        .map(|s| truncate_field(s.to_string()));
123
124    PackageData {
125        package_type: Some(BazelBuildParser::PACKAGE_TYPE),
126        purl: name
127            .as_deref()
128            .and_then(|name| build_bazel_purl(name, None))
129            .map(truncate_field),
130        name,
131        datasource_id: Some(DatasourceId::BazelBuild),
132        ..Default::default()
133    }
134}
135
136fn set_scancode_simple_top_level(package_data: &mut PackageData, enabled: bool) {
137    let extra_data = package_data.extra_data.get_or_insert_with(Default::default);
138    extra_data.insert(
139        SCANCODE_SIMPLE_TOP_LEVEL_KEY.to_string(),
140        JsonValue::Bool(enabled),
141    );
142}
143
144fn is_scancode_simple_top_level_module(module: &AstModule) -> bool {
145    top_level_statements(module)
146        .iter()
147        .all(is_scancode_simple_top_level_statement)
148}
149
150fn is_scancode_simple_top_level_statement(statement: &ast::AstStmt) -> bool {
151    match &statement.node {
152        ast::StmtP::Expression(expr) => {
153            matches!(&expr.node, ast::ExprP::Call(func, _) if matches!(&func.node, ast::ExprP::Identifier(_)))
154        }
155        _ => true,
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::PackageType;
    use std::path::PathBuf;

    /// Parses an inline BUILD snippet, panicking if it is not valid Starlark.
    fn parse_build(source: &str) -> AstModule {
        parse_starlark_module("<BUILD>", source.to_string()).expect("parse BUILD")
    }

    #[test]
    fn test_is_match() {
        for accepted in ["BUILD", "path/to/BUILD"] {
            assert!(BazelBuildParser::is_match(&PathBuf::from(accepted)));
        }
        for rejected in ["BUILD.bazel", "build", "BUCK"] {
            assert!(!BazelBuildParser::is_match(&PathBuf::from(rejected)));
        }
    }

    #[test]
    fn test_check_rule_name_ending() {
        for matching in ["cc_binary", "cc_library", "java_binary", "py_library"] {
            assert!(check_rule_name_ending(matching));
        }
        for other in ["filegroup", "load", "cc_test"] {
            assert!(!check_rule_name_ending(other));
        }
    }

    #[test]
    fn test_fallback_package_data() {
        let build_path = PathBuf::from("/path/to/myproject/BUILD");
        let package = fallback_package_data(&build_path);
        assert_eq!(package.package_type, Some(PackageType::Bazel));
        assert_eq!(package.name, Some("myproject".to_string()));
        assert_eq!(package.purl.as_deref(), Some("pkg:bazel/myproject"));
    }

    #[test]
    fn test_scancode_simple_top_level_allows_direct_calls() {
        let module = parse_build("cc_library(name = \"demo\")\npy_binary(name = \"tool\")\n");

        assert!(is_scancode_simple_top_level_module(&module));
    }

    #[test]
    fn test_scancode_simple_top_level_rejects_attribute_calls() {
        let module = parse_build(
            "selects.config_setting_group(name = \"demo\")\ncc_library(name = \"demo\")\n",
        );

        assert!(!is_scancode_simple_top_level_module(&module));
    }

    #[test]
    fn test_scancode_simple_top_level_rejects_non_call_expressions() {
        let module = parse_build("[(cc_binary(name = \"demo\"),)]\n");

        assert!(!is_scancode_simple_top_level_module(&module));
    }
}
226
// Registers metadata for the Bazel BUILD parser.
// NOTE(review): the argument meanings (description, path globs, package type,
// primary language, documentation URL) are inferred from the values passed —
// confirm against the `register_parser!` definition.
crate::register_parser!(
    "Bazel BUILD file",
    &["**/BUILD"],
    "bazel",
    "",
    Some("https://bazel.build/"),
);
234
235pub struct BazelModuleParser;
236
237impl PackageParser for BazelModuleParser {
238    const PACKAGE_TYPE: PackageType = PackageType::Bazel;
239
240    fn is_match(path: &Path) -> bool {
241        path.file_name()
242            .and_then(|name| name.to_str())
243            .is_some_and(|name| name == "MODULE.bazel")
244    }
245
246    fn extract_packages(path: &Path) -> Vec<PackageData> {
247        match parse_bazel_module(path) {
248            Ok(package) => vec![package],
249            Err(e) => {
250                warn!("Failed to parse Bazel MODULE.bazel {:?}: {}", path, e);
251                vec![default_bazel_module_package_data()]
252            }
253        }
254    }
255}
256
257fn parse_bazel_module(path: &Path) -> Result<PackageData, String> {
258    let content =
259        crate::parsers::utils::read_file_to_string(path, None).map_err(|e| e.to_string())?;
260    let module = parse_starlark_module("<MODULE.bazel>", content)?;
261
262    let mut package = default_bazel_module_package_data();
263    let mut extra_data = JsonMap::new();
264    let mut dependencies = Vec::new();
265    let mut overrides = Vec::new();
266
267    for statement in top_level_statements(&module)
268        .iter()
269        .take(MAX_ITERATION_COUNT)
270    {
271        let Some(call) = extract_call(statement) else {
272            continue;
273        };
274
275        let Some(function_name) = extract_call_name(&call) else {
276            continue;
277        };
278
279        match function_name {
280            "module" => {
281                package.name = extract_string_kwarg(&call, "name").map(truncate_field);
282                package.version = extract_string_kwarg(&call, "version").map(truncate_field);
283                package.purl = package
284                    .name
285                    .as_deref()
286                    .and_then(|name| build_bazel_purl(name, package.version.as_deref()))
287                    .map(truncate_field);
288
289                if let Some(repo_name) =
290                    extract_string_kwarg(&call, "repo_name").map(truncate_field)
291                {
292                    extra_data.insert("repo_name".to_string(), JsonValue::String(repo_name));
293                }
294                if let Some(compatibility_level) = extract_int_kwarg(&call, "compatibility_level") {
295                    extra_data.insert(
296                        "compatibility_level".to_string(),
297                        JsonValue::Number(compatibility_level.into()),
298                    );
299                }
300                if let Some(bazel_compatibility) = extract_kwarg_json(&call, "bazel_compatibility")
301                {
302                    extra_data.insert("bazel_compatibility".to_string(), bazel_compatibility);
303                }
304            }
305            "bazel_dep" => {
306                if let Some(dep) = extract_bazel_dependency(&call) {
307                    dependencies.push(dep);
308                }
309            }
310            "archive_override"
311            | "git_override"
312            | "local_path_override"
313            | "single_version_override"
314            | "multiple_version_override" => {
315                overrides.push(extract_override(function_name, &call));
316            }
317            _ => {}
318        }
319    }
320
321    if package.name.is_none() {
322        return Ok(default_bazel_module_package_data());
323    }
324
325    if !overrides.is_empty() {
326        extra_data.insert("overrides".to_string(), JsonValue::Array(overrides));
327    }
328
329    package.dependencies = dependencies;
330    package.extra_data = (!extra_data.is_empty()).then(|| extra_data.into_iter().collect());
331    Ok(package)
332}
333
334fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
335    let dialect = Dialect {
336        enable_top_level_stmt: true,
337        ..Dialect::Standard
338    };
339    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
340}
341
342fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
343    match &module.statement().node {
344        ast::StmtP::Statements(statements) => statements,
345        _ => std::slice::from_ref(module.statement()),
346    }
347}
348
349fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
350    match &statement.node {
351        ast::StmtP::Expression(expr) => extract_call_expr(expr),
352        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
353        _ => None,
354    }
355}
356
357fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
358    match &expr.node {
359        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
360        _ => None,
361    }
362}
363
364fn extract_call_name<'a>(call: &'a StarlarkCall<'_>) -> Option<&'a str> {
365    match &call.func.node {
366        ast::ExprP::Identifier(identifier) => Some(identifier.node.ident.as_str()),
367        _ => None,
368    }
369}
370
371fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
372    call.args
373        .args
374        .iter()
375        .find_map(|argument| match &argument.node {
376            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
377            _ => None,
378        })
379}
380
381fn extract_string_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
382    extract_named_kwarg(call, key).and_then(expr_as_string)
383}
384
385fn extract_string_list_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
386    let expr = extract_named_kwarg(call, key)?;
387    let items = match &expr.node {
388        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
389        _ => return None,
390    };
391    let values: Vec<_> = items
392        .iter()
393        .take(MAX_ITERATION_COUNT)
394        .filter_map(expr_as_string)
395        .collect();
396    (!values.is_empty()).then_some(values)
397}
398
399fn extract_bool_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<bool> {
400    extract_named_kwarg(call, key).and_then(expr_as_bool)
401}
402
403fn extract_int_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<i64> {
404    extract_named_kwarg(call, key).and_then(expr_as_i64)
405}
406
407fn extract_kwarg_json(call: &StarlarkCall<'_>, key: &str) -> Option<JsonValue> {
408    extract_named_kwarg(call, key)
409        .and_then(|expr| expr_to_json(expr, &mut RecursionGuard::depth_only()))
410}
411
412fn extract_bazel_dependency(call: &StarlarkCall<'_>) -> Option<Dependency> {
413    let name = extract_string_kwarg(call, "name").map(truncate_field)?;
414    let version = extract_string_kwarg(call, "version").map(truncate_field);
415    let is_dev = extract_bool_kwarg(call, "dev_dependency").unwrap_or(false);
416    let mut extra_data = JsonMap::new();
417
418    for field in ["repo_name", "max_compatibility_level", "registry"]
419        .iter()
420        .take(MAX_ITERATION_COUNT)
421    {
422        if let Some(value) = extract_kwarg_json(call, field) {
423            extra_data.insert(field.to_string(), value);
424        }
425    }
426
427    Some(Dependency {
428        purl: build_bazel_purl(&name, version.as_deref()).map(truncate_field),
429        extracted_requirement: version.clone(),
430        scope: Some(if is_dev { "dev" } else { "dependencies" }.to_string()),
431        is_runtime: Some(!is_dev),
432        is_optional: Some(is_dev),
433        is_pinned: Some(version.is_some()),
434        is_direct: Some(true),
435        resolved_package: None,
436        extra_data: (!extra_data.is_empty()).then(|| extra_data.into_iter().collect()),
437    })
438}
439
440fn extract_override(kind: &str, call: &StarlarkCall<'_>) -> JsonValue {
441    let mut override_map = JsonMap::new();
442    override_map.insert("kind".to_string(), JsonValue::String(kind.to_string()));
443    for argument in call.args.args.iter().take(MAX_ITERATION_COUNT) {
444        if let ast::ArgumentP::Named(name, value) = &argument.node
445            && let Some(value) = expr_to_json(value, &mut RecursionGuard::depth_only())
446        {
447            override_map.insert(name.node.clone(), value);
448        }
449    }
450    JsonValue::Object(override_map)
451}
452
453fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
454    match &expr.node {
455        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
456        _ => None,
457    }
458}
459
460fn expr_as_bool(expr: &ast::AstExpr) -> Option<bool> {
461    match &expr.node {
462        ast::ExprP::Identifier(identifier) => match identifier.node.ident.as_str() {
463            "True" => Some(true),
464            "False" => Some(false),
465            _ => None,
466        },
467        _ => None,
468    }
469}
470
471fn expr_as_i64(expr: &ast::AstExpr) -> Option<i64> {
472    match &expr.node {
473        ast::ExprP::Literal(ast::AstLiteral::Int(value)) => value.node.to_string().parse().ok(),
474        _ => None,
475    }
476}
477
478fn expr_to_json(expr: &ast::AstExpr, guard: &mut RecursionGuard<()>) -> Option<JsonValue> {
479    if guard.descend() {
480        return None;
481    }
482    let result = match &expr.node {
483        ast::ExprP::Literal(ast::AstLiteral::String(value)) => {
484            Some(JsonValue::String(value.node.clone()))
485        }
486        ast::ExprP::Literal(ast::AstLiteral::Int(value)) => value
487            .node
488            .to_string()
489            .parse::<i64>()
490            .ok()
491            .map(|value| JsonValue::Number(value.into()))
492            .or_else(|| Some(JsonValue::String(value.node.to_string()))),
493        ast::ExprP::Literal(ast::AstLiteral::Float(value)) => {
494            serde_json::Number::from_f64(value.node).map(JsonValue::Number)
495        }
496        ast::ExprP::Identifier(identifier) => match identifier.node.ident.as_str() {
497            "True" => Some(JsonValue::Bool(true)),
498            "False" => Some(JsonValue::Bool(false)),
499            "None" => Some(JsonValue::Null),
500            _ => None,
501        },
502        ast::ExprP::List(elts) | ast::ExprP::Tuple(elts) => Some(JsonValue::Array(
503            elts.iter()
504                .take(MAX_ITERATION_COUNT)
505                .filter_map(|e| expr_to_json(e, guard))
506                .collect(),
507        )),
508        ast::ExprP::Dict(items) => {
509            let mut map = JsonMap::new();
510            for (key, value) in items.iter().take(MAX_ITERATION_COUNT) {
511                let Some(key) = expr_as_string(key) else {
512                    continue;
513                };
514                if let Some(value) = expr_to_json(value, guard) {
515                    map.insert(key, value);
516                }
517            }
518            Some(JsonValue::Object(map))
519        }
520        _ => None,
521    };
522    guard.ascend();
523    result
524}
525
526fn build_bazel_purl(name: &str, version: Option<&str>) -> Option<String> {
527    let mut purl = PackageUrl::new("bazel", name).ok()?;
528    if let Some(version) = version.filter(|value| !value.trim().is_empty()) {
529        purl.with_version(version).ok()?;
530    }
531    Some(purl.to_string())
532}
533
534fn default_bazel_module_package_data() -> PackageData {
535    PackageData {
536        package_type: Some(BazelModuleParser::PACKAGE_TYPE),
537        datasource_id: Some(DatasourceId::BazelModule),
538        ..Default::default()
539    }
540}
541
// Registers metadata for the Bazel MODULE.bazel parser.
// NOTE(review): the argument meanings (description, path globs, package type,
// primary language, documentation URL) are inferred from the values passed —
// confirm against the `register_parser!` definition.
crate::register_parser!(
    "Bazel MODULE.bazel file",
    &["**/MODULE.bazel"],
    "bazel",
    "",
    Some("https://bazel.build/external/module"),
);