Skip to main content

provenant/parsers/
bazel.rs

1//! Bazel BUILD file parser
2//!
3//! Extracts package metadata from Bazel BUILD files using Starlark (Python-like) syntax.
4//!
5//! ## Features
6//! - Parses Starlark syntax using starlark_syntax
7//! - Extracts build rules ending with "binary" or "library" (e.g., cc_binary, cc_library)
8//! - Extracts name and licenses fields from rule arguments
9//! - Falls back to parent directory name if no rules found
10//! - **Supports multiple packages**: `extract_packages()` returns all rules (100% parity)
11//!
12//! ## Usage
13//! - `extract_first_package()` - Returns first package (convenience method)
14//! - `extract_packages()` - Returns ALL packages (recommended for BUILD files)
15//!
16//! ## Reference
17//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py` (BazelBuildHandler)
18
19use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
20use crate::parsers::utils::{MAX_ITERATION_COUNT, RecursionGuard, truncate_field};
21use packageurl::PackageUrl;
22use serde_json::{Map as JsonMap, Value as JsonValue};
23use std::path::Path;
24
25use crate::parser_warn as warn;
26use starlark_syntax::syntax::ast;
27use starlark_syntax::syntax::module::AstModuleFields;
28use starlark_syntax::syntax::{AstModule, Dialect};
29
30use super::PackageParser;
31
32type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;
33const SCANCODE_SIMPLE_TOP_LEVEL_KEY: &str = "scancode_simple_top_level";
34
35struct StarlarkCall<'a> {
36    func: &'a ast::AstExpr,
37    args: &'a StarlarkCallArgs,
38}
39
40pub struct BazelBuildParser;
41
42impl PackageParser for BazelBuildParser {
43    const PACKAGE_TYPE: PackageType = PackageType::Bazel;
44
45    fn is_match(path: &Path) -> bool {
46        path.file_name()
47            .and_then(|name| name.to_str())
48            .is_some_and(|name| name == "BUILD")
49    }
50
51    fn extract_packages(path: &Path) -> Vec<PackageData> {
52        match parse_bazel_build(path) {
53            Ok(packages) if !packages.is_empty() => packages,
54            Ok(_) => vec![fallback_package_data(path)],
55            Err(e) => {
56                warn!("Failed to parse Bazel BUILD file {:?}: {}", path, e);
57                vec![fallback_package_data(path)]
58            }
59        }
60    }
61}
62
63/// Parse a Bazel BUILD file and extract all package data
64fn parse_bazel_build(path: &Path) -> Result<Vec<PackageData>, String> {
65    let content =
66        crate::parsers::utils::read_file_to_string(path, None).map_err(|e| e.to_string())?;
67    let module = parse_starlark_module("<BUILD>", content)?;
68    let scancode_simple_top_level = is_scancode_simple_top_level_module(&module);
69
70    let mut packages = Vec::new();
71
72    for statement in top_level_statements(&module)
73        .iter()
74        .take(MAX_ITERATION_COUNT)
75    {
76        if let Some(mut package_data) = extract_package_from_statement(statement) {
77            set_scancode_simple_top_level(&mut package_data, scancode_simple_top_level);
78            packages.push(package_data);
79        }
80    }
81
82    Ok(packages)
83}
84
85/// Extract package data from a single AST statement
86fn extract_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
87    let call = extract_call(statement)?;
88    let rule_name = extract_call_name(&call)?;
89
90    if !check_rule_name_ending(rule_name) {
91        return None;
92    }
93
94    let name = extract_string_kwarg(&call, "name")?;
95    let licenses = extract_string_list_kwarg(&call, "licenses");
96    let purl = build_bazel_purl(&name, None).map(truncate_field);
97
98    Some(PackageData {
99        package_type: Some(BazelBuildParser::PACKAGE_TYPE),
100        name: Some(truncate_field(name)),
101        extracted_license_statement: licenses.map(|licenses| truncate_field(licenses.join(", "))),
102        datasource_id: Some(DatasourceId::BazelBuild),
103        purl,
104        ..Default::default()
105    })
106}
107
/// Reports whether `rule_name` ends with one of the suffixes treated as a
/// package-producing rule: "binary" or "library".
fn check_rule_name_ending(rule_name: &str) -> bool {
    const PACKAGE_RULE_SUFFIXES: [&str; 2] = ["binary", "library"];

    PACKAGE_RULE_SUFFIXES
        .iter()
        .any(|&suffix| rule_name.ends_with(suffix))
}
112
113/// Create fallback package data using parent directory name
114fn fallback_package_data(path: &Path) -> PackageData {
115    let name = path
116        .parent()
117        .and_then(|p| p.file_name())
118        .and_then(|n| n.to_str())
119        .map(|s| truncate_field(s.to_string()));
120
121    PackageData {
122        package_type: Some(BazelBuildParser::PACKAGE_TYPE),
123        purl: name
124            .as_deref()
125            .and_then(|name| build_bazel_purl(name, None))
126            .map(truncate_field),
127        name,
128        datasource_id: Some(DatasourceId::BazelBuild),
129        ..Default::default()
130    }
131}
132
133fn set_scancode_simple_top_level(package_data: &mut PackageData, enabled: bool) {
134    let extra_data = package_data.extra_data.get_or_insert_with(Default::default);
135    extra_data.insert(
136        SCANCODE_SIMPLE_TOP_LEVEL_KEY.to_string(),
137        JsonValue::Bool(enabled),
138    );
139}
140
141fn is_scancode_simple_top_level_module(module: &AstModule) -> bool {
142    top_level_statements(module)
143        .iter()
144        .all(is_scancode_simple_top_level_statement)
145}
146
147fn is_scancode_simple_top_level_statement(statement: &ast::AstStmt) -> bool {
148    match &statement.node {
149        ast::StmtP::Expression(expr) => {
150            matches!(&expr.node, ast::ExprP::Call(func, _) if matches!(&func.node, ast::ExprP::Identifier(_)))
151        }
152        _ => true,
153    }
154}
155
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::PackageType;

    /// Parses inline BUILD content, panicking when it is not valid Starlark.
    fn parse_build(source: &str) -> AstModule {
        parse_starlark_module("<BUILD>", source.to_string()).expect("parse BUILD")
    }

    #[test]
    fn test_is_match() {
        for matching in ["BUILD", "path/to/BUILD"] {
            assert!(BazelBuildParser::is_match(Path::new(matching)));
        }
        for other in ["BUILD.bazel", "build", "BUCK"] {
            assert!(!BazelBuildParser::is_match(Path::new(other)));
        }
    }

    #[test]
    fn test_check_rule_name_ending() {
        for rule in ["cc_binary", "cc_library", "java_binary", "py_library"] {
            assert!(check_rule_name_ending(rule));
        }
        for rule in ["filegroup", "load", "cc_test"] {
            assert!(!check_rule_name_ending(rule));
        }
    }

    #[test]
    fn test_fallback_package_data() {
        let pkg = fallback_package_data(Path::new("/path/to/myproject/BUILD"));

        assert_eq!(pkg.package_type, Some(PackageType::Bazel));
        assert_eq!(pkg.name.as_deref(), Some("myproject"));
        assert_eq!(pkg.purl.as_deref(), Some("pkg:bazel/myproject"));
    }

    #[test]
    fn test_scancode_simple_top_level_allows_direct_calls() {
        let module = parse_build("cc_library(name = \"demo\")\npy_binary(name = \"tool\")\n");

        assert!(is_scancode_simple_top_level_module(&module));
    }

    #[test]
    fn test_scancode_simple_top_level_rejects_attribute_calls() {
        let module = parse_build(
            "selects.config_setting_group(name = \"demo\")\ncc_library(name = \"demo\")\n",
        );

        assert!(!is_scancode_simple_top_level_module(&module));
    }

    #[test]
    fn test_scancode_simple_top_level_rejects_non_call_expressions() {
        let module = parse_build("[(cc_binary(name = \"demo\"),)]\n");

        assert!(!is_scancode_simple_top_level_module(&module));
    }
}
223
// Registers metadata for the Bazel BUILD parser.
// NOTE(review): the arguments look like (description, path globs, package
// type, an empty string, documentation URL) — confirm their meaning against
// the `register_parser!` macro definition.
crate::register_parser!(
    "Bazel BUILD file",
    &["**/BUILD"],
    "bazel",
    "",
    Some("https://bazel.build/"),
);
231
232pub struct BazelModuleParser;
233
234impl PackageParser for BazelModuleParser {
235    const PACKAGE_TYPE: PackageType = PackageType::Bazel;
236
237    fn is_match(path: &Path) -> bool {
238        path.file_name()
239            .and_then(|name| name.to_str())
240            .is_some_and(|name| name == "MODULE.bazel")
241    }
242
243    fn extract_packages(path: &Path) -> Vec<PackageData> {
244        match parse_bazel_module(path) {
245            Ok(package) => vec![package],
246            Err(e) => {
247                warn!("Failed to parse Bazel MODULE.bazel {:?}: {}", path, e);
248                vec![default_bazel_module_package_data()]
249            }
250        }
251    }
252}
253
254fn parse_bazel_module(path: &Path) -> Result<PackageData, String> {
255    let content =
256        crate::parsers::utils::read_file_to_string(path, None).map_err(|e| e.to_string())?;
257    let module = parse_starlark_module("<MODULE.bazel>", content)?;
258
259    let mut package = default_bazel_module_package_data();
260    let mut extra_data = JsonMap::new();
261    let mut dependencies = Vec::new();
262    let mut overrides = Vec::new();
263
264    for statement in top_level_statements(&module)
265        .iter()
266        .take(MAX_ITERATION_COUNT)
267    {
268        let Some(call) = extract_call(statement) else {
269            continue;
270        };
271
272        let Some(function_name) = extract_call_name(&call) else {
273            continue;
274        };
275
276        match function_name {
277            "module" => {
278                package.name = extract_string_kwarg(&call, "name").map(truncate_field);
279                package.version = extract_string_kwarg(&call, "version").map(truncate_field);
280                package.purl = package
281                    .name
282                    .as_deref()
283                    .and_then(|name| build_bazel_purl(name, package.version.as_deref()))
284                    .map(truncate_field);
285
286                if let Some(repo_name) =
287                    extract_string_kwarg(&call, "repo_name").map(truncate_field)
288                {
289                    extra_data.insert("repo_name".to_string(), JsonValue::String(repo_name));
290                }
291                if let Some(compatibility_level) = extract_int_kwarg(&call, "compatibility_level") {
292                    extra_data.insert(
293                        "compatibility_level".to_string(),
294                        JsonValue::Number(compatibility_level.into()),
295                    );
296                }
297                if let Some(bazel_compatibility) = extract_kwarg_json(&call, "bazel_compatibility")
298                {
299                    extra_data.insert("bazel_compatibility".to_string(), bazel_compatibility);
300                }
301            }
302            "bazel_dep" => {
303                if let Some(dep) = extract_bazel_dependency(&call) {
304                    dependencies.push(dep);
305                }
306            }
307            "archive_override"
308            | "git_override"
309            | "local_path_override"
310            | "single_version_override"
311            | "multiple_version_override" => {
312                overrides.push(extract_override(function_name, &call));
313            }
314            _ => {}
315        }
316    }
317
318    if package.name.is_none() {
319        return Ok(default_bazel_module_package_data());
320    }
321
322    if !overrides.is_empty() {
323        extra_data.insert("overrides".to_string(), JsonValue::Array(overrides));
324    }
325
326    package.dependencies = dependencies;
327    package.extra_data = (!extra_data.is_empty()).then(|| extra_data.into_iter().collect());
328    Ok(package)
329}
330
331fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
332    let dialect = Dialect {
333        enable_top_level_stmt: true,
334        ..Dialect::Standard
335    };
336    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
337}
338
339fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
340    match &module.statement().node {
341        ast::StmtP::Statements(statements) => statements,
342        _ => std::slice::from_ref(module.statement()),
343    }
344}
345
346fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
347    match &statement.node {
348        ast::StmtP::Expression(expr) => extract_call_expr(expr),
349        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
350        _ => None,
351    }
352}
353
354fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
355    match &expr.node {
356        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
357        _ => None,
358    }
359}
360
361fn extract_call_name<'a>(call: &'a StarlarkCall<'_>) -> Option<&'a str> {
362    match &call.func.node {
363        ast::ExprP::Identifier(identifier) => Some(identifier.node.ident.as_str()),
364        _ => None,
365    }
366}
367
368fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
369    call.args
370        .args
371        .iter()
372        .find_map(|argument| match &argument.node {
373            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
374            _ => None,
375        })
376}
377
378fn extract_string_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
379    extract_named_kwarg(call, key).and_then(expr_as_string)
380}
381
382fn extract_string_list_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
383    let expr = extract_named_kwarg(call, key)?;
384    let items = match &expr.node {
385        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
386        _ => return None,
387    };
388    let values: Vec<_> = items
389        .iter()
390        .take(MAX_ITERATION_COUNT)
391        .filter_map(expr_as_string)
392        .collect();
393    (!values.is_empty()).then_some(values)
394}
395
396fn extract_bool_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<bool> {
397    extract_named_kwarg(call, key).and_then(expr_as_bool)
398}
399
400fn extract_int_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<i64> {
401    extract_named_kwarg(call, key).and_then(expr_as_i64)
402}
403
404fn extract_kwarg_json(call: &StarlarkCall<'_>, key: &str) -> Option<JsonValue> {
405    extract_named_kwarg(call, key)
406        .and_then(|expr| expr_to_json(expr, &mut RecursionGuard::depth_only()))
407}
408
409fn extract_bazel_dependency(call: &StarlarkCall<'_>) -> Option<Dependency> {
410    let name = extract_string_kwarg(call, "name").map(truncate_field)?;
411    let version = extract_string_kwarg(call, "version").map(truncate_field);
412    let is_dev = extract_bool_kwarg(call, "dev_dependency").unwrap_or(false);
413    let mut extra_data = JsonMap::new();
414
415    for field in ["repo_name", "max_compatibility_level", "registry"]
416        .iter()
417        .take(MAX_ITERATION_COUNT)
418    {
419        if let Some(value) = extract_kwarg_json(call, field) {
420            extra_data.insert(field.to_string(), value);
421        }
422    }
423
424    Some(Dependency {
425        purl: build_bazel_purl(&name, version.as_deref()).map(truncate_field),
426        extracted_requirement: version.clone(),
427        scope: Some(if is_dev { "dev" } else { "dependencies" }.to_string()),
428        is_runtime: Some(!is_dev),
429        is_optional: Some(is_dev),
430        is_pinned: Some(version.is_some()),
431        is_direct: Some(true),
432        resolved_package: None,
433        extra_data: (!extra_data.is_empty()).then(|| extra_data.into_iter().collect()),
434    })
435}
436
437fn extract_override(kind: &str, call: &StarlarkCall<'_>) -> JsonValue {
438    let mut override_map = JsonMap::new();
439    override_map.insert("kind".to_string(), JsonValue::String(kind.to_string()));
440    for argument in call.args.args.iter().take(MAX_ITERATION_COUNT) {
441        if let ast::ArgumentP::Named(name, value) = &argument.node
442            && let Some(value) = expr_to_json(value, &mut RecursionGuard::depth_only())
443        {
444            override_map.insert(name.node.clone(), value);
445        }
446    }
447    JsonValue::Object(override_map)
448}
449
450fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
451    match &expr.node {
452        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
453        _ => None,
454    }
455}
456
457fn expr_as_bool(expr: &ast::AstExpr) -> Option<bool> {
458    match &expr.node {
459        ast::ExprP::Identifier(identifier) => match identifier.node.ident.as_str() {
460            "True" => Some(true),
461            "False" => Some(false),
462            _ => None,
463        },
464        _ => None,
465    }
466}
467
468fn expr_as_i64(expr: &ast::AstExpr) -> Option<i64> {
469    match &expr.node {
470        ast::ExprP::Literal(ast::AstLiteral::Int(value)) => value.node.to_string().parse().ok(),
471        _ => None,
472    }
473}
474
475fn expr_to_json(expr: &ast::AstExpr, guard: &mut RecursionGuard<()>) -> Option<JsonValue> {
476    if guard.descend() {
477        return None;
478    }
479    let result = match &expr.node {
480        ast::ExprP::Literal(ast::AstLiteral::String(value)) => {
481            Some(JsonValue::String(value.node.clone()))
482        }
483        ast::ExprP::Literal(ast::AstLiteral::Int(value)) => value
484            .node
485            .to_string()
486            .parse::<i64>()
487            .ok()
488            .map(|value| JsonValue::Number(value.into()))
489            .or_else(|| Some(JsonValue::String(value.node.to_string()))),
490        ast::ExprP::Literal(ast::AstLiteral::Float(value)) => {
491            serde_json::Number::from_f64(value.node).map(JsonValue::Number)
492        }
493        ast::ExprP::Identifier(identifier) => match identifier.node.ident.as_str() {
494            "True" => Some(JsonValue::Bool(true)),
495            "False" => Some(JsonValue::Bool(false)),
496            "None" => Some(JsonValue::Null),
497            _ => None,
498        },
499        ast::ExprP::List(elts) | ast::ExprP::Tuple(elts) => Some(JsonValue::Array(
500            elts.iter()
501                .take(MAX_ITERATION_COUNT)
502                .filter_map(|e| expr_to_json(e, guard))
503                .collect(),
504        )),
505        ast::ExprP::Dict(items) => {
506            let mut map = JsonMap::new();
507            for (key, value) in items.iter().take(MAX_ITERATION_COUNT) {
508                let Some(key) = expr_as_string(key) else {
509                    continue;
510                };
511                if let Some(value) = expr_to_json(value, guard) {
512                    map.insert(key, value);
513                }
514            }
515            Some(JsonValue::Object(map))
516        }
517        _ => None,
518    };
519    guard.ascend();
520    result
521}
522
523fn build_bazel_purl(name: &str, version: Option<&str>) -> Option<String> {
524    let mut purl = PackageUrl::new("bazel", name).ok()?;
525    if let Some(version) = version.filter(|value| !value.trim().is_empty()) {
526        purl.with_version(version).ok()?;
527    }
528    Some(purl.to_string())
529}
530
531fn default_bazel_module_package_data() -> PackageData {
532    PackageData {
533        package_type: Some(BazelModuleParser::PACKAGE_TYPE),
534        datasource_id: Some(DatasourceId::BazelModule),
535        ..Default::default()
536    }
537}
538
// Registers metadata for the Bazel MODULE.bazel parser.
// NOTE(review): the arguments look like (description, path globs, package
// type, an empty string, documentation URL) — confirm their meaning against
// the `register_parser!` macro definition.
crate::register_parser!(
    "Bazel MODULE.bazel file",
    &["**/MODULE.bazel"],
    "bazel",
    "",
    Some("https://bazel.build/external/module"),
);