Skip to main content

provenant/parsers/
bazel.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
//! Bazel BUILD and MODULE.bazel file parsers
//!
//! Extracts package metadata from Bazel BUILD and MODULE.bazel files, both of which
//! use Starlark (Python-like) syntax.
//!
//! ## Features
//! - Parses Starlark syntax using `starlark_syntax`
//! - BUILD: extracts build rules ending with "binary" or "library" (e.g., cc_binary, cc_library)
//! - BUILD: extracts `name` and `licenses` fields from rule arguments
//! - BUILD: falls back to parent directory name if no rules found
//! - MODULE.bazel: extracts the `module()` name/version, `bazel_dep()` dependencies,
//!   and `*_override()` calls
//! - **Supports multiple packages**: `extract_packages()` returns all rules (100% parity)
//!
//! ## Usage
//! - `extract_first_package()` - Returns first package (convenience method)
//! - `extract_packages()` - Returns ALL packages (recommended for BUILD files)
//!
//! ## Reference
//! Python implementation: `reference/scancode-toolkit/src/packagedcode/build.py` (BazelBuildHandler)
21
22use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
23use crate::parsers::utils::{MAX_ITERATION_COUNT, RecursionGuard, truncate_field};
24use packageurl::PackageUrl;
25use serde_json::{Map as JsonMap, Value as JsonValue};
26use std::path::Path;
27
28use crate::parser_warn as warn;
29use starlark_syntax::syntax::ast;
30use starlark_syntax::syntax::module::AstModuleFields;
31use starlark_syntax::syntax::{AstModule, Dialect};
32
33use super::PackageParser;
34use super::metadata::ParserMetadata;
35
/// Argument list of a Starlark call expression, using the payload-free AST flavour.
type StarlarkCallArgs = ast::CallArgsP<ast::AstNoPayload>;
/// `extra_data` key under which BUILD packages record whether every top-level
/// expression statement of their file was a plain `identifier(...)` call.
const SCANCODE_SIMPLE_TOP_LEVEL_KEY: &str = "scancode_simple_top_level";
38
/// Borrowed view of a single Starlark call expression.
struct StarlarkCall<'a> {
    /// Expression being called (the callee).
    func: &'a ast::AstExpr,
    /// Arguments passed to the callee.
    args: &'a StarlarkCallArgs,
}
43
44pub struct BazelBuildParser;
45
46impl PackageParser for BazelBuildParser {
47    const PACKAGE_TYPE: PackageType = PackageType::Bazel;
48
49    fn metadata() -> Vec<ParserMetadata> {
50        vec![ParserMetadata {
51            description: "Bazel BUILD file",
52            file_patterns: &["**/BUILD"],
53            package_type: "bazel",
54            primary_language: "",
55            documentation_url: Some("https://bazel.build/"),
56        }]
57    }
58
59    fn is_match(path: &Path) -> bool {
60        path.file_name()
61            .and_then(|name| name.to_str())
62            .is_some_and(|name| name == "BUILD")
63    }
64
65    fn extract_packages(path: &Path) -> Vec<PackageData> {
66        match parse_bazel_build(path) {
67            Ok(packages) if !packages.is_empty() => packages,
68            Ok(_) => vec![fallback_package_data(path)],
69            Err(e) => {
70                warn!("Failed to parse Bazel BUILD file {:?}: {}", path, e);
71                vec![fallback_package_data(path)]
72            }
73        }
74    }
75}
76
77/// Parse a Bazel BUILD file and extract all package data
78fn parse_bazel_build(path: &Path) -> Result<Vec<PackageData>, String> {
79    let content =
80        crate::parsers::utils::read_file_to_string(path, None).map_err(|e| e.to_string())?;
81    let module = parse_starlark_module("<BUILD>", content)?;
82    let scancode_simple_top_level = is_scancode_simple_top_level_module(&module);
83
84    let mut packages = Vec::new();
85
86    for statement in top_level_statements(&module)
87        .iter()
88        .take(MAX_ITERATION_COUNT)
89    {
90        if let Some(mut package_data) = extract_package_from_statement(statement) {
91            set_scancode_simple_top_level(&mut package_data, scancode_simple_top_level);
92            packages.push(package_data);
93        }
94    }
95
96    Ok(packages)
97}
98
99/// Extract package data from a single AST statement
100fn extract_package_from_statement(statement: &ast::AstStmt) -> Option<PackageData> {
101    let call = extract_call(statement)?;
102    let rule_name = extract_call_name(&call)?;
103
104    if !check_rule_name_ending(rule_name) {
105        return None;
106    }
107
108    let name = extract_string_kwarg(&call, "name")?;
109    let licenses = extract_string_list_kwarg(&call, "licenses");
110    let purl = build_bazel_purl(&name, None).map(truncate_field);
111
112    Some(PackageData {
113        package_type: Some(BazelBuildParser::PACKAGE_TYPE),
114        name: Some(truncate_field(name)),
115        extracted_license_statement: licenses.map(|licenses| truncate_field(licenses.join(", "))),
116        datasource_id: Some(DatasourceId::BazelBuild),
117        purl,
118        ..Default::default()
119    })
120}
121
/// Returns `true` when `rule_name` has the suffix "binary" or "library".
fn check_rule_name_ending(rule_name: &str) -> bool {
    ["binary", "library"]
        .iter()
        .any(|suffix| rule_name.ends_with(*suffix))
}
126
127/// Create fallback package data using parent directory name
128fn fallback_package_data(path: &Path) -> PackageData {
129    let name = path
130        .parent()
131        .and_then(|p| p.file_name())
132        .and_then(|n| n.to_str())
133        .map(|s| truncate_field(s.to_string()));
134
135    PackageData {
136        package_type: Some(BazelBuildParser::PACKAGE_TYPE),
137        purl: name
138            .as_deref()
139            .and_then(|name| build_bazel_purl(name, None))
140            .map(truncate_field),
141        name,
142        datasource_id: Some(DatasourceId::BazelBuild),
143        ..Default::default()
144    }
145}
146
147fn set_scancode_simple_top_level(package_data: &mut PackageData, enabled: bool) {
148    let extra_data = package_data.extra_data.get_or_insert_with(Default::default);
149    extra_data.insert(
150        SCANCODE_SIMPLE_TOP_LEVEL_KEY.to_string(),
151        JsonValue::Bool(enabled),
152    );
153}
154
155fn is_scancode_simple_top_level_module(module: &AstModule) -> bool {
156    top_level_statements(module)
157        .iter()
158        .all(is_scancode_simple_top_level_statement)
159}
160
161fn is_scancode_simple_top_level_statement(statement: &ast::AstStmt) -> bool {
162    match &statement.node {
163        ast::StmtP::Expression(expr) => {
164            matches!(&expr.node, ast::ExprP::Call(func, _) if matches!(&func.node, ast::ExprP::Identifier(_)))
165        }
166        _ => true,
167    }
168}
169
170pub struct BazelModuleParser;
171
172impl PackageParser for BazelModuleParser {
173    const PACKAGE_TYPE: PackageType = PackageType::Bazel;
174
175    fn metadata() -> Vec<ParserMetadata> {
176        vec![ParserMetadata {
177            description: "Bazel MODULE.bazel file",
178            file_patterns: &["**/MODULE.bazel"],
179            package_type: "bazel",
180            primary_language: "",
181            documentation_url: Some("https://bazel.build/external/module"),
182        }]
183    }
184
185    fn is_match(path: &Path) -> bool {
186        path.file_name()
187            .and_then(|name| name.to_str())
188            .is_some_and(|name| name == "MODULE.bazel")
189    }
190
191    fn extract_packages(path: &Path) -> Vec<PackageData> {
192        match parse_bazel_module(path) {
193            Ok(package) => vec![package],
194            Err(e) => {
195                warn!("Failed to parse Bazel MODULE.bazel {:?}: {}", path, e);
196                vec![default_bazel_module_package_data()]
197            }
198        }
199    }
200}
201
202fn parse_bazel_module(path: &Path) -> Result<PackageData, String> {
203    let content =
204        crate::parsers::utils::read_file_to_string(path, None).map_err(|e| e.to_string())?;
205    let module = parse_starlark_module("<MODULE.bazel>", content)?;
206
207    let mut package = default_bazel_module_package_data();
208    let mut extra_data = JsonMap::new();
209    let mut dependencies = Vec::new();
210    let mut overrides = Vec::new();
211
212    for statement in top_level_statements(&module)
213        .iter()
214        .take(MAX_ITERATION_COUNT)
215    {
216        let Some(call) = extract_call(statement) else {
217            continue;
218        };
219
220        let Some(function_name) = extract_call_name(&call) else {
221            continue;
222        };
223
224        match function_name {
225            "module" => {
226                package.name = extract_string_kwarg(&call, "name").map(truncate_field);
227                package.version = extract_string_kwarg(&call, "version").map(truncate_field);
228                package.purl = package
229                    .name
230                    .as_deref()
231                    .and_then(|name| build_bazel_purl(name, package.version.as_deref()))
232                    .map(truncate_field);
233
234                if let Some(repo_name) =
235                    extract_string_kwarg(&call, "repo_name").map(truncate_field)
236                {
237                    extra_data.insert("repo_name".to_string(), JsonValue::String(repo_name));
238                }
239                if let Some(compatibility_level) = extract_int_kwarg(&call, "compatibility_level") {
240                    extra_data.insert(
241                        "compatibility_level".to_string(),
242                        JsonValue::Number(compatibility_level.into()),
243                    );
244                }
245                if let Some(bazel_compatibility) = extract_kwarg_json(&call, "bazel_compatibility")
246                {
247                    extra_data.insert("bazel_compatibility".to_string(), bazel_compatibility);
248                }
249            }
250            "bazel_dep" => {
251                if let Some(dep) = extract_bazel_dependency(&call) {
252                    dependencies.push(dep);
253                }
254            }
255            "archive_override"
256            | "git_override"
257            | "local_path_override"
258            | "single_version_override"
259            | "multiple_version_override" => {
260                overrides.push(extract_override(function_name, &call));
261            }
262            _ => {}
263        }
264    }
265
266    if package.name.is_none() {
267        return Ok(default_bazel_module_package_data());
268    }
269
270    if !overrides.is_empty() {
271        extra_data.insert("overrides".to_string(), JsonValue::Array(overrides));
272    }
273
274    package.dependencies = dependencies;
275    package.extra_data = (!extra_data.is_empty()).then(|| extra_data.into_iter().collect());
276    Ok(package)
277}
278
279fn parse_starlark_module(filename: &str, content: String) -> Result<AstModule, String> {
280    let dialect = Dialect {
281        enable_top_level_stmt: true,
282        ..Dialect::Standard
283    };
284    AstModule::parse(filename, content, &dialect).map_err(|error| error.to_string())
285}
286
287fn top_level_statements(module: &AstModule) -> &[ast::AstStmt] {
288    match &module.statement().node {
289        ast::StmtP::Statements(statements) => statements,
290        _ => std::slice::from_ref(module.statement()),
291    }
292}
293
294fn extract_call(statement: &ast::AstStmt) -> Option<StarlarkCall<'_>> {
295    match &statement.node {
296        ast::StmtP::Expression(expr) => extract_call_expr(expr),
297        ast::StmtP::Assign(assign) => extract_call_expr(&assign.rhs),
298        _ => None,
299    }
300}
301
302fn extract_call_expr(expr: &ast::AstExpr) -> Option<StarlarkCall<'_>> {
303    match &expr.node {
304        ast::ExprP::Call(func, args) => Some(StarlarkCall { func, args }),
305        _ => None,
306    }
307}
308
309fn extract_call_name<'a>(call: &'a StarlarkCall<'_>) -> Option<&'a str> {
310    match &call.func.node {
311        ast::ExprP::Identifier(identifier) => Some(identifier.node.ident.as_str()),
312        _ => None,
313    }
314}
315
316fn extract_named_kwarg<'a>(call: &'a StarlarkCall<'_>, key: &str) -> Option<&'a ast::AstExpr> {
317    call.args
318        .args
319        .iter()
320        .find_map(|argument| match &argument.node {
321            ast::ArgumentP::Named(name, value) if name.node == key => Some(value),
322            _ => None,
323        })
324}
325
326fn extract_string_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<String> {
327    extract_named_kwarg(call, key).and_then(expr_as_string)
328}
329
330fn extract_string_list_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<Vec<String>> {
331    let expr = extract_named_kwarg(call, key)?;
332    let items = match &expr.node {
333        ast::ExprP::List(items) | ast::ExprP::Tuple(items) => items,
334        _ => return None,
335    };
336    let values: Vec<_> = items
337        .iter()
338        .take(MAX_ITERATION_COUNT)
339        .filter_map(expr_as_string)
340        .collect();
341    (!values.is_empty()).then_some(values)
342}
343
344fn extract_bool_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<bool> {
345    extract_named_kwarg(call, key).and_then(expr_as_bool)
346}
347
348fn extract_int_kwarg(call: &StarlarkCall<'_>, key: &str) -> Option<i64> {
349    extract_named_kwarg(call, key).and_then(expr_as_i64)
350}
351
352fn extract_kwarg_json(call: &StarlarkCall<'_>, key: &str) -> Option<JsonValue> {
353    extract_named_kwarg(call, key)
354        .and_then(|expr| expr_to_json(expr, &mut RecursionGuard::depth_only()))
355}
356
357fn extract_bazel_dependency(call: &StarlarkCall<'_>) -> Option<Dependency> {
358    let name = extract_string_kwarg(call, "name").map(truncate_field)?;
359    let version = extract_string_kwarg(call, "version").map(truncate_field);
360    let is_dev = extract_bool_kwarg(call, "dev_dependency").unwrap_or(false);
361    let mut extra_data = JsonMap::new();
362
363    for field in ["repo_name", "max_compatibility_level", "registry"]
364        .iter()
365        .take(MAX_ITERATION_COUNT)
366    {
367        if let Some(value) = extract_kwarg_json(call, field) {
368            extra_data.insert(field.to_string(), value);
369        }
370    }
371
372    Some(Dependency {
373        purl: build_bazel_purl(&name, version.as_deref()).map(truncate_field),
374        extracted_requirement: version.clone(),
375        scope: Some(if is_dev { "dev" } else { "dependencies" }.to_string()),
376        is_runtime: Some(!is_dev),
377        is_optional: Some(is_dev),
378        is_pinned: Some(version.is_some()),
379        is_direct: Some(true),
380        resolved_package: None,
381        extra_data: (!extra_data.is_empty()).then(|| extra_data.into_iter().collect()),
382    })
383}
384
385fn extract_override(kind: &str, call: &StarlarkCall<'_>) -> JsonValue {
386    let mut override_map = JsonMap::new();
387    override_map.insert("kind".to_string(), JsonValue::String(kind.to_string()));
388    for argument in call.args.args.iter().take(MAX_ITERATION_COUNT) {
389        if let ast::ArgumentP::Named(name, value) = &argument.node
390            && let Some(value) = expr_to_json(value, &mut RecursionGuard::depth_only())
391        {
392            override_map.insert(name.node.clone(), value);
393        }
394    }
395    JsonValue::Object(override_map)
396}
397
398fn expr_as_string(expr: &ast::AstExpr) -> Option<String> {
399    match &expr.node {
400        ast::ExprP::Literal(ast::AstLiteral::String(value)) => Some(value.node.clone()),
401        _ => None,
402    }
403}
404
405fn expr_as_bool(expr: &ast::AstExpr) -> Option<bool> {
406    match &expr.node {
407        ast::ExprP::Identifier(identifier) => match identifier.node.ident.as_str() {
408            "True" => Some(true),
409            "False" => Some(false),
410            _ => None,
411        },
412        _ => None,
413    }
414}
415
416fn expr_as_i64(expr: &ast::AstExpr) -> Option<i64> {
417    match &expr.node {
418        ast::ExprP::Literal(ast::AstLiteral::Int(value)) => value.node.to_string().parse().ok(),
419        _ => None,
420    }
421}
422
423fn expr_to_json(expr: &ast::AstExpr, guard: &mut RecursionGuard<()>) -> Option<JsonValue> {
424    if guard.descend() {
425        return None;
426    }
427    let result = match &expr.node {
428        ast::ExprP::Literal(ast::AstLiteral::String(value)) => {
429            Some(JsonValue::String(value.node.clone()))
430        }
431        ast::ExprP::Literal(ast::AstLiteral::Int(value)) => value
432            .node
433            .to_string()
434            .parse::<i64>()
435            .ok()
436            .map(|value| JsonValue::Number(value.into()))
437            .or_else(|| Some(JsonValue::String(value.node.to_string()))),
438        ast::ExprP::Literal(ast::AstLiteral::Float(value)) => {
439            serde_json::Number::from_f64(value.node).map(JsonValue::Number)
440        }
441        ast::ExprP::Identifier(identifier) => match identifier.node.ident.as_str() {
442            "True" => Some(JsonValue::Bool(true)),
443            "False" => Some(JsonValue::Bool(false)),
444            "None" => Some(JsonValue::Null),
445            _ => None,
446        },
447        ast::ExprP::List(elts) | ast::ExprP::Tuple(elts) => Some(JsonValue::Array(
448            elts.iter()
449                .take(MAX_ITERATION_COUNT)
450                .filter_map(|e| expr_to_json(e, guard))
451                .collect(),
452        )),
453        ast::ExprP::Dict(items) => {
454            let mut map = JsonMap::new();
455            for (key, value) in items.iter().take(MAX_ITERATION_COUNT) {
456                let Some(key) = expr_as_string(key) else {
457                    continue;
458                };
459                if let Some(value) = expr_to_json(value, guard) {
460                    map.insert(key, value);
461                }
462            }
463            Some(JsonValue::Object(map))
464        }
465        _ => None,
466    };
467    guard.ascend();
468    result
469}
470
471fn build_bazel_purl(name: &str, version: Option<&str>) -> Option<String> {
472    let mut purl = PackageUrl::new("bazel", name).ok()?;
473    if let Some(version) = version.filter(|value| !value.trim().is_empty()) {
474        purl.with_version(version).ok()?;
475    }
476    Some(purl.to_string())
477}
478
479fn default_bazel_module_package_data() -> PackageData {
480    PackageData {
481        package_type: Some(BazelModuleParser::PACKAGE_TYPE),
482        datasource_id: Some(DatasourceId::BazelModule),
483        ..Default::default()
484    }
485}
486
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::PackageType;
    use std::path::PathBuf;

    /// Parses an inline BUILD snippet, panicking when it is not valid Starlark.
    fn parse_build(source: &str) -> AstModule {
        parse_starlark_module("<BUILD>", source.to_string()).expect("parse BUILD")
    }

    #[test]
    fn test_is_match() {
        for accepted in ["BUILD", "path/to/BUILD"] {
            assert!(
                BazelBuildParser::is_match(&PathBuf::from(accepted)),
                "{accepted} should match"
            );
        }
        for rejected in ["BUILD.bazel", "build", "BUCK"] {
            assert!(
                !BazelBuildParser::is_match(&PathBuf::from(rejected)),
                "{rejected} should not match"
            );
        }
    }

    #[test]
    fn test_check_rule_name_ending() {
        for accepted in ["cc_binary", "cc_library", "java_binary", "py_library"] {
            assert!(check_rule_name_ending(accepted), "{accepted} should match");
        }
        for rejected in ["filegroup", "load", "cc_test"] {
            assert!(
                !check_rule_name_ending(rejected),
                "{rejected} should not match"
            );
        }
    }

    #[test]
    fn test_fallback_package_data() {
        let build_path = PathBuf::from("/path/to/myproject/BUILD");
        let package = fallback_package_data(&build_path);

        assert_eq!(package.package_type, Some(PackageType::Bazel));
        assert_eq!(package.name, Some("myproject".to_string()));
        assert_eq!(package.purl.as_deref(), Some("pkg:bazel/myproject"));
    }

    #[test]
    fn test_scancode_simple_top_level_allows_direct_calls() {
        let module = parse_build("cc_library(name = \"demo\")\npy_binary(name = \"tool\")\n");

        assert!(is_scancode_simple_top_level_module(&module));
    }

    #[test]
    fn test_scancode_simple_top_level_rejects_attribute_calls() {
        let module = parse_build(
            "selects.config_setting_group(name = \"demo\")\ncc_library(name = \"demo\")\n",
        );

        assert!(!is_scancode_simple_top_level_module(&module));
    }

    #[test]
    fn test_scancode_simple_top_level_rejects_non_call_expressions() {
        let module = parse_build("[(cc_binary(name = \"demo\"),)]\n");

        assert!(!is_scancode_simple_top_level_module(&module));
    }
}
553}