Skip to main content

provenant/parsers/
docker.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::HashMap;
5use std::path::Path;
6
7use crate::parser_warn as warn;
8use serde_json::json;
9
10use crate::models::{DatasourceId, PackageData, PackageType};
11use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
12
13use super::PackageParser;
14use super::license_normalization::normalize_spdx_declared_license;
15
/// Package type stamped on every `PackageData` produced by this parser.
const PACKAGE_TYPE: PackageType = PackageType::Docker;
/// Key prefix a `LABEL` entry must start with to be collected by this parser.
const OCI_LABEL_PREFIX: &str = "org.opencontainers.image.";
18
19fn default_package_data() -> PackageData {
20    PackageData {
21        package_type: Some(PACKAGE_TYPE),
22        primary_language: Some("Dockerfile".to_string()),
23        datasource_id: Some(DatasourceId::Dockerfile),
24        ..Default::default()
25    }
26}
27
28pub struct DockerfileParser;
29
30impl PackageParser for DockerfileParser {
31    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
32
33    fn is_match(path: &Path) -> bool {
34        path.file_name()
35            .and_then(|name| name.to_str())
36            .map(|name| name.to_ascii_lowercase())
37            .is_some_and(|name| {
38                matches!(
39                    name.as_str(),
40                    "dockerfile" | "containerfile" | "containerfile.core"
41                )
42            })
43    }
44
45    fn extract_packages(path: &Path) -> Vec<PackageData> {
46        let content = match read_file_to_string(path, None) {
47            Ok(content) => content,
48            Err(error) => {
49                warn!("Failed to read Dockerfile {:?}: {}", path, error);
50                return vec![default_package_data()];
51            }
52        };
53
54        vec![parse_dockerfile(&content)]
55    }
56}
57
58pub(crate) fn parse_dockerfile(content: &str) -> PackageData {
59    let oci_labels = extract_oci_labels(content);
60    let extra_data = (!oci_labels.is_empty())
61        .then(|| HashMap::from([("oci_labels".to_string(), json!(oci_labels))]));
62    let extracted_license_statement = oci_labels.get("org.opencontainers.image.licenses").cloned();
63    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
64        normalize_spdx_declared_license(extracted_license_statement.as_deref());
65
66    PackageData {
67        package_type: Some(PACKAGE_TYPE),
68        primary_language: Some("Dockerfile".to_string()),
69        datasource_id: Some(DatasourceId::Dockerfile),
70        name: oci_labels
71            .get("org.opencontainers.image.title")
72            .map(|v| truncate_field(v.clone())),
73        description: oci_labels
74            .get("org.opencontainers.image.description")
75            .map(|v| truncate_field(v.clone())),
76        homepage_url: oci_labels
77            .get("org.opencontainers.image.url")
78            .map(|v| truncate_field(v.clone())),
79        vcs_url: oci_labels
80            .get("org.opencontainers.image.source")
81            .map(|v| truncate_field(v.clone())),
82        version: oci_labels
83            .get("org.opencontainers.image.version")
84            .map(|v| truncate_field(v.clone())),
85        declared_license_expression,
86        declared_license_expression_spdx,
87        license_detections,
88        extracted_license_statement: extracted_license_statement.map(truncate_field),
89        extra_data,
90        ..Default::default()
91    }
92}
93
94fn extract_oci_labels(content: &str) -> HashMap<String, String> {
95    let mut labels = HashMap::new();
96
97    for instruction in logical_lines(content) {
98        let trimmed = instruction.trim_start();
99        if !starts_with_instruction(trimmed, "LABEL") {
100            continue;
101        }
102
103        parse_label_instruction(trimmed[5..].trim_start(), &mut labels);
104    }
105
106    labels
107}
108
109fn logical_lines(content: &str) -> Vec<String> {
110    let mut lines = Vec::new();
111    let mut current = String::new();
112    let mut iterations = 0usize;
113
114    for raw_line in content.lines() {
115        iterations += 1;
116        if iterations > MAX_ITERATION_COUNT {
117            warn!("logical_lines: exceeded MAX_ITERATION_COUNT, truncating");
118            break;
119        }
120        let line = raw_line.trim_end();
121        let trimmed = line.trim();
122
123        if current.is_empty() && (trimmed.is_empty() || trimmed.starts_with('#')) {
124            continue;
125        }
126
127        let has_continuation = ends_with_unescaped_backslash(line);
128        let segment = if has_continuation {
129            let mut without_backslash = line.trim_end().to_string();
130            without_backslash.pop();
131            without_backslash.trim().to_string()
132        } else {
133            trimmed.to_string()
134        };
135
136        if !segment.is_empty() {
137            if !current.is_empty() {
138                current.push(' ');
139            }
140            current.push_str(&segment);
141        }
142
143        if !has_continuation && !current.is_empty() {
144            lines.push(current.trim().to_string());
145            current.clear();
146        }
147    }
148
149    if !current.is_empty() {
150        lines.push(current.trim().to_string());
151    }
152
153    lines
154}
155
/// Reports whether `line` ends with an odd-length run of backslashes, i.e.
/// whether its final backslash is not itself escaped by a preceding one.
fn ends_with_unescaped_backslash(line: &str) -> bool {
    // A backslash is a single byte, so the byte-length difference equals the
    // number of trailing backslashes.
    let without_run = line.trim_end_matches('\\');
    let run_length = line.len() - without_run.len();
    run_length % 2 != 0
}
160
/// Reports whether `line` begins with the keyword `instruction`, compared
/// ASCII case-insensitively, followed by either end-of-line or whitespace.
///
/// `line` is expected to have its leading whitespace already removed.
///
/// The prefix is taken with `str::get` rather than direct slicing: a line
/// whose byte at `instruction.len()` falls inside a multi-byte UTF-8
/// character (for example `"RUN é"` checked against `"LABEL"`) cannot match
/// and must yield `false` instead of panicking.
fn starts_with_instruction(line: &str, instruction: &str) -> bool {
    let keyword_len = instruction.len();

    // `None` means the line is too short or the cut is not a char boundary.
    let Some(prefix) = line.get(..keyword_len) else {
        return false;
    };
    if !prefix.eq_ignore_ascii_case(instruction) {
        return false;
    }

    // `keyword_len` is a verified char boundary here, so this slice is safe.
    line[keyword_len..]
        .chars()
        .next()
        .is_none_or(char::is_whitespace)
}
172
173fn parse_label_instruction(rest: &str, labels: &mut HashMap<String, String>) {
174    let tokens = tokenize_label_arguments(rest);
175    if tokens.is_empty() {
176        return;
177    }
178
179    if tokens.first().is_some_and(|token| token.contains('=')) {
180        for (i, token) in tokens.into_iter().enumerate() {
181            if i >= MAX_ITERATION_COUNT {
182                warn!("parse_label_instruction: exceeded MAX_ITERATION_COUNT, truncating");
183                break;
184            }
185            let Some((key, value)) = token.split_once('=') else {
186                continue;
187            };
188            let key = key.trim();
189            if key.starts_with(OCI_LABEL_PREFIX) {
190                labels.insert(key.to_string(), truncate_field(value.trim().to_string()));
191            }
192        }
193        return;
194    }
195
196    if let Some((key, values)) = tokens.split_first()
197        && key.starts_with(OCI_LABEL_PREFIX)
198    {
199        labels.insert(
200            key.to_string(),
201            truncate_field(values.join(" ").trim().to_string()),
202        );
203    }
204}
205
206fn tokenize_label_arguments(input: &str) -> Vec<String> {
207    let mut tokens = Vec::new();
208    let mut current = String::new();
209    let mut chars = input.chars().peekable();
210    let mut quote: Option<char> = None;
211    let mut iterations = 0usize;
212
213    while let Some(ch) = chars.next() {
214        iterations += 1;
215        if iterations > MAX_ITERATION_COUNT {
216            warn!("tokenize_label_arguments: exceeded MAX_ITERATION_COUNT, truncating");
217            break;
218        }
219        match quote {
220            Some(current_quote) => {
221                if ch == '\\' {
222                    if let Some(next) = chars.next() {
223                        current.push(next);
224                    }
225                } else if ch == current_quote {
226                    quote = None;
227                } else {
228                    current.push(ch);
229                }
230            }
231            None => match ch {
232                '"' | '\'' => quote = Some(ch),
233                '\\' => {
234                    if let Some(next) = chars.next() {
235                        current.push(next);
236                    }
237                }
238                whitespace if whitespace.is_whitespace() => {
239                    if !current.is_empty() {
240                        tokens.push(std::mem::take(&mut current));
241                    }
242                }
243                _ => current.push(ch),
244            },
245        }
246    }
247
248    if !current.is_empty() {
249        tokens.push(current);
250    }
251
252    tokens
253}
254
// Registers this parser's metadata through the crate-level `register_parser!`
// macro.
// NOTE(review): the argument meanings below are inferred from their values
// (description, path globs, package type, primary language, reference URL);
// confirm against the macro definition.
// NOTE(review): the globs list only two spellings per file name, whereas
// `DockerfileParser::is_match` ignores ASCII case entirely; verify whether
// these are meant to agree.
crate::register_parser!(
    "Dockerfile or Containerfile OCI image metadata",
    &[
        "**/Dockerfile",
        "**/dockerfile",
        "**/Containerfile",
        "**/containerfile",
        "**/Containerfile.core",
        "**/containerfile.core",
    ],
    "docker",
    "Dockerfile",
    Some("https://github.com/opencontainers/image-spec/blob/main/annotations.md"),
);