Skip to main content

provenant/parsers/
docker.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::HashMap;
5use std::path::Path;
6
7use crate::parser_warn as warn;
8use serde_json::json;
9
10use crate::models::{DatasourceId, PackageData, PackageType};
11use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
12
13use super::PackageParser;
14use super::license_normalization::normalize_spdx_declared_license;
15use super::metadata::ParserMetadata;
16
/// Package type stamped on every `PackageData` built in this file and
/// exposed as the parser's `PackageParser::PACKAGE_TYPE`.
17const PACKAGE_TYPE: PackageType = PackageType::Docker;
/// Key prefix a `LABEL` entry must carry to be collected; labels whose key
/// does not start with this prefix are ignored by the label parser below.
18const OCI_LABEL_PREFIX: &str = "org.opencontainers.image.";
19
20fn default_package_data() -> PackageData {
21    PackageData {
22        package_type: Some(PACKAGE_TYPE),
23        primary_language: Some("Dockerfile".to_string()),
24        datasource_id: Some(DatasourceId::Dockerfile),
25        ..Default::default()
26    }
27}
28
29pub struct DockerfileParser;
30
31impl PackageParser for DockerfileParser {
32    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
33
34    fn metadata() -> Vec<ParserMetadata> {
35        vec![ParserMetadata {
36            description: "Dockerfile or Containerfile OCI image metadata",
37            file_patterns: &[
38                "**/Dockerfile",
39                "**/dockerfile",
40                "**/Containerfile",
41                "**/containerfile",
42                "**/Containerfile.core",
43                "**/containerfile.core",
44            ],
45            package_type: "docker",
46            primary_language: "Dockerfile",
47            documentation_url: Some(
48                "https://github.com/opencontainers/image-spec/blob/main/annotations.md",
49            ),
50        }]
51    }
52
53    fn is_match(path: &Path) -> bool {
54        path.file_name()
55            .and_then(|name| name.to_str())
56            .map(|name| name.to_ascii_lowercase())
57            .is_some_and(|name| {
58                matches!(
59                    name.as_str(),
60                    "dockerfile" | "containerfile" | "containerfile.core"
61                )
62            })
63    }
64
65    fn extract_packages(path: &Path) -> Vec<PackageData> {
66        let content = match read_file_to_string(path, None) {
67            Ok(content) => content,
68            Err(error) => {
69                warn!("Failed to read Dockerfile {:?}: {}", path, error);
70                return vec![default_package_data()];
71            }
72        };
73
74        vec![parse_dockerfile(&content)]
75    }
76}
77
78pub(crate) fn parse_dockerfile(content: &str) -> PackageData {
79    let oci_labels = extract_oci_labels(content);
80    let extra_data = (!oci_labels.is_empty())
81        .then(|| HashMap::from([("oci_labels".to_string(), json!(oci_labels))]));
82    let extracted_license_statement = oci_labels.get("org.opencontainers.image.licenses").cloned();
83    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
84        normalize_spdx_declared_license(extracted_license_statement.as_deref());
85
86    PackageData {
87        package_type: Some(PACKAGE_TYPE),
88        primary_language: Some("Dockerfile".to_string()),
89        datasource_id: Some(DatasourceId::Dockerfile),
90        name: oci_labels
91            .get("org.opencontainers.image.title")
92            .map(|v| truncate_field(v.clone())),
93        description: oci_labels
94            .get("org.opencontainers.image.description")
95            .map(|v| truncate_field(v.clone())),
96        homepage_url: oci_labels
97            .get("org.opencontainers.image.url")
98            .map(|v| truncate_field(v.clone())),
99        vcs_url: oci_labels
100            .get("org.opencontainers.image.source")
101            .map(|v| truncate_field(v.clone())),
102        version: oci_labels
103            .get("org.opencontainers.image.version")
104            .map(|v| truncate_field(v.clone())),
105        declared_license_expression,
106        declared_license_expression_spdx,
107        license_detections,
108        extracted_license_statement: extracted_license_statement.map(truncate_field),
109        extra_data,
110        ..Default::default()
111    }
112}
113
114fn extract_oci_labels(content: &str) -> HashMap<String, String> {
115    let mut labels = HashMap::new();
116
117    for instruction in logical_lines(content) {
118        let trimmed = instruction.trim_start();
119        if !starts_with_instruction(trimmed, "LABEL") {
120            continue;
121        }
122
123        parse_label_instruction(trimmed[5..].trim_start(), &mut labels);
124    }
125
126    labels
127}
128
129fn logical_lines(content: &str) -> Vec<String> {
130    let mut lines = Vec::new();
131    let mut current = String::new();
132    let mut iterations = 0usize;
133
134    for raw_line in content.lines() {
135        iterations += 1;
136        if iterations > MAX_ITERATION_COUNT {
137            warn!("logical_lines: exceeded MAX_ITERATION_COUNT, truncating");
138            break;
139        }
140        let line = raw_line.trim_end();
141        let trimmed = line.trim();
142
143        if current.is_empty() && (trimmed.is_empty() || trimmed.starts_with('#')) {
144            continue;
145        }
146
147        let has_continuation = ends_with_unescaped_backslash(line);
148        let segment = if has_continuation {
149            let mut without_backslash = line.trim_end().to_string();
150            without_backslash.pop();
151            without_backslash.trim().to_string()
152        } else {
153            trimmed.to_string()
154        };
155
156        if !segment.is_empty() {
157            if !current.is_empty() {
158                current.push(' ');
159            }
160            current.push_str(&segment);
161        }
162
163        if !has_continuation && !current.is_empty() {
164            lines.push(current.trim().to_string());
165            current.clear();
166        }
167    }
168
169    if !current.is_empty() {
170        lines.push(current.trim().to_string());
171    }
172
173    lines
174}
175
/// Reports whether `line` ends with an odd number of consecutive backslashes,
/// i.e. whether its final backslash is not itself escaped by a preceding one.
fn ends_with_unescaped_backslash(line: &str) -> bool {
    // A backslash is one byte, so the byte-length difference is the run length.
    let without_run = line.trim_end_matches('\\');
    let run_length = line.len() - without_run.len();
    run_length % 2 != 0
}
180
/// Reports whether `line` begins with the keyword `instruction`, compared
/// ASCII-case-insensitively and followed by whitespace or the end of the line.
///
/// `line` is expected to have no leading whitespace. The prefix is taken with
/// the checked `str::get`, so a line whose byte at `instruction.len()` falls
/// inside a multi-byte character (for example one starting with CJK text) is
/// reported as a non-match instead of panicking on a bad slice boundary.
fn starts_with_instruction(line: &str, instruction: &str) -> bool {
    // `get` returns `None` when the line is too short or when the cut would
    // split a UTF-8 sequence; neither case can be the keyword.
    let Some(head) = line.get(..instruction.len()) else {
        return false;
    };
    if !head.eq_ignore_ascii_case(instruction) {
        return false;
    }

    // `get` succeeded, so `instruction.len()` is a valid char boundary.
    line[instruction.len()..]
        .chars()
        .next()
        .is_none_or(|next| next.is_whitespace())
}
192
193fn parse_label_instruction(rest: &str, labels: &mut HashMap<String, String>) {
194    let tokens = tokenize_label_arguments(rest);
195    if tokens.is_empty() {
196        return;
197    }
198
199    if tokens.first().is_some_and(|token| token.contains('=')) {
200        for (i, token) in tokens.into_iter().enumerate() {
201            if i >= MAX_ITERATION_COUNT {
202                warn!("parse_label_instruction: exceeded MAX_ITERATION_COUNT, truncating");
203                break;
204            }
205            let Some((key, value)) = token.split_once('=') else {
206                continue;
207            };
208            let key = key.trim();
209            if key.starts_with(OCI_LABEL_PREFIX) {
210                labels.insert(key.to_string(), truncate_field(value.trim().to_string()));
211            }
212        }
213        return;
214    }
215
216    if let Some((key, values)) = tokens.split_first()
217        && key.starts_with(OCI_LABEL_PREFIX)
218    {
219        labels.insert(
220            key.to_string(),
221            truncate_field(values.join(" ").trim().to_string()),
222        );
223    }
224}
225
226fn tokenize_label_arguments(input: &str) -> Vec<String> {
227    let mut tokens = Vec::new();
228    let mut current = String::new();
229    let mut chars = input.chars().peekable();
230    let mut quote: Option<char> = None;
231    let mut iterations = 0usize;
232
233    while let Some(ch) = chars.next() {
234        iterations += 1;
235        if iterations > MAX_ITERATION_COUNT {
236            warn!("tokenize_label_arguments: exceeded MAX_ITERATION_COUNT, truncating");
237            break;
238        }
239        match quote {
240            Some(current_quote) => {
241                if ch == '\\' {
242                    if let Some(next) = chars.next() {
243                        current.push(next);
244                    }
245                } else if ch == current_quote {
246                    quote = None;
247                } else {
248                    current.push(ch);
249                }
250            }
251            None => match ch {
252                '"' | '\'' => quote = Some(ch),
253                '\\' => {
254                    if let Some(next) = chars.next() {
255                        current.push(next);
256                    }
257                }
258                whitespace if whitespace.is_whitespace() => {
259                    if !current.is_empty() {
260                        tokens.push(std::mem::take(&mut current));
261                    }
262                }
263                _ => current.push(ch),
264            },
265        }
266    }
267
268    if !current.is_empty() {
269        tokens.push(current);
270    }
271
272    tokens
273}