Skip to main content

provenant/parsers/
misc.rs

1//! File type recognizers for various package archives and binary formats.
2//!
3//! This module contains simple file-type recognizers that identify packages by
4//! their file extensions or path patterns. These recognizers do NOT parse file
5//! contents - they only tag files with the appropriate package_type and datasource_id.
6//!
7//! # Implementation Notes
8//!
9//! - All recognizers use the `file_recognizer!` macro to reduce boilerplate
10//! - Recognizers return minimal PackageData with only package_type and datasource_id set
11//! - These correspond to Python's misc.py NonAssemblableDatafileHandler classes
12//! - No actual parsing is performed (Python also has `# TODO: parse me!!!`)
13//! - Some recognizers use magic byte detection for disambiguation (Squashfs, NSIS, InstallShield)
14
15use std::path::Path;
16
17use super::PackageParser;
18use crate::models::{DatasourceId, PackageData, PackageType};
19use crate::utils::magic;
20
21/// Helper macro to define file-type recognizers with minimal boilerplate.
22///
23/// Each recognizer matches specific file patterns and returns a minimal
24/// PackageData structure with only package_type and datasource_id populated.
25///
26/// # Arguments
27///
28/// * `$name` - Struct name for the recognizer
29/// * `$pkg_type` - Package type string (e.g., "jar", "war", "meteor")
30/// * `$datasource` - Datasource ID string (e.g., "java_jar", "meteor_package")
31/// * `$match_fn` - Closure that takes a &Path and returns bool for matching
32macro_rules! file_recognizer {
33    ($name:ident, $pkg_type:expr, $datasource:expr, $match_fn:expr) => {
34        pub struct $name;
35
36        impl PackageParser for $name {
37            const PACKAGE_TYPE: PackageType = $pkg_type;
38
39            fn is_match(path: &Path) -> bool {
40                ($match_fn)(path)
41            }
42
43            fn extract_packages(path: &Path) -> Vec<PackageData> {
44                let _ = path;
45                vec![PackageData {
46                    package_type: Some($pkg_type),
47                    datasource_id: Some($datasource),
48                    ..Default::default()
49                }]
50            }
51        }
52    };
53}
54
55// Java Archives
56
57file_recognizer!(
58    JavaJarRecognizer,
59    PackageType::Jar,
60    DatasourceId::JavaJar,
61    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("jar")
62);
63
64file_recognizer!(
65    IvyXmlRecognizer,
66    PackageType::Ivy,
67    DatasourceId::AntIvyXml,
68    |path: &Path| path.to_str().is_some_and(|p| p.ends_with("/ivy.xml"))
69);
70
71file_recognizer!(
72    JavaWarRecognizer,
73    PackageType::War,
74    DatasourceId::JavaWarArchive,
75    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("war")
76);
77
78file_recognizer!(
79    JavaWarWebXmlRecognizer,
80    PackageType::War,
81    DatasourceId::JavaWarWebXml,
82    |path: &Path| path
83        .to_str()
84        .is_some_and(|p| p.ends_with("/WEB-INF/web.xml") || p.ends_with("WEB-INF/web.xml"))
85);
86
87file_recognizer!(
88    JavaEarRecognizer,
89    PackageType::Ear,
90    DatasourceId::JavaEarArchive,
91    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("ear")
92);
93
94file_recognizer!(
95    JavaEarAppXmlRecognizer,
96    PackageType::Ear,
97    DatasourceId::JavaEarApplicationXml,
98    |path: &Path| path.to_str().is_some_and(
99        |p| p.ends_with("/META-INF/application.xml") || p.ends_with("META-INF/application.xml")
100    )
101);
102
103// Apache Axis2
104
105file_recognizer!(
106    Axis2ModuleXmlRecognizer,
107    PackageType::Axis2,
108    DatasourceId::Axis2ModuleXml,
109    |path: &Path| {
110        path.to_str().is_some_and(|p| {
111            let lower = p.to_lowercase();
112            lower.ends_with("/meta-inf/module.xml") || lower.ends_with("meta-inf/module.xml")
113        })
114    }
115);
116
117file_recognizer!(
118    Axis2MarRecognizer,
119    PackageType::Axis2,
120    DatasourceId::Axis2Mar,
121    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("mar")
122);
123
124// JBoss
125
126file_recognizer!(
127    JBossSarRecognizer,
128    PackageType::JbossService,
129    DatasourceId::JbossSar,
130    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("sar")
131);
132
133file_recognizer!(
134    JBossServiceXmlRecognizer,
135    PackageType::JbossService,
136    DatasourceId::JbossServiceXml,
137    |path: &Path| {
138        path.to_str().is_some_and(|p| {
139            let lower = p.to_lowercase();
140            lower.ends_with("/meta-inf/jboss-service.xml")
141                || lower.ends_with("meta-inf/jboss-service.xml")
142        })
143    }
144);
145
146// Meteor
147
148file_recognizer!(
149    MeteorPackageRecognizer,
150    PackageType::Meteor,
151    DatasourceId::MeteorPackage,
152    |path: &Path| path.to_str().is_some_and(|p| p.ends_with("/package.js"))
153);
154
155// Mobile Apps
156
157file_recognizer!(
158    AndroidApkRecognizer,
159    PackageType::Android,
160    DatasourceId::AndroidApk,
161    |path: &Path| {
162        path.extension()
163            .and_then(|e| e.to_str())
164            .is_some_and(|ext| ext == "apk")
165            && magic::is_zip(path)
166    }
167);
168
169file_recognizer!(
170    AndroidLibraryRecognizer,
171    PackageType::AndroidLib,
172    DatasourceId::AndroidAarLibrary,
173    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("aar")
174);
175
176file_recognizer!(
177    MozillaXpiRecognizer,
178    PackageType::Mozilla,
179    DatasourceId::MozillaXpi,
180    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("xpi")
181);
182
183file_recognizer!(
184    ChromeCrxRecognizer,
185    PackageType::Chrome,
186    DatasourceId::ChromeCrx,
187    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("crx")
188);
189
190file_recognizer!(
191    IosIpaRecognizer,
192    PackageType::Ios,
193    DatasourceId::IosIpa,
194    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("ipa")
195);
196
197// Archives
198
199file_recognizer!(
200    CabArchiveRecognizer,
201    PackageType::Cab,
202    DatasourceId::MicrosoftCabinet,
203    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("cab")
204);
205
206file_recognizer!(
207    SharArchiveRecognizer,
208    PackageType::Shar,
209    DatasourceId::SharShellArchive,
210    |path: &Path| path.extension().and_then(|e| e.to_str()) == Some("shar")
211);
212
213// Disk Images
214
215file_recognizer!(
216    AppleDmgRecognizer,
217    PackageType::Dmg,
218    DatasourceId::AppleDmg,
219    |path: &Path| {
220        path.extension()
221            .and_then(|e| e.to_str())
222            .is_some_and(|ext| ext == "dmg" || ext == "sparseimage")
223    }
224);
225
226file_recognizer!(
227    IsoImageRecognizer,
228    PackageType::Iso,
229    DatasourceId::IsoDiskImage,
230    |path: &Path| {
231        path.extension()
232            .and_then(|e| e.to_str())
233            .is_some_and(|ext| ext == "iso" || ext == "udf" || ext == "img")
234    }
235);
236
237// Installers and Binary Formats (require magic byte detection)
238
239file_recognizer!(
240    SquashfsRecognizer,
241    PackageType::Squashfs,
242    DatasourceId::SquashfsDiskImage,
243    |path: &Path| magic::is_squashfs(path)
244);
245
246file_recognizer!(
247    NsisRecognizer,
248    PackageType::Nsis,
249    DatasourceId::NsisInstaller,
250    |path: &Path| {
251        path.extension()
252            .and_then(|e| e.to_str())
253            .is_some_and(|ext| ext == "exe")
254            && magic::is_nsis_installer(path)
255    }
256);
257
258file_recognizer!(
259    InstallShieldRecognizer,
260    PackageType::Installshield,
261    DatasourceId::InstallshieldInstaller,
262    |path: &Path| {
263        path.extension()
264            .and_then(|e| e.to_str())
265            .is_some_and(|ext| ext == "exe")
266            && magic::is_zip(path)
267    }
268);
269
270crate::register_parser!(
271    "Misc file type recognizers (JAR, WAR, EAR, Android, iOS, Chrome, Mozilla, installers, disk images, etc.)",
272    &[
273        "**/*.jar",
274        "**/ivy.xml",
275        "**/*.war",
276        "**/WEB-INF/web.xml",
277        "**/*.ear",
278        "**/META-INF/application.xml",
279        "**/meta-inf/module.xml",
280        "**/*.mar",
281        "**/*.sar",
282        "**/meta-inf/jboss-service.xml",
283        "**/package.js",
284        "**/*.apk",
285        "**/*.aar",
286        "**/*.xpi",
287        "**/*.crx",
288        "**/*.ipa",
289        "**/*.cab",
290        "**/*.shar",
291        "**/*.dmg",
292        "**/*.sparseimage",
293        "**/*.iso",
294        "**/*.udf",
295        "**/*.img",
296        "**/*.exe",
297    ],
298    "",
299    "",
300    None,
301);