use crate::error::ParaglobError;
use crate::DatabaseBuilder;
use matchy_data_format::DataValue;
use matchy_match_mode::MatchMode;
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Deserializes an attribute value of any JSON type into a `String`.
///
/// Strings are returned unchanged; numbers and booleans are rendered with
/// `to_string()`. `null`, arrays and objects all collapse to an empty string
/// instead of producing an error.
fn deserialize_value<'de, D>(deserializer: D) -> Result<String, D::Error>
where
    D: Deserializer<'de>,
{
    use serde_json::Value;

    let text = match Value::deserialize(deserializer)? {
        Value::String(text) => text,
        Value::Number(number) => number.to_string(),
        Value::Bool(flag) => flag.to_string(),
        // Non-scalar (or null) payloads have no textual form here.
        Value::Null | Value::Array(_) | Value::Object(_) => String::new(),
    };
    Ok(text)
}
/// Deserializes an optional `u8` that may be encoded as a JSON number or as a
/// decimal string.
///
/// * `null` yields `Ok(None)`.
/// * A number must be a non-negative integer that fits in `u8`.
/// * A string must parse as a `u8`.
///
/// # Errors
/// Returns a custom deserialization error for out-of-range numbers,
/// unparsable strings, and any other JSON type.
fn deserialize_u8_flexible<'de, D>(deserializer: D) -> Result<Option<u8>, D::Error>
where
    D: Deserializer<'de>,
{
    use serde::de::Error;
    use serde_json::Value;

    match Value::deserialize(deserializer)? {
        Value::Null => Ok(None),
        Value::Number(number) => {
            // `as_u64` rejects negatives and floats; `try_from` rejects > 255.
            let narrowed = number.as_u64().and_then(|raw| u8::try_from(raw).ok());
            match narrowed {
                Some(byte) => Ok(Some(byte)),
                None => Err(Error::custom("number out of u8 range")),
            }
        }
        Value::String(text) => match text.parse::<u8>() {
            Ok(byte) => Ok(Some(byte)),
            Err(_) => Err(Error::custom("invalid u8 string")),
        },
        Value::Bool(_) | Value::Array(_) | Value::Object(_) => {
            Err(Error::custom("expected number or string for u8"))
        }
    }
}
/// Deserializes an optional `u64` that may be encoded as a JSON number or as a
/// decimal string.
///
/// * `null` yields `Ok(None)`.
/// * A number must be representable as `u64` (non-negative integer).
/// * A string must parse as a `u64`.
///
/// # Errors
/// Returns a custom deserialization error for unrepresentable numbers,
/// unparsable strings, and any other JSON type.
fn deserialize_u64_flexible<'de, D>(deserializer: D) -> Result<Option<u64>, D::Error>
where
    D: Deserializer<'de>,
{
    use serde::de::Error;
    use serde_json::Value;

    match Value::deserialize(deserializer)? {
        Value::Null => Ok(None),
        Value::Number(number) => match number.as_u64() {
            Some(raw) => Ok(Some(raw)),
            None => Err(Error::custom("number out of u64 range")),
        },
        Value::String(text) => match text.parse::<u64>() {
            Ok(raw) => Ok(Some(raw)),
            Err(_) => Err(Error::custom("invalid u64 string")),
        },
        Value::Bool(_) | Value::Array(_) | Value::Object(_) => {
            Err(Error::custom("expected number or string for u64"))
        }
    }
}
/// A single MISP event, i.e. the object stored under the `Event` key of a
/// `MispDocument`.
///
/// Every scalar field is optional and the collections default to empty, so a
/// sparse event still deserializes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MispEvent {
/// Event UUID, if present; exported as `event_uuid` metadata.
#[serde(default)]
pub uuid: Option<String>,
/// Free-text event summary; exported as `event_info` / `description` metadata.
#[serde(default)]
pub info: Option<String>,
/// Numeric threat level. The importer maps 1/2/3 to High/Medium/Low.
/// Accepts a JSON number or a numeric string (`deserialize_u8_flexible`).
#[serde(default, deserialize_with = "deserialize_u8_flexible")]
pub threat_level_id: Option<u8>,
/// Numeric analysis state. The importer maps 0/1/2 to Initial/Ongoing/Complete.
/// Accepts a JSON number or a numeric string (`deserialize_u8_flexible`).
#[serde(default, deserialize_with = "deserialize_u8_flexible")]
pub analysis: Option<u8>,
/// Event date string. The ThreatDB export appends `T00:00:00Z` to it, so it
/// is presumably `YYYY-MM-DD` — TODO confirm against the feed format.
#[serde(default)]
pub date: Option<String>,
/// Event timestamp as a number or numeric string
/// (`deserialize_u64_flexible`); presumably Unix seconds — TODO confirm.
#[serde(default, deserialize_with = "deserialize_u64_flexible")]
pub timestamp: Option<u64>,
/// Optional `published` flag from the event.
#[serde(default)]
pub published: Option<bool>,
/// Organisation block read from the `Orgc` key; its name is exported as the
/// `org_name` / `source` metadata.
#[serde(rename = "Orgc", default)]
pub orgc: Option<MispOrg>,
/// Event-level tags, read from the `Tag` key.
#[serde(rename = "Tag", default)]
pub tags: Vec<MispTag>,
/// Attributes attached directly to the event, read from the `Attribute` key.
#[serde(rename = "Attribute", default)]
pub attributes: Vec<MispAttribute>,
/// Objects (groups of attributes), read from the `Object` key.
#[serde(rename = "Object", default)]
pub objects: Vec<MispObject>,
}
/// Organisation block of a MISP event (the `Orgc` key of `MispEvent`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MispOrg {
/// Organisation UUID, if present.
#[serde(default)]
pub uuid: Option<String>,
/// Organisation name, if present; used by the importer as the event's
/// `org_name` / `source` metadata.
#[serde(default)]
pub name: Option<String>,
}
/// A tag attached to an event or to an attribute.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MispTag {
/// Tag name (required). Names beginning with `tlp:` are mapped to a TLP
/// label by the ThreatDB export.
pub name: String,
/// Optional `colour` string carried with the tag.
#[serde(default)]
pub colour: Option<String>,
/// Optional `exportable` flag carried with the tag.
#[serde(default)]
pub exportable: Option<bool>,
}
/// A single MISP attribute: a typed indicator value plus optional context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MispAttribute {
/// Attribute UUID, if present.
#[serde(default)]
pub uuid: Option<String>,
/// Attribute type, read from the `type` key (required). `extract_indicators`
/// matches on this string to decide how `value` is indexed.
#[serde(rename = "type")]
pub attribute_type: String,
/// Attribute value (required). Any JSON scalar is coerced to a string;
/// `null`, arrays and objects become an empty string (`deserialize_value`).
#[serde(deserialize_with = "deserialize_value")]
pub value: String,
/// Optional category; exported as `category` metadata.
#[serde(default)]
pub category: Option<String>,
/// Optional `to_ids` flag; exported as `to_ids` metadata in the full export.
#[serde(default)]
pub to_ids: Option<bool>,
/// Optional free-text comment; empty comments are ignored by the importer.
#[serde(default)]
pub comment: Option<String>,
/// Attribute timestamp as a number or numeric string
/// (`deserialize_u64_flexible`); presumably Unix seconds — TODO confirm.
#[serde(default, deserialize_with = "deserialize_u64_flexible")]
pub timestamp: Option<u64>,
/// Optional `object_relation` string; not referenced elsewhere in the
/// visible importer code.
#[serde(default)]
pub object_relation: Option<String>,
/// Attribute-level tags, read from the `Tag` key.
#[serde(rename = "Tag", default)]
pub tags: Vec<MispTag>,
}
/// A MISP object: a named group of attributes inside an event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MispObject {
/// Object UUID, if present.
#[serde(default)]
pub uuid: Option<String>,
/// Object name (required); exported as `object_type` metadata in the full
/// export.
pub name: String,
/// Optional value of the `meta-category` key.
#[serde(rename = "meta-category", default)]
pub meta_category: Option<String>,
/// Optional object description.
#[serde(default)]
pub description: Option<String>,
/// Optional comment; exported as `object_comment` metadata in the full
/// export.
#[serde(default)]
pub comment: Option<String>,
/// Object timestamp as a number or numeric string
/// (`deserialize_u64_flexible`); presumably Unix seconds — TODO confirm.
#[serde(default, deserialize_with = "deserialize_u64_flexible")]
pub timestamp: Option<u64>,
/// Attributes belonging to this object, read from the `Attribute` key.
#[serde(rename = "Attribute", default)]
pub attributes: Vec<MispAttribute>,
}
/// Top-level shape of a MISP JSON file: an object with a single required
/// `Event` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MispDocument {
/// The event payload, read from the `Event` key.
#[serde(rename = "Event")]
pub event: MispEvent,
}
/// Holds parsed MISP events and converts them into a `DatabaseBuilder`.
pub struct MispImporter {
// Events collected by `from_json` / `from_files`, in input order.
events: Vec<MispEvent>,
}
impl MispImporter {
/// Builds an importer holding the single event parsed from a MISP JSON string.
///
/// # Errors
/// Returns `ParaglobError::InvalidPattern` when `json` does not deserialize
/// as a `MispDocument`.
pub fn from_json(json: &str) -> Result<Self, ParaglobError> {
    match serde_json::from_str::<MispDocument>(json) {
        Ok(document) => Ok(Self {
            events: vec![document.event],
        }),
        Err(e) => Err(ParaglobError::InvalidPattern(format!(
            "Failed to parse MISP JSON: {e}"
        ))),
    }
}
/// Reads MISP JSON files one at a time and feeds each event straight into a
/// `DatabaseBuilder`, so only one parsed event is held in memory at once.
///
/// * `paths` - files to import, processed in order.
/// * `match_mode` - match mode passed to `DatabaseBuilder::new`.
/// * `minimal_metadata` - when `true`, events go through
///   `process_event_minimal`; otherwise through `process_event`.
///
/// Files that fail to parse are skipped with a warning on stderr when they are
/// named `manifest.json` / `hashes.csv` or do not look like a MISP event.
///
/// # Errors
/// Returns `ParaglobError::InvalidPattern` when a file cannot be read (the
/// message names the offending path), when a file that looks like a MISP event
/// fails to parse, or when no file produced an event. Errors from processing
/// an event are propagated unchanged.
pub fn build_from_files<P: AsRef<Path>>(
    paths: &[P],
    match_mode: MatchMode,
    minimal_metadata: bool,
) -> Result<DatabaseBuilder, ParaglobError> {
    let mut builder = DatabaseBuilder::new(match_mode)
        .with_database_type("MISP-ThreatIntel")
        .with_description("en", "Threat intelligence database from MISP JSON feeds");
    let mut skipped_files: Vec<(String, &'static str)> = Vec::new();
    let mut processed_events = 0usize;

    // The processing methods take `&self` but never read `self.events`, so a
    // single empty importer is enough to dispatch every event.
    let importer = Self { events: Vec::new() };

    for path in paths {
        let path_ref = path.as_ref();
        // Name the path in the error: with many inputs the bare I/O error
        // does not tell the caller which file failed.
        let json = fs::read_to_string(path_ref).map_err(|e| {
            ParaglobError::InvalidPattern(format!(
                "Failed to read file {}: {e}",
                path_ref.display()
            ))
        })?;

        let e = match serde_json::from_str::<MispDocument>(&json) {
            Ok(doc) => {
                let event = doc.event;
                if minimal_metadata {
                    importer.process_event_minimal(&event, &mut builder)?;
                } else {
                    importer.process_event(&event, &mut builder)?;
                }
                processed_events += 1;
                continue;
            }
            Err(e) => e,
        };

        let filename = path_ref
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("unknown");
        if filename == "manifest.json" || filename == "hashes.csv" {
            // Known feed bookkeeping files: never an error.
            skipped_files.push((filename.to_string(), "metadata file"));
        } else if json.trim_start().starts_with('{') && json.contains("\"Event\"") {
            // Looks like a MISP event but is malformed: surface the failure.
            return Err(ParaglobError::InvalidPattern(format!(
                "Failed to parse MISP JSON in {filename}: {e}"
            )));
        } else {
            skipped_files.push((filename.to_string(), "not a MISP event"));
        }
    }

    if !skipped_files.is_empty() {
        eprintln!("Warning: Skipped {} non-MISP file(s):", skipped_files.len());
        for (filename, reason) in &skipped_files {
            eprintln!(" - {filename}: {reason}");
        }
    }
    if processed_events == 0 {
        return Err(ParaglobError::InvalidPattern(
            "No valid MISP events found in provided files".to_string(),
        ));
    }
    Ok(builder)
}
/// Loads every MISP event found in `paths` into a new importer.
///
/// Files that fail to parse are skipped with a warning on stderr when they are
/// named `manifest.json` / `hashes.csv` or do not look like a MISP event.
///
/// # Errors
/// Returns `ParaglobError::InvalidPattern` when a file cannot be read (the
/// message names the offending path), when a file that looks like a MISP event
/// fails to parse, or when no file produced an event.
pub fn from_files<P: AsRef<Path>>(paths: &[P]) -> Result<Self, ParaglobError> {
    let mut events = Vec::new();
    let mut skipped_files: Vec<(String, &'static str)> = Vec::new();

    for path in paths {
        let path_ref = path.as_ref();
        // Name the path in the error: with many inputs the bare I/O error
        // does not tell the caller which file failed.
        let json = fs::read_to_string(path_ref).map_err(|e| {
            ParaglobError::InvalidPattern(format!(
                "Failed to read file {}: {e}",
                path_ref.display()
            ))
        })?;

        let e = match serde_json::from_str::<MispDocument>(&json) {
            Ok(doc) => {
                events.push(doc.event);
                continue;
            }
            Err(e) => e,
        };

        let filename = path_ref
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("unknown");
        if filename == "manifest.json" || filename == "hashes.csv" {
            // Known feed bookkeeping files: never an error.
            skipped_files.push((filename.to_string(), "metadata file"));
        } else if json.trim_start().starts_with('{') && json.contains("\"Event\"") {
            // Looks like a MISP event but is malformed: surface the failure.
            return Err(ParaglobError::InvalidPattern(format!(
                "Failed to parse MISP JSON in {filename}: {e}"
            )));
        } else {
            skipped_files.push((filename.to_string(), "not a MISP event"));
        }
    }

    if !skipped_files.is_empty() {
        eprintln!("Warning: Skipped {} non-MISP file(s):", skipped_files.len());
        for (filename, reason) in &skipped_files {
            eprintln!(" - {filename}: {reason}");
        }
    }
    if events.is_empty() {
        return Err(ParaglobError::InvalidPattern(
            "No valid MISP events found in provided files".to_string(),
        ));
    }
    Ok(Self { events })
}
/// Builds a database with full metadata; shorthand for
/// `build_database_with_options(match_mode, false)`.
///
/// # Errors
/// Propagates any error from `build_database_with_options`.
pub fn build_database(&self, match_mode: MatchMode) -> Result<DatabaseBuilder, ParaglobError> {
    let minimal_metadata = false;
    self.build_database_with_options(match_mode, minimal_metadata)
}
/// Feeds every loaded event into a fresh `MISP-ThreatIntel` builder.
///
/// When `minimal_metadata` is `true` each event goes through
/// `process_event_minimal`; otherwise through `process_event`.
///
/// # Errors
/// Stops at, and returns, the first error raised while processing an event.
pub fn build_database_with_options(
    &self,
    match_mode: MatchMode,
    minimal_metadata: bool,
) -> Result<DatabaseBuilder, ParaglobError> {
    let mut builder = DatabaseBuilder::new(match_mode)
        .with_database_type("MISP-ThreatIntel")
        .with_description("en", "Threat intelligence database from MISP JSON feeds");

    self.events.iter().try_for_each(|event| {
        if minimal_metadata {
            self.process_event_minimal(event, &mut builder)
        } else {
            self.process_event(event, &mut builder)
        }
    })?;

    Ok(builder)
}
/// Feeds every loaded event into a fresh `ThreatDB-v1` builder using the
/// ThreatDB metadata layout (`process_event_threatdb`).
///
/// # Errors
/// Stops at, and returns, the first error raised while processing an event.
pub fn build_database_threatdb(
    &self,
    match_mode: MatchMode,
) -> Result<DatabaseBuilder, ParaglobError> {
    let mut builder = DatabaseBuilder::new(match_mode)
        .with_database_type("ThreatDB-v1")
        .with_description("en", "Threat intelligence database from MISP JSON feeds");

    self.events
        .iter()
        .try_for_each(|event| self.process_event_threatdb(event, &mut builder))?;

    Ok(builder)
}
/// Adds all indicators of one event using ThreatDB-style metadata.
///
/// Event-level attributes are handled first, then the attributes of each
/// object in order; all of them share the same event metadata.
fn process_event_threatdb(
    &self,
    event: &MispEvent,
    builder: &mut DatabaseBuilder,
) -> Result<(), ParaglobError> {
    let shared = self.build_event_metadata_threatdb(event);
    let nested = event
        .objects
        .iter()
        .flat_map(|object| object.attributes.iter());

    for attribute in event.attributes.iter().chain(nested) {
        self.process_attribute_threatdb(attribute, &shared, builder)?;
    }
    Ok(())
}
/// Builds the event-level metadata map for the ThreatDB export.
///
/// Keys produced: `threat_level` (always), `source` (organisation name or
/// `misp-import`), `description`, `first_seen`, `tags` and `tlp` (the latter
/// four only when the event provides the data).
fn build_event_metadata_threatdb(&self, event: &MispEvent) -> HashMap<String, DataValue> {
    let mut metadata: HashMap<String, DataValue> = HashMap::new();

    let level = match event.threat_level_id {
        Some(1) => "high",
        Some(2) => "medium",
        Some(3) => "low",
        _ => "unknown",
    };
    metadata.insert(
        "threat_level".to_string(),
        DataValue::String(level.to_string()),
    );

    // Fall back to a fixed source label when the organisation name is missing.
    let source = event
        .orgc
        .as_ref()
        .and_then(|org| org.name.clone())
        .unwrap_or_else(|| "misp-import".to_string());
    metadata.insert("source".to_string(), DataValue::String(source));

    if let Some(info) = event.info.as_ref() {
        metadata.insert("description".to_string(), DataValue::String(info.clone()));
    }

    if let Some(date) = event.date.as_ref() {
        metadata.insert(
            "first_seen".to_string(),
            DataValue::String(format!("{date}T00:00:00Z")),
        );
    }

    if !event.tags.is_empty() {
        let names: Vec<DataValue> = event
            .tags
            .iter()
            .map(|tag| DataValue::String(tag.name.clone()))
            .collect();
        metadata.insert("tags".to_string(), DataValue::Array(names));

        // The first tag carrying a recognised TLP marking wins.
        let marking = event
            .tags
            .iter()
            .find_map(|tag| match tag.name.to_lowercase().as_str() {
                "tlp:white" | "tlp:clear" => Some("CLEAR"),
                "tlp:green" => Some("GREEN"),
                "tlp:amber" => Some("AMBER"),
                "tlp:amber+strict" => Some("AMBER+STRICT"),
                "tlp:red" => Some("RED"),
                _ => None,
            });
        if let Some(tlp) = marking {
            metadata.insert("tlp".to_string(), DataValue::String(tlp.to_string()));
        }
    }

    metadata
}
/// Adds one attribute's indicators using ThreatDB-style metadata.
///
/// Starts from a copy of `base_metadata` and layers on `indicator_type`,
/// `category` (defaulting to `Other`), a `description` taken from the
/// attribute comment when the event supplied none, the attribute's tags
/// appended to any event tags, and a `tlp` marking when the event had none.
fn process_attribute_threatdb(
    &self,
    attr: &MispAttribute,
    base_metadata: &HashMap<String, DataValue>,
    builder: &mut DatabaseBuilder,
) -> Result<(), ParaglobError> {
    let mut metadata = base_metadata.clone();

    metadata.insert(
        "indicator_type".to_string(),
        DataValue::String(attr.attribute_type.clone()),
    );

    let category = attr
        .category
        .clone()
        .unwrap_or_else(|| "Other".to_string());
    metadata.insert("category".to_string(), DataValue::String(category));

    // The attribute comment only fills in a description that is still missing.
    match attr.comment.as_deref() {
        Some(comment) if !comment.is_empty() => {
            metadata
                .entry("description".to_string())
                .or_insert_with(|| DataValue::String(comment.to_string()));
        }
        _ => {}
    }

    if !attr.tags.is_empty() {
        let mut merged: Vec<DataValue> = match metadata.get("tags") {
            Some(DataValue::Array(existing)) => existing.clone(),
            _ => Vec::new(),
        };
        merged.extend(
            attr.tags
                .iter()
                .map(|tag| DataValue::String(tag.name.clone())),
        );

        // An event-level TLP marking takes precedence over attribute tags.
        if !metadata.contains_key("tlp") {
            let marking = attr
                .tags
                .iter()
                .find_map(|tag| match tag.name.to_lowercase().as_str() {
                    "tlp:white" | "tlp:clear" => Some("CLEAR"),
                    "tlp:green" => Some("GREEN"),
                    "tlp:amber" => Some("AMBER"),
                    "tlp:amber+strict" => Some("AMBER+STRICT"),
                    "tlp:red" => Some("RED"),
                    _ => None,
                });
            if let Some(tlp) = marking {
                metadata.insert("tlp".to_string(), DataValue::String(tlp.to_string()));
            }
        }

        metadata.insert("tags".to_string(), DataValue::Array(merged));
    }

    self.extract_indicators(&attr.attribute_type, &attr.value, metadata, builder)
}
fn process_event_minimal(
&self,
event: &MispEvent,
builder: &mut DatabaseBuilder,
) -> Result<(), ParaglobError> {
let mut event_metadata = HashMap::new();
if let Some(threat_level) = event.threat_level_id {
let threat_name = match threat_level {
1 => "High",
2 => "Medium",
3 => "Low",
_ => "Undefined",
};
event_metadata.insert(
"threat_level".to_string(),
DataValue::String(threat_name.to_string()),
);
}
if !event.tags.is_empty() {
let tag_names: Vec<String> = event.tags.iter().map(|t| t.name.clone()).collect();
event_metadata.insert("tags".to_string(), DataValue::String(tag_names.join(",")));
}
for attr in &event.attributes {
let mut metadata = event_metadata.clone();
metadata.insert(
"type".to_string(),
DataValue::String(attr.attribute_type.clone()),
);
if !attr.tags.is_empty() {
let tag_names: Vec<String> = attr.tags.iter().map(|t| t.name.clone()).collect();
let existing_tags = metadata
.get("tags")
.and_then(|v| {
if let DataValue::String(s) = v {
Some(s.as_str())
} else {
None
}
})
.unwrap_or("");
let combined = if existing_tags.is_empty() {
tag_names.join(",")
} else {
format!("{},{}", existing_tags, tag_names.join(","))
};
metadata.insert("tags".to_string(), DataValue::String(combined));
}
self.extract_indicators(&attr.attribute_type, &attr.value, metadata, builder)?;
}
for obj in &event.objects {
for attr in &obj.attributes {
let mut metadata = event_metadata.clone();
metadata.insert(
"type".to_string(),
DataValue::String(attr.attribute_type.clone()),
);
if !attr.tags.is_empty() {
let tag_names: Vec<String> = attr.tags.iter().map(|t| t.name.clone()).collect();
let existing_tags = metadata
.get("tags")
.and_then(|v| {
if let DataValue::String(s) = v {
Some(s.as_str())
} else {
None
}
})
.unwrap_or("");
let combined = if existing_tags.is_empty() {
tag_names.join(",")
} else {
format!("{},{}", existing_tags, tag_names.join(","))
};
metadata.insert("tags".to_string(), DataValue::String(combined));
}
self.extract_indicators(&attr.attribute_type, &attr.value, metadata, builder)?;
}
}
Ok(())
}
/// Adds all indicators of one event with the full metadata set.
///
/// Event-level attributes use the event metadata as-is. Attributes inside an
/// object additionally receive `object_type` and, when the object has a
/// comment, `object_comment`.
fn process_event(
    &self,
    event: &MispEvent,
    builder: &mut DatabaseBuilder,
) -> Result<(), ParaglobError> {
    let event_scope = self.build_event_metadata(event);

    for attribute in event.attributes.iter() {
        self.process_attribute(attribute, &event_scope, builder)?;
    }

    for object in event.objects.iter() {
        let mut object_scope = event_scope.clone();
        object_scope.insert(
            "object_type".to_string(),
            DataValue::String(object.name.clone()),
        );
        if let Some(comment) = object.comment.clone() {
            object_scope.insert("object_comment".to_string(), DataValue::String(comment));
        }

        for attribute in object.attributes.iter() {
            self.process_attribute(attribute, &object_scope, builder)?;
        }
    }

    Ok(())
}
/// Builds the event-level metadata map for the full export.
///
/// Each key is present only when the event supplies the data: `event_info`,
/// `event_uuid`, `threat_level`, `analysis`, `event_date`, `org_name` and a
/// comma-joined `tags` string.
fn build_event_metadata(&self, event: &MispEvent) -> HashMap<String, DataValue> {
    let mut metadata: HashMap<String, DataValue> = HashMap::new();
    // Every entry is a string value, so funnel them through one helper.
    let mut put = |key: &str, text: &str| {
        metadata.insert(key.to_string(), DataValue::String(text.to_string()));
    };

    if let Some(info) = event.info.as_deref() {
        put("event_info", info);
    }
    if let Some(uuid) = event.uuid.as_deref() {
        put("event_uuid", uuid);
    }
    if let Some(level) = event.threat_level_id {
        let label = match level {
            1 => "High",
            2 => "Medium",
            3 => "Low",
            _ => "Undefined",
        };
        put("threat_level", label);
    }
    if let Some(stage) = event.analysis {
        let label = match stage {
            0 => "Initial",
            1 => "Ongoing",
            2 => "Complete",
            _ => "Unknown",
        };
        put("analysis", label);
    }
    if let Some(date) = event.date.as_deref() {
        put("event_date", date);
    }
    if let Some(org_name) = event.orgc.as_ref().and_then(|org| org.name.as_deref()) {
        put("org_name", org_name);
    }
    if !event.tags.is_empty() {
        let joined = event
            .tags
            .iter()
            .map(|tag| tag.name.as_str())
            .collect::<Vec<&str>>()
            .join(",");
        put("tags", &joined);
    }

    metadata
}
/// Adds one attribute's indicators with the full metadata set.
///
/// Starts from a copy of `base_metadata` and layers on `type`, plus
/// `category`, `to_ids` and a non-empty `comment` when present. Attribute
/// tags are appended to any existing comma-joined `tags` string.
fn process_attribute(
    &self,
    attr: &MispAttribute,
    base_metadata: &HashMap<String, DataValue>,
    builder: &mut DatabaseBuilder,
) -> Result<(), ParaglobError> {
    let mut metadata = base_metadata.clone();

    metadata.insert(
        "type".to_string(),
        DataValue::String(attr.attribute_type.clone()),
    );
    if let Some(category) = attr.category.clone() {
        metadata.insert("category".to_string(), DataValue::String(category));
    }
    if let Some(flag) = attr.to_ids {
        metadata.insert("to_ids".to_string(), DataValue::Bool(flag));
    }
    match attr.comment.as_deref() {
        Some(comment) if !comment.is_empty() => {
            metadata.insert(
                "comment".to_string(),
                DataValue::String(comment.to_string()),
            );
        }
        _ => {}
    }

    if !attr.tags.is_empty() {
        let own = attr
            .tags
            .iter()
            .map(|tag| tag.name.as_str())
            .collect::<Vec<&str>>()
            .join(",");
        // Append to inherited tags when there are any, else stand alone.
        let combined = match metadata.get("tags") {
            Some(DataValue::String(existing)) if !existing.is_empty() => {
                format!("{},{}", existing, own)
            }
            _ => own,
        };
        metadata.insert("tags".to_string(), DataValue::String(combined));
    }

    self.extract_indicators(&attr.attribute_type, &attr.value, metadata, builder)
}
/// Adds the indicator(s) encoded by one attribute to `builder`.
///
/// `attr_type` selects the handling:
/// * IP types go to `add_ip`; `ip|port` types index only the IP half.
/// * Composite `a|b` types are split at the first `|` and both halves are
///   indexed; values without a `|` are silently ignored.
/// * URLs are indexed in full and, when a host can be extracted, by host.
/// * Listed scalar types are indexed verbatim with `add_literal`.
/// * Listed context-only types (comments, sizes, samples, ...) are ignored.
/// * Any other type is indexed verbatim when shorter than 1000 bytes.
///
/// Values that are empty after trimming are skipped for every type.
fn extract_indicators(
    &self,
    attr_type: &str,
    value: &str,
    metadata: HashMap<String, DataValue>,
    builder: &mut DatabaseBuilder,
) -> Result<(), ParaglobError> {
    if value.trim().is_empty() {
        return Ok(());
    }

    match attr_type {
        // Plain addresses and CIDR ranges.
        "ip-src" | "ip-dst" | "ip" | "ip-src/netmask" | "ip-dst/netmask" => {
            builder.add_ip(value, metadata)?;
        }
        // "address|port": only the address is indexed.
        "ip-src|port" | "ip-dst|port" => {
            if let Some((ip, _port)) = value.split_once('|') {
                builder.add_ip(ip, metadata)?;
            }
        }
        "domain|ip" => {
            if let Some((domain, ip)) = value.split_once('|') {
                builder.add_literal(domain, metadata.clone())?;
                builder.add_ip(ip, metadata)?;
            }
        }
        "url" | "uri" => {
            if let Some(domain) = self.extract_domain_from_url(value) {
                builder.add_literal(domain, metadata.clone())?;
            }
            builder.add_literal(value, metadata)?;
        }
        // "filename|hash": both halves are indexed.
        "filename|md5" | "filename|sha1" | "filename|sha256" | "filename|sha384"
        | "filename|sha512" | "filename|imphash" | "filename|ssdeep" | "filename|tlsh"
        | "filename|authentihash" | "filename|vhash" | "filename|pehash"
        | "filename|impfuzzy" => {
            if let Some((filename, hash)) = value.split_once('|') {
                builder.add_literal(filename, metadata.clone())?;
                builder.add_literal(hash, metadata)?;
            }
        }
        // Context-only types carry no matchable indicator.
        "comment" | "text" | "other" | "link" | "datetime" | "size-in-bytes" | "counter"
        | "float" | "hex" | "port" | "attachment" | "malware-sample" => {}
        // Scalar indicators stored verbatim, regardless of length.
        "domain" | "hostname"
        | "md5" | "sha1" | "sha224" | "sha256" | "sha384" | "sha512" | "sha512/224"
        | "sha512/256" | "sha3-224" | "sha3-256" | "sha3-384" | "sha3-512" | "ssdeep"
        | "imphash" | "tlsh" | "authentihash" | "vhash" | "cdhash" | "pehash" | "impfuzzy"
        | "telfhash"
        | "filename" | "filename-pattern"
        | "email" | "email-src" | "email-dst" | "email-reply-to"
        | "email-subject" | "email-body"
        | "user-agent" | "http-method"
        | "mac-address" | "mac-eui-64"
        | "AS"
        | "btc" | "xmr" | "dash"
        | "yara" | "snort" | "sigma" | "pattern-in-file" | "pattern-in-traffic"
        | "pattern-in-memory"
        | "mutex" | "named pipe" | "regkey" | "regkey|value" => {
            builder.add_literal(value, metadata)?;
        }
        // Unknown types: index only reasonably short values. The value is
        // already known to be non-empty from the guard at the top.
        _ => {
            if value.len() < 1000 {
                builder.add_literal(value, metadata)?;
            }
        }
    }

    Ok(())
}
/// Extracts the host portion of a URL-like string, borrowing from `url`.
///
/// Leading/trailing whitespace is ignored, an optional `scheme://` prefix is
/// removed, the remainder is cut at the first `/`, `?` or `#`, and a trailing
/// `:port` made of ASCII digits is dropped.
///
/// Returns `None` when nothing is left (for example `http:///path`).
/// Userinfo (`user@host`) and bracketed IPv6 literals are returned as found.
fn extract_domain_from_url<'a>(&self, url: &'a str) -> Option<&'a str> {
    // Characters that terminate the authority component (RFC 3986, 3.2).
    let ends_authority = |c: char| matches!(c, '/' | '?' | '#');

    let url = url.trim();

    // Treat "://" as the scheme separator only when nothing before it has
    // already ended the authority; otherwise it belongs to a path or query
    // (e.g. "example.com/r?u=http://x") and must not be stripped.
    let without_scheme = match url.split_once("://") {
        Some((scheme, rest)) if !scheme.contains(ends_authority) => rest,
        _ => url,
    };

    // Cut at whichever delimiter comes FIRST. Looking for '/' before '?' and
    // '#' would let a slash inside the query string leak the query into the
    // host (e.g. "evil.com?u=http://x/y").
    let authority = without_scheme
        .find(ends_authority)
        .map_or(without_scheme, |end| &without_scheme[..end]);

    // Drop a trailing ":port". An IPv6 literal such as "[::1]" is left alone
    // because the text after its last ':' is not purely digits.
    let host = match authority.rsplit_once(':') {
        Some((host, port)) if port.chars().all(|c| c.is_ascii_digit()) => host,
        _ => authority,
    };

    if host.is_empty() {
        None
    } else {
        Some(host)
    }
}
#[must_use]
pub fn stats(&self) -> ImportStats {
let mut stats = ImportStats::default();
for event in &self.events {
stats.total_events += 1;
stats.total_attributes += event.attributes.len();
for obj in &event.objects {
stats.total_objects += 1;
stats.total_attributes += obj.attributes.len();
}
}
stats
}
}
/// Summary counters describing what an importer has loaded.
///
/// All counters start at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct ImportStats {
    /// Number of events.
    pub total_events: usize,
    /// Number of attributes, counting both event-level and object-level ones.
    pub total_attributes: usize,
    /// Number of objects across all events.
    pub total_objects: usize,
}
// Unit tests for URL host extraction and basic MISP JSON parsing.
#[cfg(test)]
mod tests {
use super::*;
// Covers path, port, scheme-less and query-only URL shapes.
#[test]
fn test_extract_domain_from_url() {
// The helper never reads `events`, so an empty importer is sufficient.
let importer = MispImporter { events: vec![] };
assert_eq!(
importer.extract_domain_from_url("http://example.com/path"),
Some("example.com")
);
assert_eq!(
importer.extract_domain_from_url("https://test.org:8080/"),
Some("test.org")
);
assert_eq!(
importer.extract_domain_from_url("example.net"),
Some("example.net")
);
assert_eq!(
importer.extract_domain_from_url("http://evil.com?param=value"),
Some("evil.com")
);
}
// A minimal document with one attribute and an empty object list parses
// into exactly one event.
#[test]
fn test_parse_misp_json() {
let json = r#"{
"Event": {
"uuid": "test-uuid",
"info": "Test Event",
"threat_level_id": 2,
"Attribute": [
{
"type": "ip-src",
"value": "192.168.1.1"
}
],
"Object": []
}
}"#;
let importer = MispImporter::from_json(json).unwrap();
assert_eq!(importer.events.len(), 1);
assert_eq!(importer.events[0].attributes.len(), 1);
}
}