use std::collections::HashSet;
use std::sync::RwLock;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::error::{Error, Result};
use super::types::Document;
/// Category of an indicator of compromise (IOC) stored in an [`Ioc`].
///
/// Variants are (de)serialized by serde under their snake_case names
/// (for example `RegistryKey` <-> `"registry_key"`). The enum is
/// `#[non_exhaustive]`, so code outside this crate must include a wildcard
/// arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
#[serde(rename_all = "snake_case")]
pub enum IocKind {
/// CVE identifier, e.g. `CVE-2021-44228`.
Cve,
/// Dotted-quad IPv4 address.
Ipv4,
/// Colon-separated hexadecimal IPv6 address.
Ipv6,
/// MD5 digest (32 hexadecimal digits).
Md5,
/// SHA-1 digest (40 hexadecimal digits).
Sha1,
/// SHA-256 digest (64 hexadecimal digits).
Sha256,
/// DNS domain name.
Domain,
/// `http://` or `https://` URL.
Url,
/// Windows registry key path such as `HKLM\...`.
RegistryKey,
}
/// A single indicator: a category plus its textual value.
///
/// Equality and hashing are derived over both fields, so two indicators only
/// compare equal when the kind matches and the value strings are identical
/// byte for byte (no case folding is applied by this type itself).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Ioc {
/// Which category of indicator `value` represents.
pub kind: IocKind,
/// The indicator text exactly as stored by whoever built this value.
pub value: String,
}
impl Ioc {
    /// Builds an indicator of the given `kind`.
    ///
    /// `value` may be anything convertible into a `String`; it is stored
    /// as-is, without trimming or case normalization.
    pub fn new(kind: IocKind, value: impl Into<String>) -> Self {
        let value = value.into();
        Self { kind, value }
    }
}
/// Something that can pull indicators out of a [`Document`].
pub trait IocExtractor {
/// Returns the indicators found in `doc`.
///
/// Ordering and de-duplication of the returned vector are left to the
/// implementation.
fn extract(&self, doc: &Document) -> Vec<Ioc>;
}
/// A collection of indicators that can be queried for membership.
///
/// Implementors must be `Send + Sync`, and the lookup is exposed as a `Send`
/// future so that it can be awaited from async code.
pub trait IocBaseline: Send + Sync {
/// Returns a future resolving to whether `ioc` is part of the baseline.
///
/// The future yields `Err` when the implementation cannot perform the
/// lookup; the exact failure conditions are implementation-defined.
fn contains(&self, ioc: &Ioc) -> impl std::future::Future<Output = Result<bool>> + Send;
}
/// An indicator baseline held in process memory.
///
/// The indicators live in a `HashSet` guarded by a `std::sync::RwLock`, so
/// the set can be read and modified through a shared reference. `Default`
/// produces an empty baseline.
#[derive(Debug, Default)]
pub struct InMemoryIocBaseline {
// Set of known indicators; every access goes through the lock.
inner: RwLock<HashSet<Ioc>>,
}
impl InMemoryIocBaseline {
pub fn new() -> Self {
Self::default()
}
pub fn with_iocs<I>(self, iocs: I) -> Result<Self>
where
I: IntoIterator<Item = Ioc>,
{
self.extend(iocs)?;
Ok(self)
}
pub fn insert(&self, ioc: Ioc) -> Result<()> {
let mut guard = self
.inner
.write()
.map_err(|_| Error::Ingestion("ioc baseline lock poisoned".into()))?;
guard.insert(ioc);
Ok(())
}
pub fn extend<I>(&self, iocs: I) -> Result<()>
where
I: IntoIterator<Item = Ioc>,
{
let mut guard = self
.inner
.write()
.map_err(|_| Error::Ingestion("ioc baseline lock poisoned".into()))?;
guard.extend(iocs);
Ok(())
}
pub fn len(&self) -> Result<usize> {
let guard = self
.inner
.read()
.map_err(|_| Error::Ingestion("ioc baseline lock poisoned".into()))?;
Ok(guard.len())
}
pub fn is_empty(&self) -> Result<bool> {
Ok(self.len()? == 0)
}
}
impl IocBaseline for InMemoryIocBaseline {
fn contains(&self, ioc: &Ioc) -> impl std::future::Future<Output = Result<bool>> + Send {
let result = (|| -> Result<bool> {
let guard = self
.inner
.read()
.map_err(|_| Error::Ingestion("ioc baseline lock poisoned".into()))?;
Ok(guard.contains(ioc))
})();
async move { result }
}
}
/// An [`IocExtractor`] driven by regular expressions.
///
/// Holds one pre-compiled [`Regex`] per indicator category so that the
/// patterns are built once and reused for every scanned text.
#[derive(Debug)]
pub struct RegexIocExtractor {
// Matches CVE identifiers.
cve: Regex,
// Matches dotted-quad IPv4 addresses.
ipv4: Regex,
// Matches colon-separated hexadecimal IPv6 addresses.
ipv6: Regex,
// Matches 32-hex-digit strings (MD5).
md5: Regex,
// Matches 40-hex-digit strings (SHA-1).
sha1: Regex,
// Matches 64-hex-digit strings (SHA-256).
sha256: Regex,
// Matches dotted domain names ending in an alphabetic label.
domain: Regex,
// Matches http:// and https:// URLs.
url: Regex,
// Matches Windows registry paths that start with an HK* hive abbreviation.
reg_key: Regex,
}
impl RegexIocExtractor {
    /// Compiles every indicator pattern and assembles the extractor.
    ///
    /// # Errors
    /// Returns the error from `compile` for the first pattern that fails to
    /// build; patterns are compiled in field-declaration order.
    pub fn new() -> Result<Self> {
        let cve = compile(r"\bCVE-\d{4}-\d{4,7}\b")?;
        let ipv4 = compile(
            r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b",
        )?;
        let ipv6 = compile(r"\b(?:[A-Fa-f0-9]{1,4}:){2,7}[A-Fa-f0-9]{1,4}\b")?;
        let md5 = compile(r"\b[A-Fa-f0-9]{32}\b")?;
        let sha1 = compile(r"\b[A-Fa-f0-9]{40}\b")?;
        let sha256 = compile(r"\b[A-Fa-f0-9]{64}\b")?;
        let domain = compile(
            r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b",
        )?;
        let url = compile(r#"https?://[^\s<>\[\]\(\)\{\}\\"',]+"#)?;
        let reg_key = compile(r"\bHK(?:LM|CU|CR|U|CC)\\[^\s,;]+")?;

        Ok(Self {
            cve,
            ipv4,
            ipv6,
            md5,
            sha1,
            sha256,
            domain,
            url,
            reg_key,
        })
    }

    /// Appends every indicator found in `text` to `out`.
    ///
    /// Indicators are emitted grouped by category in this order: URLs, CVEs,
    /// IPv4, IPv6, SHA-256, SHA-1, MD5, domains, registry keys. CVE values
    /// are upper-cased, IPv4 addresses and registry keys are kept verbatim,
    /// and everything else is lower-cased. No de-duplication happens here.
    fn collect(&self, text: &str, out: &mut Vec<Ioc>) {
        // URL matches are kept around so that domains embedded in a URL can
        // be suppressed further down.
        let url_hits: Vec<_> = self.url.find_iter(text).collect();
        out.extend(
            url_hits
                .iter()
                .map(|hit| Ioc::new(IocKind::Url, hit.as_str().to_ascii_lowercase())),
        );

        out.extend(
            self.cve
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::Cve, hit.as_str().to_ascii_uppercase())),
        );
        out.extend(
            self.ipv4
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::Ipv4, hit.as_str().to_string())),
        );
        out.extend(
            self.ipv6
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::Ipv6, hit.as_str().to_ascii_lowercase())),
        );
        out.extend(
            self.sha256
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::Sha256, hit.as_str().to_ascii_lowercase())),
        );
        out.extend(
            self.sha1
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::Sha1, hit.as_str().to_ascii_lowercase())),
        );
        out.extend(
            self.md5
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::Md5, hit.as_str().to_ascii_lowercase())),
        );

        // A domain lying entirely within a URL match is already represented
        // by that URL, so only free-standing domains are reported.
        out.extend(
            self.domain
                .find_iter(text)
                .filter(|hit| {
                    !url_hits
                        .iter()
                        .any(|url| url.start() <= hit.start() && hit.end() <= url.end())
                })
                .map(|hit| Ioc::new(IocKind::Domain, hit.as_str().to_ascii_lowercase())),
        );

        out.extend(
            self.reg_key
                .find_iter(text)
                .map(|hit| Ioc::new(IocKind::RegistryKey, hit.as_str().to_string())),
        );
    }
}
impl IocExtractor for RegexIocExtractor {
    /// Extracts the distinct indicators contained in `doc`.
    ///
    /// The document-level text is scanned first, followed by the text of each
    /// section in order. Duplicates (same kind and same value) are dropped,
    /// keeping the first occurrence so the relative order of the survivors is
    /// stable.
    fn extract(&self, doc: &Document) -> Vec<Ioc> {
        let mut out = Vec::new();
        self.collect(&doc.text, &mut out);
        for section in &doc.sections {
            // Restored from the mis-encoded `§ion.text` (`&sect` had been
            // decoded as an HTML entity), which did not compile.
            self.collect(&section.text, &mut out);
        }

        // `HashSet::insert` returns `false` for values already seen, so
        // `retain` keeps only the first occurrence of each indicator.
        let mut seen: HashSet<Ioc> = HashSet::with_capacity(out.len());
        out.retain(|i| seen.insert(i.clone()));
        out
    }
}
fn compile(pattern: &str) -> Result<Regex> {
Regex::new(pattern).map_err(|err| {
Error::Ingestion(format!(
"RegexIocExtractor: pattern {pattern:?} failed to compile: {err}"
))
})
}