indicator_extractor/parser/
mod.rs

1//! Parser to extract indicators from a byte array.
2
3use bitcoin::{
4    is_valid_bitcoin_p2pkh_address, is_valid_bitcoin_p2sh_address, is_valid_bitcoin_p2wpkh_address,
5    is_valid_bitcoin_p2wsh_address, is_valid_litecoin_p2wpkh_address,
6};
7use helpers::{
8    bytes_to_string, dec_u8, defanged_colon, defanged_period, hex_u16, is_base58, is_bech32,
9    is_multispace, is_not_digit, is_not_hex_digit,
10};
11use nom::{
12    branch::alt,
13    bytes::complete::{is_not, tag, tag_no_case, take_till, take_while},
14    character::{
15        complete::{alphanumeric1, hex_digit1, multispace0, multispace1},
16        is_alphanumeric,
17    },
18    combinator::{complete, opt},
19    error::{make_error, ErrorKind},
20    multi::{many1, separated_list0, separated_list1},
21    sequence::preceded,
22    Err, IResult, Parser,
23};
24use std::{
25    net::{Ipv4Addr, Ipv6Addr},
26    sync::LazyLock,
27};
28
29mod bitcoin;
30mod helpers;
31
32static TLD_EXTRACTOR: LazyLock<tldextract::TldExtractor> =
33    LazyLock::new(|| tldextract::TldExtractor::new(Default::default()));
34
35/// Data representing a single indicator with a kind and a value.
36///
37/// If the value contained defanged data, the fangs will be removed. Meaning that `https[:]//github(.)com` will be represented as `Url("https://github.com")`.
38#[derive(Debug, PartialEq, PartialOrd, Ord, Eq, serde::Serialize)]
39#[serde(tag = "kind", content = "value", rename_all = "snake_case")]
40pub enum Indicator {
41    /// An URL starting with `http` or `https`
42    Url(String),
43    /// A domain name with a valid TLD (e.g., `github.com`) with validation using [tldextract](https://github.com/john-kurkowski/tldextract)
44    Domain(String),
45    /// A filename with very basic validation (e.g. if it has a `.`), but no extension validation. It's not guaranteed to be a valid filename and is mostly a catch-all if it wasn't able to match any other indicator.
46    File(String),
47    /// An email address, e.g. `benoit@jeaurond.dev`
48    Email(String),
49    /// An IPv4 address, e.g. `127.0.0.1`
50    Ipv4(Ipv4Addr),
51    /// An IPv6 address, e.g. `2001:0db8:85a3:0000:0000:8a2e:0370:7334`
52    Ipv6(Ipv6Addr),
53    /// A case-insentive SHA512 hash, e.g. `f1d9d8f153ec808a44cd63fb85f7cb811845b1596e46e78febd8c8b505a9b7d3a242c98b2b51261e5402f37334beefd7ba4066873a6dc56cd030cf29f4aef6dc`
54    Sha512(String),
55    /// A case-insentive SHA256 hash, e.g. `e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
56    Sha256(String),
57    /// A case-insentive SHA1 hash, e.g. `da39a3ee5e6b4b0d3255bfef95601890afd80709`
58    Sha1(String),
59    /// A case-insentive MD5 hash, e.g. `d41d8cd98f00b204e9800998ecf8427e`
60    Md5(String),
61    /// A Bitcoin [P2PKH](https://learnmeabitcoin.com/technical/script/p2pkh/) address, e.g. `1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa`
62    BitcoinP2pkhAddress(String),
63    /// A Bitcoin [P2SH](https://learnmeabitcoin.com/technical/script/p2sh/) address, e.g. `32jmM9eev8E7CGCAWLSHQnqgHBifcHzgQf`
64    BitcoinP2shAddress(String),
65    /// A Bitcoin [P2WPKH](https://learnmeabitcoin.com/technical/script/p2wpkh/) address, e.g. `bc1p4w46h2at4w46h2at4w46h2at4w46h2at5kreae`
66    BitcoinP2wpkhAddress(String),
67    /// A Bitcoin [P2WSH](https://learnmeabitcoin.com/technical/script/p2wsh/) address, e.g. `bc1qrp33g0q5c5txsp9arysrx4k6zdkfs4nce4xj0gdcccefvpysxf3qccfmv3`
68    BitcoinP2wshAddress(String),
69    LitecoinP2pkhAddress(String),
70    /// A Litecoin P2WPKH address, e.g. `ltc1q8c6fshw2dlwun7ekn9qwf37cu2rn755u9ym7p0`
71    LitecoinP2wpkhAddress(String),
72}
73
74/// Extracts and validates indicators from a byte array using nom combinators and functions returning a vector of sorted deduplicated indicators.
75///
76/// This shouldn't error if it can't extract any indicators.
77pub fn extract_indicators(input: &[u8]) -> IResult<&[u8], Vec<Indicator>> {
78    let (input, _) = multispace0(input)?;
79    let (input, indicator) = complete(separated_list0(
80        opt(is_not(" \t\r\n")).and(multispace1),
81        opt(extract_indicator),
82    ))(input)?;
83
84    let mut indicators = indicator.into_iter().flatten().collect::<Vec<Indicator>>();
85    indicators.sort();
86    indicators.dedup();
87
88    Ok((input, indicators))
89}
90
91/// Extracts and validates a single indicator from a byte array using nom combinators and functions returning an indicator.
92///
93/// This will error if its not able to extract an indicator.
94pub fn extract_indicator(input: &[u8]) -> IResult<&[u8], Indicator> {
95    alt((
96        extract_url,
97        extract_email,
98        extract_ipv4,
99        extract_ipv6,
100        extract_hash,
101        extract_domain,
102        extract_bitcoin_p2pkh_address,
103        extract_bitcoin_p2sh_address,
104        extract_bitcoin_p2wpkh_address,
105        extract_bitcoin_p2wsh_address,
106        extract_litecoin_p2wpkh_address,
107    ))(input)
108}
109
110fn extract_url(input: &[u8]) -> IResult<&[u8], Indicator> {
111    let (input, scheme) = alt((tag_no_case("https"), tag_no_case("http")))(input)?;
112    let (input, _) = defanged_colon(input)?;
113    let (input, _) = tag("//")(input)?;
114    let (input, host) = separated_list1(defanged_period, alt((alphanumeric1, tag("-"))))(input)?;
115    let (input, rest) = take_till(is_multispace)(input)?;
116
117    Ok((
118        input,
119        Indicator::Url(format!(
120            "{}://{}{}",
121            std::str::from_utf8(scheme).unwrap(),
122            host.into_iter()
123                .map(|s| std::str::from_utf8(s).unwrap())
124                .collect::<Vec<&str>>()
125                .join("."),
126            std::str::from_utf8(rest).unwrap()
127        )),
128    ))
129}
130
131fn extract_domain(input: &[u8]) -> IResult<&[u8], Indicator> {
132    let (input, data) = separated_list1(defanged_period, alt((alphanumeric1, tag("-"))))(input)?;
133
134    if data.len() < 2 {
135        return Err(Err::Error(make_error(input, ErrorKind::Verify)));
136    }
137
138    let potential_domain = data
139        .into_iter()
140        .map(|s| std::str::from_utf8(s).unwrap())
141        .collect::<Vec<&str>>()
142        .join(".");
143
144    let Ok(tld) = TLD_EXTRACTOR.extract(&potential_domain) else {
145        return Ok((input, Indicator::File(potential_domain)));
146    };
147
148    if tld.domain.is_some() && tld.suffix.is_some() {
149        return Ok((
150            input,
151            Indicator::Domain(format!(
152                "{}{}.{}",
153                if let Some(subdomain) = tld.subdomain.as_ref() {
154                    format!("{}.", subdomain)
155                } else {
156                    "".to_string()
157                },
158                tld.domain.unwrap(),
159                tld.suffix.unwrap()
160            )),
161        ));
162    }
163
164    Ok((input, Indicator::File(potential_domain)))
165}
166
167fn extract_email(input: &[u8]) -> IResult<&[u8], Indicator> {
168    let (input, _) = opt(take_while(|c| {
169        c != b'.' && c != b'-' && c != b'_' && c != b'+' && !is_alphanumeric(c) && !is_multispace(c)
170    }))(input)?;
171
172    let (input, email) =
173        many1(alt((alphanumeric1, tag("."), tag("-"), tag("_"), tag("+"))))(input)?;
174    let (input, _) = tag("@")(input)?;
175    let (input, first_part) = many1(alt((alphanumeric1, tag("-"))))(input)?;
176    let (input, domain) = preceded(defanged_period, many1(alt((alphanumeric1, tag("-")))))(input)?;
177
178    Ok((
179        input,
180        Indicator::Email(format!(
181            "{}@{}.{}",
182            std::str::from_utf8(&email.concat()).unwrap(),
183            std::str::from_utf8(&first_part.concat()).unwrap(),
184            std::str::from_utf8(&domain.concat()).unwrap()
185        )),
186    ))
187}
188
189fn extract_ipv4(input: &[u8]) -> IResult<&[u8], Indicator> {
190    let (input, _) = opt(take_while(|c| is_not_digit(c) && !is_multispace(c)))(input)?;
191
192    let (input, octects) = separated_list1(defanged_period, dec_u8)(input)?;
193    if octects.len() != 4 {
194        return Err(Err::Error(make_error(input, ErrorKind::Verify)));
195    }
196
197    let ipv4_addr = Ipv4Addr::new(octects[0], octects[1], octects[2], octects[3]);
198    Ok((input, Indicator::Ipv4(ipv4_addr)))
199}
200
201fn extract_ipv6(input: &[u8]) -> IResult<&[u8], Indicator> {
202    let (input, _) = opt(take_while(|c| {
203        c != b':' && !is_alphanumeric(c) && !is_multispace(c)
204    }))(input)?;
205
206    let (input, hexes) = separated_list1(defanged_colon, hex_u16)(input)?;
207    if hexes.len() != 8 {
208        return Err(Err::Error(make_error(input, ErrorKind::Verify)));
209    }
210
211    let ipv6_addr = Ipv6Addr::new(
212        hexes[0], hexes[1], hexes[2], hexes[3], hexes[4], hexes[5], hexes[6], hexes[7],
213    );
214    Ok((input, Indicator::Ipv6(ipv6_addr)))
215}
216
217fn extract_hash(input: &[u8]) -> IResult<&[u8], Indicator> {
218    let (input, _) = opt(take_while(|c| is_not_hex_digit(c) && !is_multispace(c)))(input)?;
219
220    let (input, hash) = hex_digit1(input)?;
221
222    match hash.len() {
223        32 => Ok((input, Indicator::Md5(bytes_to_string(hash)))),
224        40 => Ok((input, Indicator::Sha1(bytes_to_string(hash)))),
225        64 => Ok((input, Indicator::Sha256(bytes_to_string(hash)))),
226        128 => Ok((input, Indicator::Sha512(bytes_to_string(hash)))),
227        _ => Err(Err::Error(make_error(input, ErrorKind::Verify))),
228    }
229}
230
231fn extract_bitcoin_p2pkh_address(input: &[u8]) -> IResult<&[u8], Indicator> {
232    let (input, _) = opt(take_while(|c| c != b'1' && !is_multispace(c)))(input)?;
233
234    let (input, prefix) = tag("1")(input)?;
235    let (input, address) = is_base58(input)?;
236
237    if is_valid_bitcoin_p2pkh_address(address) {
238        Ok((
239            input,
240            Indicator::BitcoinP2pkhAddress(bytes_to_string(&[prefix, address].concat())),
241        ))
242    } else {
243        Err(Err::Error(make_error(input, ErrorKind::Verify)))
244    }
245}
246
247fn extract_bitcoin_p2sh_address(input: &[u8]) -> IResult<&[u8], Indicator> {
248    let (input, _) = opt(take_while(|c| c != b'3' && !is_multispace(c)))(input)?;
249
250    let (input, prefix) = tag("3")(input)?;
251    let (input, address) = is_base58(input)?;
252
253    if is_valid_bitcoin_p2sh_address(address) {
254        Ok((
255            input,
256            Indicator::BitcoinP2shAddress(bytes_to_string(&[prefix, address].concat())),
257        ))
258    } else {
259        Err(Err::Error(make_error(input, ErrorKind::Verify)))
260    }
261}
262
263fn extract_bitcoin_p2wpkh_address(input: &[u8]) -> IResult<&[u8], Indicator> {
264    let (input, _) = opt(take_while(|c| c != b'b' && c != b't' && !is_multispace(c)))(input)?;
265
266    let (input, prefix) = alt((tag_no_case("bc1"), tag_no_case("tb1")))(input)?;
267    let (input, address) = is_bech32(input)?;
268
269    let address = &[prefix, address].concat();
270
271    if is_valid_bitcoin_p2wpkh_address(address) {
272        Ok((
273            input,
274            Indicator::BitcoinP2wpkhAddress(bytes_to_string(address)),
275        ))
276    } else {
277        Err(Err::Error(make_error(input, ErrorKind::Verify)))
278    }
279}
280
281fn extract_bitcoin_p2wsh_address(input: &[u8]) -> IResult<&[u8], Indicator> {
282    let (input, _) = opt(take_while(|c| c != b'b' && c != b't' && !is_multispace(c)))(input)?;
283
284    let (input, prefix) = alt((tag_no_case("bc1"), tag_no_case("tb1")))(input)?;
285    let (input, address) = is_bech32(input)?;
286
287    let address = &[prefix, address].concat();
288
289    if is_valid_bitcoin_p2wsh_address(address) {
290        Ok((
291            input,
292            Indicator::BitcoinP2wshAddress(bytes_to_string(address)),
293        ))
294    } else {
295        Err(Err::Error(make_error(input, ErrorKind::Verify)))
296    }
297}
298
299fn extract_litecoin_p2wpkh_address(input: &[u8]) -> IResult<&[u8], Indicator> {
300    let (input, _) = opt(take_while(|c| c != b'l' && !is_multispace(c)))(input)?;
301
302    let (input, prefix) = tag("ltc1")(input)?;
303    let (input, address) = is_bech32(input)?;
304
305    let address = &[prefix, address].concat();
306
307    if is_valid_litecoin_p2wpkh_address(address) {
308        Ok((
309            input,
310            Indicator::LitecoinP2wpkhAddress(bytes_to_string(address)),
311        ))
312    } else {
313        Err(Err::Error(make_error(input, ErrorKind::Verify)))
314    }
315}
316
#[cfg(test)]
mod tests {
    use super::*;

    // Reference digests shared by the "single" and "labelled" hash tests.
    const MD5_HEX: &str = "d41d8cd98f00b204e9800998ecf8427e";
    const SHA1_HEX: &str = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
    const SHA256_HEX: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
    const SHA512_HEX: &str = "f1d9d8f153ec808a44cd63fb85f7cb811845b1596e46e78febd8c8b505a9b7d3a242c98b2b51261e5402f37334beefd7ba4066873a6dc56cd030cf29f4aef6dc";

    /// Asserts that `extract_indicator` consumes all of `input` and yields `expected`.
    #[track_caller]
    fn assert_single(input: &str, expected: Indicator) {
        assert_eq!(
            extract_indicator(input.as_bytes()),
            Ok((&b""[..], expected))
        );
    }

    /// Asserts that `extract_indicators` consumes all of `input` and yields exactly `expected`.
    #[track_caller]
    fn assert_all(input: &str, expected: Vec<Indicator>) {
        assert_eq!(
            extract_indicators(input.as_bytes()),
            Ok((&b""[..], expected))
        );
    }

    #[test]
    fn test_extract_single_litecoin_p2wpkh_address() {
        let address = "ltc1q8c6fshw2dlwun7ekn9qwf37cu2rn755u9ym7p0";
        assert_single(
            address,
            Indicator::LitecoinP2wpkhAddress(address.to_string()),
        );
    }

    #[test]
    fn test_extract_single_bitcoin_pubkey() {
        let address = "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa";
        assert_single(address, Indicator::BitcoinP2pkhAddress(address.to_string()));
    }

    #[test]
    fn test_extract_single_url() {
        let url = "http://www.example.com/foo/bar";
        assert_single(url, Indicator::Url(url.to_string()));
    }

    #[test]
    fn test_extract_single_ipv4_with_garbage() {
        assert_single(
            "asdf127.0.0.1",
            Indicator::Ipv4(Ipv4Addr::new(127, 0, 0, 1)),
        );
    }

    #[test]
    fn test_extract_single_ipv4() {
        assert_single("127.0.0.1", Indicator::Ipv4(Ipv4Addr::new(127, 0, 0, 1)));
    }

    #[test]
    fn test_extract_single_partially_defanged_ipv4() {
        assert_single(
            "127[.]0.0[.]1",
            Indicator::Ipv4(Ipv4Addr::new(127, 0, 0, 1)),
        );
    }

    #[test]
    fn test_extract_single_fully_defanged_ipv4() {
        assert_single(
            "127[.]0[.]0[.]1",
            Indicator::Ipv4(Ipv4Addr::new(127, 0, 0, 1)),
        );
    }

    #[test]
    fn test_extract_single_partially_defanged_ipv6() {
        assert_single(
            "2001:0db8[:]85a3[:]0000:0000[:]8a2e:0370:7334",
            Indicator::Ipv6(Ipv6Addr::new(
                0x2001, 0x0db8, 0x85a3, 0x0000, 0x0000, 0x8a2e, 0x0370, 0x7334,
            )),
        );
    }

    #[test]
    fn test_extract_md5() {
        let input = format!("MD5 hash: {MD5_HEX}");
        assert_all(&input, vec![Indicator::Md5(MD5_HEX.to_string())]);
    }

    #[test]
    fn test_extract_single_md5() {
        assert_single(MD5_HEX, Indicator::Md5(MD5_HEX.to_string()));
    }

    #[test]
    fn test_extract_sha1() {
        let input = format!("SHA1 hash: {SHA1_HEX}");
        assert_all(&input, vec![Indicator::Sha1(SHA1_HEX.to_string())]);
    }

    #[test]
    fn test_extract_single_sha1() {
        assert_single(SHA1_HEX, Indicator::Sha1(SHA1_HEX.to_string()));
    }

    #[test]
    fn test_extract_sha256() {
        let input = format!("SHA256 hash: {SHA256_HEX}");
        assert_all(&input, vec![Indicator::Sha256(SHA256_HEX.to_string())]);
    }

    #[test]
    fn test_extract_single_sha256() {
        assert_single(SHA256_HEX, Indicator::Sha256(SHA256_HEX.to_string()));
    }

    #[test]
    fn test_extract_sha512() {
        let input = format!("SHA512 hash: {SHA512_HEX}");
        assert_all(&input, vec![Indicator::Sha512(SHA512_HEX.to_string())]);
    }

    #[test]
    fn test_extract_single_sha512() {
        assert_single(SHA512_HEX, Indicator::Sha512(SHA512_HEX.to_string()));
    }

    #[test]
    fn test_multiple_indicators() {
        let input = r#"    Domain: AM6P194CA0000.outlook.office.com
    Domain: AMS0EPF000000A0.eurprd01.prod.outlook.com
    Domain: me512.com
    File: 1.0
    File: 15.21.7897.01
    File: 15.26.7918.123
    File: AA6P194CA0000.EURP001.PROD.OUTLOOK.COM
    File: CC3PR84AB3445.LAPE210.PROD.OUTLOOK.COM
    Email: 8ab3fa386978525c7fd59cb135f0fbc598c8@outlook.com
    Email: ALLOW@OUTLOOK.COM
    Email: allow@outlook.com
    Ipv4: 10.167.20.233
    Ipv4: 96.21.95.53
    BitcoinP2pkhAddress: 15N6Q12yFN3xa8ChqXDWWGgZPYcZdoTyRa
    LitecoinP2wpkhAddress: ltc1q8c6fshw2dlwun7ekn9qwf37cu2rn755u9ym7p0"#;

        let expected = [
            Indicator::Domain("AM6P194CA0000.outlook.office.com".to_string()),
            Indicator::Domain("AMS0EPF000000A0.eurprd01.prod.outlook.com".to_string()),
            Indicator::Domain("me512.com".to_string()),
            Indicator::File("1.0".to_string()),
            Indicator::File("15.21.7897.01".to_string()),
            Indicator::File("15.26.7918.123".to_string()),
            Indicator::File("AA6P194CA0000.EURP001.PROD.OUTLOOK.COM".to_string()),
            Indicator::File("CC3PR84AB3445.LAPE210.PROD.OUTLOOK.COM".to_string()),
            Indicator::Email("8ab3fa386978525c7fd59cb135f0fbc598c8@outlook.com".to_string()),
            Indicator::Email("ALLOW@OUTLOOK.COM".to_string()),
            Indicator::Email("allow@outlook.com".to_string()),
            Indicator::Ipv4(Ipv4Addr::new(10, 167, 20, 233)),
            Indicator::Ipv4(Ipv4Addr::new(96, 21, 95, 53)),
            Indicator::BitcoinP2pkhAddress("15N6Q12yFN3xa8ChqXDWWGgZPYcZdoTyRa".to_string()),
            Indicator::LitecoinP2wpkhAddress(
                "ltc1q8c6fshw2dlwun7ekn9qwf37cu2rn755u9ym7p0".to_string(),
            ),
        ];

        let (remaining, found) = extract_indicators(input.as_bytes()).unwrap();

        // Only membership is checked: the extractor may report additional indicators.
        for indicator in &expected {
            assert!(
                found.contains(indicator),
                "indicator {indicator:?} doesn't match"
            );
        }

        assert_eq!(remaining.len(), 0);
    }
}