use std::io;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use std::ops::Range;
use std::sync::OnceLock;
use regex_automata::dfa::dense::DFA;
use regex_automata::dfa::Automaton;
use regex_automata::Input;
mod tag;
pub use tag::{Tag, Tagged, TextData};
/// Address family of a matched IP address.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IpKind {
/// The match is an IPv4 address.
V4,
/// The match is an IPv6 address.
V6,
}
/// A single IP address match that borrows its text from the searched haystack.
#[derive(Debug, Clone)]
pub struct IpMatch<'a> {
// The matched bytes exactly as they appear in the haystack; they may still
// contain `[` / `]` characters (see `as_str`, which strips them).
bytes: &'a [u8],
// Byte offsets of `bytes` within the haystack.
range: Range<usize>,
// Address family of the match.
kind: IpKind,
}
impl<'a> IpMatch<'a> {
    /// Returns the matched bytes exactly as they appear in the haystack.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bytes
    }

    /// Returns the match as text with every `[` and `]` removed.
    ///
    /// Borrows from the haystack when the match contains no `[`; otherwise a
    /// cleaned copy is allocated.
    pub fn as_str(&self) -> std::borrow::Cow<'a, str> {
        use std::borrow::Cow;

        match memchr::memchr(b'[', self.bytes) {
            None => {
                // SAFETY (assumed): relies on `bytes` being ASCII, which holds for
                // matches that passed validation (digits, hex letters, `.`, `:`).
                // NOTE(review): confirm no other code constructs `IpMatch`.
                let text = unsafe { std::str::from_utf8_unchecked(self.bytes) };
                Cow::Borrowed(text)
            }
            Some(_) => {
                let cleaned = strip_brackets(self.bytes);
                // SAFETY (assumed): removing ASCII brackets from ASCII input keeps
                // it ASCII; same invariant as the borrowed branch.
                let text = unsafe { String::from_utf8_unchecked(cleaned) };
                Cow::Owned(text)
            }
        }
    }

    /// Returns the match verbatim as text, including any bracket characters.
    #[inline]
    pub fn as_matched_str(&self) -> &'a str {
        // SAFETY (assumed): same ASCII invariant on `bytes` as in `as_str`.
        unsafe { std::str::from_utf8_unchecked(self.bytes) }
    }

    /// Returns the byte range of the match within the haystack.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.range.clone()
    }

    /// Returns the address family of the match.
    #[inline]
    pub fn kind(&self) -> IpKind {
        self.kind
    }

    /// Parses the bracket-stripped text into an [`IpAddr`] of the recorded kind.
    ///
    /// # Panics
    ///
    /// Panics if the text does not parse as an address of `self.kind()`.
    pub fn ip(&self) -> IpAddr {
        let text = self.as_str();
        let parsed = match self.kind {
            IpKind::V4 => text.parse::<Ipv4Addr>().map(IpAddr::V4),
            IpKind::V6 => text.parse::<Ipv6Addr>().map(IpAddr::V6),
        };
        parsed.expect("validated by DFA")
    }
}
/// Wrapper that forces its contents to be aligned to 4 bytes.
// NOTE(review): presumably required so `DFA::from_bytes` can view the embedded
// bytes as `u32` words (`DFA<&[u32]>`) — confirm against the regex-automata docs.
#[repr(C, align(4))]
struct AlignedDfa<T: ?Sized>(T);
// Serialized DFAs embedded at compile time from files under `OUT_DIR`
// (`load_dfa` expects them to have been written by build.rs). Each one is
// wrapped in `AlignedDfa` so the byte slice is 4-byte aligned.

// Bytes of `ipv4.dfa`.
static IPV4_DFA_BYTES: &AlignedDfa<[u8]> =
&AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv4.dfa")));
// Bytes of `ipv6.dfa`.
static IPV6_DFA_BYTES: &AlignedDfa<[u8]> =
&AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv6.dfa")));
// Bytes of `both.dfa`.
static BOTH_DFA_BYTES: &AlignedDfa<[u8]> =
&AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/both.dfa")));
// Lazily initialised, process-wide DFA instances; each cell is filled on first
// use by the matching `get_*_dfa` accessor.
static DFA_IPV4: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
static DFA_IPV6: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
static DFA_BOTH: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
/// Deserializes a dense DFA directly from an embedded, aligned byte blob.
///
/// The second tuple element returned by `DFA::from_bytes` is discarded.
///
/// # Panics
///
/// Panics if the bytes are not a valid serialized DFA.
fn load_dfa(aligned: &'static AlignedDfa<[u8]>) -> DFA<&'static [u32]> {
    DFA::from_bytes(&aligned.0)
        .expect("valid dfa from build.rs")
        .0
}
/// Returns the shared IPv4-only DFA, deserializing it on first use.
fn get_ipv4_dfa() -> &'static DFA<&'static [u32]> {
    let init = || load_dfa(IPV4_DFA_BYTES);
    DFA_IPV4.get_or_init(init)
}
/// Returns the shared IPv6-only DFA, deserializing it on first use.
fn get_ipv6_dfa() -> &'static DFA<&'static [u32]> {
    let init = || load_dfa(IPV6_DFA_BYTES);
    DFA_IPV6.get_or_init(init)
}
/// Returns the shared combined DFA (built from `both.dfa`), deserializing it on first use.
fn get_both_dfa() -> &'static DFA<&'static [u32]> {
    let init = || load_dfa(BOTH_DFA_BYTES);
    DFA_BOTH.get_or_init(init)
}
/// Post-match validation rules applied to a candidate, one variant per address family.
#[derive(Clone, Debug)]
enum ValidatorType {
// IPv4 candidates, checked by `validate_ipv4`.
IPv4 {
// When false, private addresses are rejected.
include_private: bool,
// When false, loopback addresses are rejected.
include_loopback: bool,
// When false, broadcast and link-local addresses are rejected.
include_broadcast: bool,
},
// IPv6 candidates, checked by `validate_ipv6`.
IPv6 {
// When false, unicast link-local and unique-local addresses are rejected.
include_private: bool,
// When false, the loopback address is rejected.
include_loopback: bool,
},
}
impl ValidatorType {
    /// Runs the family-specific validator on `bytes` with this variant's filter flags.
    ///
    /// Returns `true` when the bytes are an acceptable address of this family.
    #[inline(always)]
    fn validate(&self, bytes: &[u8]) -> bool {
        match self {
            Self::IPv4 {
                include_private,
                include_loopback,
                include_broadcast,
            } => validate_ipv4(
                bytes,
                *include_private,
                *include_loopback,
                *include_broadcast,
            ),
            Self::IPv6 {
                include_private,
                include_loopback,
            } => validate_ipv6(bytes, *include_private, *include_loopback),
        }
    }

    /// Returns the public [`IpKind`] corresponding to this validator.
    #[inline(always)]
    fn kind(&self) -> IpKind {
        match *self {
            Self::IPv4 { .. } => IpKind::V4,
            Self::IPv6 { .. } => IpKind::V6,
        }
    }
}
/// Finds IP addresses in byte haystacks using a precompiled DFA plus per-family validation.
///
/// Constructed through [`ExtractorBuilder::build`].
pub struct Extractor {
// DFA used for the forward search.
dfa: &'static DFA<&'static [u32]>,
// Validators indexed by the pattern id reported by the DFA match.
validators: [ValidatorType; 2],
}
impl Extractor {
/// Returns an iterator over the byte ranges of every validated IP address in `haystack`.
///
/// Each item is the `range()` of the corresponding item of [`Extractor::match_iter`].
#[inline]
pub fn find_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = Range<usize>> + 'a {
self.match_iter(haystack).map(|m| m.range())
}
/// Returns an iterator over every validated IP address match in `haystack`.
///
/// The DFA forward search only yields the end offset of a match, so the start is
/// recovered by scanning backwards, and each candidate is then checked against a
/// right-boundary rule and the validator selected by the match's pattern id.
/// Iteration ends when the search reports no match or an error.
#[inline]
pub fn match_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = IpMatch<'a>> + 'a {
let mut input = Input::new(haystack);
std::iter::from_fn(move || loop {
// A search error is treated the same as "no more matches".
let Ok(Some(m)) = self.dfa.try_search_fwd(&input) else {
return None;
};
let end = m.offset();
let pid = m.pattern().as_usize();
let validator = &self.validators[pid];
// Advance the search window now so that every `continue` below resumes
// after this match end.
input.set_start(end);
// Walk backwards from `end` (at most 55 bytes) to the position nearest to
// `end` that is either index 0 or preceded by a byte that is not an
// IP/bracket character; fall back to the window floor if none is found.
// NOTE(review): 55 is presumably the longest bracketed IPv6 form — confirm.
let floor = end.saturating_sub(55); let raw_start = (floor..end)
.rev()
.find(|&i| i == 0 || !is_ip_or_bracket_char(haystack[i - 1]))
.unwrap_or(floor);
// Start adjustment:
// * a leading `[` that is not followed by `.` or `:` is skipped;
// * otherwise, when the second byte is `:` and the byte just before
//   `raw_start` is an ASCII letter outside a-f/A-F, two bytes are skipped
//   (NOTE(review): presumably drops the hex-looking tail of a preceding
//   word plus its colon — confirm).
let start = if raw_start < end
&& haystack[raw_start] == b'['
&& raw_start + 1 < end
&& haystack[raw_start + 1] != b'.'
&& haystack[raw_start + 1] != b':'
{
raw_start + 1
} else if raw_start > 0 && raw_start + 1 < end && haystack[raw_start + 1] == b':' && {
let prev = haystack[raw_start - 1];
prev.is_ascii_alphabetic() && !matches!(prev, b'a'..=b'f' | b'A'..=b'F')
} {
raw_start + 2
} else {
raw_start
};
// Right boundary: a match that ends at the end of the haystack is always
// accepted; otherwise the byte after the match must not continue the address.
let valid_right_boundary = match end.cmp(&haystack.len()) {
std::cmp::Ordering::Less => {
let next = haystack[end];
match validator {
// IPv4: reject when followed by a digit, or by `.` and then a digit.
ValidatorType::IPv4 { .. } => {
!(next.is_ascii_digit()
|| next == b'.'
&& end + 1 < haystack.len()
&& haystack[end + 1].is_ascii_digit())
}
// IPv6: reject when followed by a hex digit, `.`, `:` or `[`.
ValidatorType::IPv6 { .. } => {
!matches!(next, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F'
| b'.' | b':' | b'[')
}
}
}
_ => true,
};
if !valid_right_boundary {
continue;
}
let candidate = &haystack[start..end];
// Candidates containing `[` are validated on a bracket-stripped copy, but
// the returned match still refers to the original bytes and range.
if memchr::memchr(b'[', candidate).is_some() {
let cleaned = strip_brackets(candidate);
if validator.validate(&cleaned) {
return Some(IpMatch {
bytes: candidate,
range: start..end,
kind: validator.kind(),
});
}
} else if validator.validate(candidate) {
return Some(IpMatch {
bytes: candidate,
range: start..end,
kind: validator.kind(),
});
}
// Validation failed: loop again and search from `end`.
})
}
/// Copies `haystack` to `wtr`, letting `replacer` write the replacement for each match.
///
/// Bytes between matches are written unchanged. `replacer` receives the match and
/// the writer and is responsible for emitting whatever should stand in for it.
///
/// Returns the number of matches processed.
///
/// # Errors
///
/// Returns the first I/O error produced by `wtr` or by `replacer`.
pub fn replace_iter<W, F>(
&self,
haystack: &[u8],
wtr: &mut W,
mut replacer: F,
) -> io::Result<usize>
where
W: io::Write,
F: FnMut(&IpMatch, &mut W) -> io::Result<()>,
{
// `last` is the end of the previous match, i.e. the start of the next
// unmatched run that must be copied through.
let mut last = 0;
let mut count = 0;
for m in self.match_iter(haystack) {
let range = m.range();
wtr.write_all(&haystack[last..range.start])?;
replacer(&m, wtr)?;
last = range.end;
count += 1;
}
// Flush the tail after the final match (or the whole haystack if none).
wtr.write_all(&haystack[last..])?;
Ok(count)
}
}
/// Returns `true` for bytes that may appear inside a (possibly bracketed) IP
/// address: ASCII hex digits, `.`, `:`, `[` and `]`.
#[inline(always)]
fn is_ip_or_bracket_char(b: u8) -> bool {
    b.is_ascii_hexdigit() || matches!(b, b'.' | b':' | b'[' | b']')
}
/// Returns a copy of `bytes` with every `[` and `]` removed.
fn strip_brackets(bytes: &[u8]) -> Vec<u8> {
    // Reserve for the worst case (no brackets) so the copy never reallocates.
    let mut cleaned = Vec::with_capacity(bytes.len());
    cleaned.extend(
        bytes
            .iter()
            .copied()
            .filter(|byte| !matches!(byte, b'[' | b']')),
    );
    cleaned
}
/// Builder that configures which address families and address classes an
/// [`Extractor`] reports.
pub struct ExtractorBuilder {
// Whether IPv4 addresses are searched for.
include_ipv4: bool,
// Whether IPv6 addresses are searched for.
include_ipv6: bool,
// Whether private addresses are kept (passed to both validators).
include_private: bool,
// Whether loopback addresses are kept (passed to both validators).
include_loopback: bool,
// Whether broadcast addresses are kept (passed to the IPv4 validator only).
include_broadcast: bool,
}
impl Default for ExtractorBuilder {
fn default() -> Self {
Self::new()
}
}
impl ExtractorBuilder {
    /// Creates a builder with every address family and address class enabled.
    #[must_use]
    pub fn new() -> Self {
        Self {
            include_ipv4: true,
            include_ipv6: true,
            include_private: true,
            include_loopback: true,
            include_broadcast: true,
        }
    }

    /// Enables or disables IPv4 matching.
    pub fn ipv4(&mut self, include: bool) -> &mut Self {
        self.include_ipv4 = include;
        self
    }

    /// Enables or disables IPv6 matching.
    pub fn ipv6(&mut self, include: bool) -> &mut Self {
        self.include_ipv6 = include;
        self
    }

    /// Sets whether private addresses are reported.
    pub fn private_ips(&mut self, include: bool) -> &mut Self {
        self.include_private = include;
        self
    }

    /// Sets whether loopback addresses are reported.
    pub fn loopback_ips(&mut self, include: bool) -> &mut Self {
        self.include_loopback = include;
        self
    }

    /// Sets whether broadcast addresses are reported.
    pub fn broadcast_ips(&mut self, include: bool) -> &mut Self {
        self.include_broadcast = include;
        self
    }

    /// Shorthand for `private_ips(false)`.
    pub fn ignore_private(&mut self) -> &mut Self {
        self.private_ips(false)
    }

    /// Shorthand for `loopback_ips(false)`.
    pub fn ignore_loopback(&mut self) -> &mut Self {
        self.loopback_ips(false)
    }

    /// Shorthand for `broadcast_ips(false)`.
    pub fn ignore_broadcast(&mut self) -> &mut Self {
        self.broadcast_ips(false)
    }

    /// Disables private, loopback and broadcast addresses in one call.
    pub fn only_public(&mut self) -> &mut Self {
        self.private_ips(false)
            .loopback_ips(false)
            .broadcast_ips(false)
    }

    /// Builds an [`Extractor`] from the current settings.
    ///
    /// The validator array is ordered so that slot 0 holds the validator for the
    /// selected family (IPv4 first when both families are enabled).
    ///
    /// # Errors
    ///
    /// Fails with "No IP address patterns selected" when both IPv4 and IPv6 are
    /// disabled.
    pub fn build(&self) -> anyhow::Result<Extractor> {
        let Self {
            include_ipv4,
            include_ipv6,
            include_private,
            include_loopback,
            include_broadcast,
        } = *self;

        let v4 = ValidatorType::IPv4 {
            include_private,
            include_loopback,
            include_broadcast,
        };
        let v6 = ValidatorType::IPv6 {
            include_private,
            include_loopback,
        };

        let (dfa, validators) = match (include_ipv4, include_ipv6) {
            (false, false) => anyhow::bail!("No IP address patterns selected"),
            (true, true) => (get_both_dfa(), [v4, v6]),
            (true, false) => (get_ipv4_dfa(), [v4, v6]),
            (false, true) => (get_ipv6_dfa(), [v6, v4]),
        };

        Ok(Extractor { dfa, validators })
    }
}
/// Returns `true` when `bytes` is a strict dotted-quad IPv4 address that passes
/// the given filters.
///
/// * `include_private` — when `false`, private addresses are rejected.
/// * `include_loopback` — when `false`, loopback addresses are rejected.
/// * `include_broadcast` — when `false`, broadcast and link-local addresses are
///   rejected.
#[inline]
fn validate_ipv4(
    bytes: &[u8],
    include_private: bool,
    include_loopback: bool,
    include_broadcast: bool,
) -> bool {
    match parse_ipv4_bytes(bytes) {
        None => false,
        Some(addr) => {
            let private_ok = include_private || !addr.is_private();
            let loopback_ok = include_loopback || !addr.is_loopback();
            let broadcast_ok =
                include_broadcast || !(addr.is_broadcast() || addr.is_link_local());
            private_ok && loopback_ok && broadcast_ok
        }
    }
}
/// Extracts every IP address found in `haystack` using the default settings.
///
/// Each string is the raw matched text, so it includes any bracket characters
/// that were part of the match.
///
/// # Errors
///
/// Propagates any error from [`ExtractorBuilder::build`].
pub fn extract(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
    let extractor = ExtractorBuilder::new().build()?;
    let mut found = Vec::new();
    for span in extractor.find_iter(haystack) {
        found.push(String::from_utf8_lossy(&haystack[span]).into_owned());
    }
    Ok(found)
}
/// Extracts the distinct IP addresses found in `haystack`, in order of first
/// appearance, using the default settings.
///
/// Uniqueness is decided on the raw matched text, which includes any bracket
/// characters that were part of the match.
///
/// # Errors
///
/// Propagates any error from [`ExtractorBuilder::build`].
pub fn extract_unique(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
    use std::collections::HashSet;

    let extractor = ExtractorBuilder::new().build()?;
    let mut seen = HashSet::new();
    let unique: Vec<String> = extractor
        .find_iter(haystack)
        .map(|span| String::from_utf8_lossy(&haystack[span]).into_owned())
        // `insert` returns false for text that was already recorded.
        .filter(|text| seen.insert(text.clone()))
        .collect();
    Ok(unique)
}
/// Extracts every IP address found in `haystack` as a parsed [`IpAddr`], using
/// the default settings.
///
/// Matches may contain `[` / `]` characters (they are validated only after the
/// brackets are stripped), so the bracket-stripped text from
/// [`IpMatch::as_str`] is parsed rather than the raw haystack range. Parsing the
/// raw range would fail for any bracketed match.
///
/// # Errors
///
/// Propagates any error from [`ExtractorBuilder::build`], and fails if a match
/// unexpectedly does not parse as an IP address.
pub fn extract_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
    let extractor = ExtractorBuilder::new().build()?;
    extractor
        .match_iter(haystack)
        .map(|m| {
            let text = m.as_str();
            text.parse::<IpAddr>()
                .map_err(|e| anyhow::anyhow!("Failed to parse IP '{text}': {e}"))
        })
        .collect()
}
/// Extracts the distinct IP addresses found in `haystack` as parsed [`IpAddr`]
/// values, in order of first appearance, using the default settings.
///
/// Matches may contain `[` / `]` characters (they are validated only after the
/// brackets are stripped), so the bracket-stripped text from
/// [`IpMatch::as_str`] is parsed rather than the raw haystack range. Parsing the
/// raw range would fail for any bracketed match.
///
/// # Errors
///
/// Propagates any error from [`ExtractorBuilder::build`], and fails if a match
/// unexpectedly does not parse as an IP address.
pub fn extract_unique_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
    use std::collections::HashSet;

    let extractor = ExtractorBuilder::new().build()?;
    let mut seen = HashSet::new();
    let mut result = Vec::new();
    for m in extractor.match_iter(haystack) {
        let text = m.as_str();
        let addr = text
            .parse::<IpAddr>()
            .map_err(|e| anyhow::anyhow!("Failed to parse IP '{text}': {e}"))?;
        // `insert` returns false for addresses that were already recorded.
        if seen.insert(addr) {
            result.push(addr);
        }
    }
    Ok(result)
}
/// Parses a strict dotted-quad IPv4 address from raw bytes.
///
/// Accepts exactly four decimal fields separated by `.`, each in `0..=255` and
/// without leading zeros (a lone `0` is allowed). Returns `None` for anything
/// else, including inputs shorter than 7 or longer than 15 bytes.
#[must_use]
#[inline]
pub fn parse_ipv4_bytes(bytes: &[u8]) -> Option<Ipv4Addr> {
    /// Parses one dot-separated field into an octet.
    fn octet(field: &[u8]) -> Option<u8> {
        match field {
            // An empty field means a leading, trailing or doubled dot.
            [] => None,
            // Leading zeros are not allowed in multi-digit fields.
            [b'0', _, ..] => None,
            _ => {
                let mut value = 0u16;
                for &b in field {
                    if !b.is_ascii_digit() {
                        return None;
                    }
                    value = value * 10 + u16::from(b - b'0');
                    // Bail out early so `value` can never overflow `u16`.
                    if value > 255 {
                        return None;
                    }
                }
                u8::try_from(value).ok()
            }
        }
    }

    // "0.0.0.0" is 7 bytes, "255.255.255.255" is 15.
    if !(7..=15).contains(&bytes.len()) {
        return None;
    }

    let mut fields = bytes.split(|&b| b == b'.');
    let mut octets = [0u8; 4];
    for slot in &mut octets {
        *slot = octet(fields.next()?)?;
    }
    // A fifth field means there were too many dots.
    if fields.next().is_some() {
        return None;
    }
    Some(Ipv4Addr::from(octets))
}
/// Returns `true` when the first octet of `ip` is `0xfc` or `0xfd`, i.e. the
/// address lies in `fc00::/7`.
#[inline]
fn is_unique_local(ip: &Ipv6Addr) -> bool {
    // Masking off the lowest bit of the first octet folds 0xfd onto 0xfc.
    (ip.octets()[0] & 0xfe) == 0xfc
}
/// Returns `true` when `bytes` is an IPv6 address that passes the given filters.
///
/// * `include_private` — when `false`, unicast link-local and unique-local
///   addresses are rejected.
/// * `include_loopback` — when `false`, the loopback address is rejected.
///
/// Inputs shorter than two bytes, inputs that are not valid UTF-8, and text that
/// parses as an IPv4 address are all rejected.
#[inline]
fn validate_ipv6(bytes: &[u8], include_private: bool, include_loopback: bool) -> bool {
    // The shortest IPv6 literal is "::".
    if bytes.len() < 2 {
        return false;
    }
    // Checked conversion: this runs *before* anything has validated the bytes,
    // so an unchecked conversion here would rest solely on what the DFA matched.
    let Ok(text) = std::str::from_utf8(bytes) else {
        return false;
    };
    let Ok(IpAddr::V6(ipv6)) = text.parse::<IpAddr>() else {
        return false;
    };
    if !include_private && (ipv6.is_unicast_link_local() || is_unique_local(&ipv6)) {
        return false;
    }
    if !include_loopback && ipv6.is_loopback() {
        return false;
    }
    true
}
/// Debug output lists only the validators; the `dfa` field is omitted.
impl std::fmt::Debug for Extractor {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Extractor");
        out.field("validators", &self.validators);
        out.finish()
    }
}