ip_extract/lib.rs
1//! High-performance IP address extraction and tagging engine.
2//!
3//! `ip-extract` provides a blazingly fast, configurable extractor for finding IPv4 and IPv6
4//! addresses in unstructured text. It achieves maximum throughput through:
5//!
6//! - **Compile-time DFA**: IP patterns are converted to dense Forward DFAs during build,
7//! eliminating runtime regex compilation and heap allocation.
8//! - **Zero-overhead scanning**: The DFA scans at O(n) with no backtracking; validation
9//! is performed only on candidates.
10//! - **Strict validation**: Deep checks eliminate false positives (e.g., `1.2.3.4.5` is rejected).
11//!
12//! ## Quick Start
13//!
14//! By default, **all IP addresses are extracted**:
15//!
16//! ```no_run
17//! use ip_extract::ExtractorBuilder;
18//!
19//! # fn main() -> anyhow::Result<()> {
20//! // Extract all IPs (default: includes private, loopback, broadcast)
21//! let extractor = ExtractorBuilder::new().build()?;
22//!
23//! let input = b"Connect from 192.168.1.1 to 2001:db8::1";
24//! for range in extractor.find_iter(input) {
25//! let ip = std::str::from_utf8(&input[range])?;
26//! println!("Found: {}", ip);
27//! }
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! ## Tagging and Output
33//!
34//! For more structured output (e.g., JSON), use the `Tagged` and `Tag` types:
35//!
36//! ```no_run
37//! use ip_extract::{ExtractorBuilder, Tagged, Tag};
38//!
39//! # fn main() -> anyhow::Result<()> {
40//! let extractor = ExtractorBuilder::new().build()?;
41//! let data = b"Server at 8.8.8.8";
42//! let mut tagged = Tagged::new(data);
43//!
44//! for range in extractor.find_iter(data) {
45//! let ip = std::str::from_utf8(&data[range.clone()])?;
46//! let tag = Tag::new(ip, ip).with_range(range);
47//! tagged = tagged.tag(tag);
48//! }
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ## Configuration
54//!
55//! Use `ExtractorBuilder` to filter specific IP categories:
56//!
57//! ```no_run
58//! use ip_extract::ExtractorBuilder;
59//!
60//! # fn main() -> anyhow::Result<()> {
61//! // Extract only publicly routable IPs
62//! let extractor = ExtractorBuilder::new()
63//! .only_public()
64//! .build()?;
65//!
66//! // Or use granular control
67//! let extractor = ExtractorBuilder::new()
68//! .ipv4(true) // Extract IPv4 (default: true)
69//! .ipv6(false) // Skip IPv6
70//! .ignore_private() // Skip RFC 1918 ranges
71//! .ignore_loopback() // Skip loopback (127.0.0.1, ::1)
72//! .build()?;
73//! # Ok(())
74//! # }
75//! ```
76//!
77//! ## Performance
78//!
79//! Typical throughput on modern hardware:
80//! - Dense IPs (mostly IP addresses): **160+ MiB/s**
81//! - Sparse logs (IPs mixed with text): **360+ MiB/s**
82//! - No IPs (pure scanning): **620+ MiB/s**
83//!
84//! See `benches/ip_benchmark.rs` for details.
85
86use std::io;
87use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
88use std::ops::Range;
89use std::sync::OnceLock;
90
91use regex_automata::dfa::dense::DFA;
92use regex_automata::dfa::Automaton;
93use regex_automata::Input;
94
95mod tag;
96pub use tag::{Tag, Tagged, TextData};
97
/// Whether a validated IP match is IPv4 or IPv6.
///
/// The extractor takes this from the validator selected by the DFA pattern ID,
/// so reading it never requires parsing the matched text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IpKind {
    /// The match was accepted by the IPv4 validator.
    V4,
    /// The match was accepted by the IPv6 validator.
    V6,
}
106
/// A validated IP address match within a haystack.
///
/// Provides zero-copy access to the matched bytes and their position within
/// the original haystack, plus the IP version. Parsing to [`IpAddr`] is
/// available via [`ip()`][IpMatch::ip] but not cached — callers who look up
/// the same IP repeatedly should cache at a higher level.
#[derive(Debug, Clone)]
pub struct IpMatch<'a> {
    /// The matched text, borrowed from the haystack. For defanged input this
    /// still contains the `[` / `]` bracket characters.
    bytes: &'a [u8],
    /// Byte offsets (`start..end`) of `bytes` within the haystack.
    range: Range<usize>,
    /// Address family reported by the validator that accepted the match.
    kind: IpKind,
}
119
120impl<'a> IpMatch<'a> {
121 /// The matched IP address as a byte slice.
122 ///
123 /// Zero-copy: this is a slice directly into the haystack.
124 #[inline]
125 pub fn as_bytes(&self) -> &'a [u8] {
126 self.bytes
127 }
128
129 /// The clean IP address as a string, with any defang brackets removed.
130 ///
131 /// For normal (fanged) input this is a zero-copy borrow (`Cow::Borrowed`).
132 /// For defanged input (e.g. `"192.168.1[.]50"`) this allocates and strips
133 /// brackets, returning `Cow::Owned("192.168.1.50")`.
134 ///
135 /// This is the right default for MMDB lookups, deduplication, output, and
136 /// parsing. For the raw matched text (which may contain brackets), use
137 /// [`as_matched_str`][Self::as_matched_str].
138 pub fn as_str(&self) -> std::borrow::Cow<'a, str> {
139 if memchr::memchr(b'[', self.bytes).is_none() {
140 // SAFETY: IP characters and brackets are all ASCII.
141 std::borrow::Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(self.bytes) })
142 } else {
143 let cleaned = strip_brackets(self.bytes);
144 // SAFETY: strip_brackets retains only IP characters (ASCII).
145 std::borrow::Cow::Owned(unsafe { String::from_utf8_unchecked(cleaned) })
146 }
147 }
148
149 /// The raw matched text as a string slice.
150 ///
151 /// Returns the exact bytes matched in the haystack — for defanged input,
152 /// this may include bracket characters (e.g. `"192.168.1[.]50"`). Use
153 /// [`as_str`][Self::as_str] when you need the canonical IP form.
154 ///
155 /// Zero-copy: this is a slice directly into the haystack. Safe without
156 /// UTF-8 validation because all matched characters (digits, hex, `.`, `:`,
157 /// `[`, `]`) are ASCII.
158 #[inline]
159 pub fn as_matched_str(&self) -> &'a str {
160 // SAFETY: IP characters and brackets are all ASCII.
161 unsafe { std::str::from_utf8_unchecked(self.bytes) }
162 }
163
164 /// The byte range of this match within the original haystack.
165 #[inline]
166 pub fn range(&self) -> Range<usize> {
167 self.range.clone()
168 }
169
170 /// Whether this match is IPv4 or IPv6.
171 #[inline]
172 pub fn kind(&self) -> IpKind {
173 self.kind
174 }
175
176 /// Parse the matched bytes into an [`IpAddr`].
177 ///
178 /// Automatically strips defang brackets before parsing — safe to call on
179 /// both normal and defanged matches. Not cached; callers processing the
180 /// same IP repeatedly should cache at a higher level.
181 ///
182 /// # Panics
183 ///
184 /// Panics if the validated bytes cannot be parsed as an IP address.
185 /// This should not happen in practice because matches are validated by the DFA.
186 pub fn ip(&self) -> IpAddr {
187 let s = self.as_str();
188 match self.kind {
189 IpKind::V4 => IpAddr::V4(s.parse::<Ipv4Addr>().expect("validated by DFA")),
190 IpKind::V6 => IpAddr::V6(s.parse::<Ipv6Addr>().expect("validated by DFA")),
191 }
192 }
193}
194
// Alignment wrapper: guarantees u32 alignment for DFA deserialization.
// DFA::from_bytes() requires the byte slice to be u32-aligned; include_bytes!() only
// guarantees byte alignment. Wrapping in repr(C, align(4)) satisfies this at compile time,
// with zero runtime cost: no allocation, no copy, no Box::leak.
//
// `T: ?Sized` lets a reference to a wrapped fixed-size array be used as
// `&AlignedDfa<[u8]>`, which is how the statics below are declared.
#[repr(C, align(4))]
struct AlignedDfa<T: ?Sized>(T);
201
// Serialized DFAs embedded at compile time from files in `OUT_DIR`
// (`ipv4.dfa`, `ipv6.dfa`, `both.dfa`), wrapped in `AlignedDfa` so the bytes
// are 4-byte aligned.
static IPV4_DFA_BYTES: &AlignedDfa<[u8]> =
    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv4.dfa")));
static IPV6_DFA_BYTES: &AlignedDfa<[u8]> =
    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv6.dfa")));
static BOTH_DFA_BYTES: &AlignedDfa<[u8]> =
    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/both.dfa")));

// Deserialized DFAs borrowing the embedded bytes. Each cell is filled at most
// once, on first use, by the matching `get_*_dfa` accessor.
static DFA_IPV4: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
static DFA_IPV6: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
static DFA_BOTH: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
212
213fn load_dfa(aligned: &'static AlignedDfa<[u8]>) -> DFA<&'static [u32]> {
214 let (dfa, _) = DFA::from_bytes(&aligned.0).expect("valid dfa from build.rs");
215 dfa
216}
217
218fn get_ipv4_dfa() -> &'static DFA<&'static [u32]> {
219 DFA_IPV4.get_or_init(|| load_dfa(IPV4_DFA_BYTES))
220}
221fn get_ipv6_dfa() -> &'static DFA<&'static [u32]> {
222 DFA_IPV6.get_or_init(|| load_dfa(IPV6_DFA_BYTES))
223}
224fn get_both_dfa() -> &'static DFA<&'static [u32]> {
225 DFA_BOTH.get_or_init(|| load_dfa(BOTH_DFA_BYTES))
226}
227
/// Per-family validation settings applied to each DFA candidate.
///
/// Each variant carries the inclusion flags that are forwarded to
/// `validate_ipv4` / `validate_ipv6`.
#[derive(Clone, Debug)]
enum ValidatorType {
    /// IPv4 candidate validation.
    IPv4 {
        /// Accept private addresses.
        include_private: bool,
        /// Accept loopback addresses.
        include_loopback: bool,
        /// Accept broadcast and link-local addresses.
        include_broadcast: bool,
    },
    /// IPv6 candidate validation. There is no broadcast flag for IPv6.
    IPv6 {
        /// Accept unique-local and unicast link-local addresses.
        include_private: bool,
        /// Accept the loopback address.
        include_loopback: bool,
    },
}
240
241impl ValidatorType {
242 #[inline(always)]
243 fn validate(&self, bytes: &[u8]) -> bool {
244 match *self {
245 ValidatorType::IPv4 {
246 include_private,
247 include_loopback,
248 include_broadcast,
249 } => validate_ipv4(bytes, include_private, include_loopback, include_broadcast),
250 ValidatorType::IPv6 {
251 include_private,
252 include_loopback,
253 } => validate_ipv6(bytes, include_private, include_loopback),
254 }
255 }
256
257 #[inline(always)]
258 fn kind(&self) -> IpKind {
259 match self {
260 ValidatorType::IPv4 { .. } => IpKind::V4,
261 ValidatorType::IPv6 { .. } => IpKind::V6,
262 }
263 }
264}
265
/// The main IP address extractor.
///
/// An `Extractor` scans byte slices for IPv4 and/or IPv6 addresses, applying configurable
/// filters to include or exclude certain address classes (private, loopback, broadcast).
///
/// Extractors are best created via [`ExtractorBuilder`] and are designed to be reused
/// across many calls to `find_iter` for maximum efficiency.
///
/// # Bytes vs. Strings
///
/// This extractor works directly on byte slices rather than strings. This avoids UTF-8
/// validation overhead and enables zero-copy scanning of very large inputs.
///
/// # Performance
///
/// The extractor uses a compile-time DFA (Deterministic Finite Automaton) for O(n)
/// scanning with minimal overhead. See the crate-level documentation for throughput benchmarks.
pub struct Extractor {
    /// The shared, statically stored DFA chosen by the builder (IPv4-only,
    /// IPv6-only, or both).
    dfa: &'static DFA<&'static [u32]>,
    /// Validators indexed by the DFA's pattern ID. When the DFA has a single
    /// pattern only index 0 is looked up.
    validators: [ValidatorType; 2],
}
287
288impl Extractor {
289 /// Find all IP addresses in a byte slice.
290 ///
291 /// Returns an iterator of byte ranges `[start, end)` pointing to each IP
292 /// address found. Ranges are guaranteed to be valid indices into `haystack`.
293 ///
294 /// For richer match information (IP version, direct string access), use
295 /// [`match_iter`][Extractor::match_iter] instead.
296 ///
297 /// # Example
298 ///
299 /// ```no_run
300 /// use ip_extract::ExtractorBuilder;
301 ///
302 /// # fn main() -> anyhow::Result<()> {
303 /// let extractor = ExtractorBuilder::new().build()?;
304 /// let data = b"Connecting from 192.168.1.1";
305 ///
306 /// for range in extractor.find_iter(data) {
307 /// let ip = std::str::from_utf8(&data[range])?;
308 /// println!("Found: {ip}");
309 /// }
310 /// # Ok(())
311 /// # }
312 /// ```
313 #[inline]
314 pub fn find_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = Range<usize>> + 'a {
315 self.match_iter(haystack).map(|m| m.range())
316 }
317
    /// Find all IP addresses in a byte slice, yielding rich [`IpMatch`] values.
    ///
    /// Like [`find_iter`][Extractor::find_iter], but each match carries the
    /// matched bytes, their position in the haystack, and the IP version —
    /// eliminating the need to re-parse or guess the version at the call site.
    ///
    /// Iteration ends when the DFA search reports no further match or returns
    /// an error. Candidates that fail the boundary checks or validation are
    /// skipped silently.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use ip_extract::ExtractorBuilder;
    ///
    /// # fn main() -> anyhow::Result<()> {
    /// let extractor = ExtractorBuilder::new().build()?;
    /// let data = b"Log: 192.168.1.1 sent request to 2001:db8::1";
    ///
    /// for m in extractor.match_iter(data) {
    ///     println!("{} ({:?})", m.as_matched_str(), m.kind());
    /// }
    /// # Ok(())
    /// # }
    /// ```
    #[inline]
    pub fn match_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = IpMatch<'a>> + 'a {
        let mut input = Input::new(haystack);

        std::iter::from_fn(move || loop {
            // Both "no match" and a search error terminate the iterator.
            let Ok(Some(m)) = self.dfa.try_search_fwd(&input) else {
                return None;
            };

            // Only the end offset of the match is taken from the DFA; the
            // start is recovered below by scanning backwards from `end`.
            let end = m.offset();
            let pid = m.pattern().as_usize();
            let validator = &self.validators[pid];

            // Resume the next search after this match, whether or not the
            // candidate ends up being accepted.
            input.set_start(end);

            // Bracket-aware boundary scan (defang always-on: [.] and [:] are valid IP chars).
            // The lookback is capped at 55 bytes: wider than a plain address to leave
            // room for bracket notation (max defanged IPv6 ≈ 53 chars).
            let floor = end.saturating_sub(55);
            // Walk backwards to the first position whose preceding byte is not an
            // IP/bracket character (or to offset 0); fall back to `floor` if the
            // whole window consists of such characters.
            let raw_start = (floor..end)
                .rev()
                .find(|&i| i == 0 || !is_ip_or_bracket_char(haystack[i - 1]))
                .unwrap_or(floor);

            // A lone `[` at the start of the candidate is a surrounding bracket (e.g. "[3.3.3.3]"),
            // not a defang bracket. Defang brackets always surround a separator character:
            // `[.]`, `[:]`, or `[::]`. Skip a leading `[` that is followed by a digit or hex
            // character (not `.` or `:`), since that pattern is never valid defang notation.
            //
            // Additionally, handle the RFC 5321 SMTP `IPv6:` tag prefix. MTAs write IPv6
            // addresses as `[IPv6:2001:db8::1]`. The lookback stops at `v` (non-hex letter)
            // leaving the candidate as `6:2001:db8::1`. Detect this via the heuristic: if the
            // candidate starts with a single hex digit immediately followed by `:`, and the
            // character immediately before the candidate in the haystack is a non-hex ASCII
            // letter, skip that `x:` prefix. This specifically targets the `IPv6:` suffix
            // pattern (`…v6:` → skip `6:`).
            let start = if raw_start < end
                && haystack[raw_start] == b'['
                && raw_start + 1 < end
                && haystack[raw_start + 1] != b'.'
                && haystack[raw_start + 1] != b':'
            {
                raw_start + 1
            } else if raw_start > 0 && raw_start + 1 < end && haystack[raw_start + 1] == b':' && {
                let prev = haystack[raw_start - 1];
                prev.is_ascii_alphabetic() && !matches!(prev, b'a'..=b'f' | b'A'..=b'F')
            } {
                // Skip the single-hex-char + `:` prefix (e.g. `6:` from `IPv6:`).
                raw_start + 2
            } else {
                raw_start
            };

            // Right-boundary check: only relevant when a byte follows the match;
            // a match that ends at the end of the haystack is always accepted here.
            let valid_right_boundary = match end.cmp(&haystack.len()) {
                std::cmp::Ordering::Less => {
                    let next = haystack[end];
                    match validator {
                        // IPv4: reject when the match is followed by another digit,
                        // or by `.` plus a digit (the address would continue).
                        ValidatorType::IPv4 { .. } => {
                            !(next.is_ascii_digit()
                                || next == b'.'
                                    && end + 1 < haystack.len()
                                    && haystack[end + 1].is_ascii_digit())
                        }
                        // `]` is allowed as a right boundary for IPv6. In SMTP literal
                        // notation (RFC 5321) addresses appear as `[2001:db8::1]` or
                        // `[IPv6:2001:db8::1]`. In defang notation brackets only appear
                        // in the middle of the address (`[:]`), never at the very end of
                        // the DFA match, so a trailing `]` is always a closing bracket.
                        ValidatorType::IPv6 { .. } => {
                            !matches!(next, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F'
                                | b'.' | b':' | b'[')
                        }
                    }
                }
                _ => true,
            };

            if !valid_right_boundary {
                continue;
            }

            let candidate = &haystack[start..end];

            // Strip brackets before validation (handles both fanged and defanged input).
            // On normal (fanged) input, memchr scans ~7-15 bytes per match and finds
            // nothing — falling straight to the else branch with no allocation. The
            // strip_brackets path only runs when brackets are actually present.
            //
            // In both branches the yielded match keeps the raw `candidate` bytes
            // (brackets included); only the validation input is cleaned.
            if memchr::memchr(b'[', candidate).is_some() {
                let cleaned = strip_brackets(candidate);
                if validator.validate(&cleaned) {
                    return Some(IpMatch {
                        bytes: candidate,
                        range: start..end,
                        kind: validator.kind(),
                    });
                }
            } else if validator.validate(candidate) {
                return Some(IpMatch {
                    bytes: candidate,
                    range: start..end,
                    kind: validator.kind(),
                });
            }
            // Validation failed: fall through and let the loop search again.
        })
    }
443
444 /// Scan `haystack` for IP addresses, writing non-IP text to `wtr` and
445 /// calling `replacer` for each match.
446 ///
447 /// This is the efficient single-pass decoration primitive: the caller
448 /// never needs to track byte offsets or manage gap writes. The replacer
449 /// writes the substitution directly to `wtr` — no intermediate allocation.
450 ///
451 /// Returns the number of IP addresses found.
452 ///
453 /// # Errors
454 ///
455 /// Returns the first `io::Error` from either a gap write or the replacer.
456 ///
457 /// # Example
458 ///
459 /// ```no_run
460 /// use ip_extract::ExtractorBuilder;
461 /// use std::io::Write;
462 ///
463 /// # fn main() -> anyhow::Result<()> {
464 /// let extractor = ExtractorBuilder::new().build()?;
465 /// let data = b"Server 192.168.1.1 is up";
466 /// let mut out = Vec::new();
467 ///
468 /// let count = extractor.replace_iter(data, &mut out, |m, w| {
469 /// write!(w, "[{}]", m.as_matched_str())
470 /// })?;
471 ///
472 /// assert_eq!(count, 1);
473 /// assert_eq!(out, b"Server [192.168.1.1] is up");
474 /// # Ok(())
475 /// # }
476 /// ```
477 pub fn replace_iter<W, F>(
478 &self,
479 haystack: &[u8],
480 wtr: &mut W,
481 mut replacer: F,
482 ) -> io::Result<usize>
483 where
484 W: io::Write,
485 F: FnMut(&IpMatch, &mut W) -> io::Result<()>,
486 {
487 let mut last = 0;
488 let mut count = 0;
489
490 for m in self.match_iter(haystack) {
491 let range = m.range();
492 wtr.write_all(&haystack[last..range.start])?;
493 replacer(&m, wtr)?;
494 last = range.end;
495 count += 1;
496 }
497
498 wtr.write_all(&haystack[last..])?;
499 Ok(count)
500 }
501}
502
/// Report whether `b` may appear inside an IP candidate: an ASCII hex digit,
/// `.`, `:`, or one of the defang brackets `[` / `]`.
#[inline(always)]
fn is_ip_or_bracket_char(b: u8) -> bool {
    b.is_ascii_hexdigit() || matches!(b, b'.' | b':' | b'[' | b']')
}
508
/// Return a copy of `bytes` with every `[` and `]` removed.
///
/// This normalizes defanged notation such as `192[.]168[.]1[.]1` to
/// `192.168.1.1` so the result can be handed to the standard validators.
fn strip_brackets(bytes: &[u8]) -> Vec<u8> {
    let mut cleaned = Vec::with_capacity(bytes.len());
    cleaned.extend(
        bytes
            .iter()
            .copied()
            .filter(|byte| !matches!(byte, b'[' | b']')),
    );
    cleaned
}
522
/// A builder for configuring IP extraction behavior.
///
/// Use `ExtractorBuilder` to specify which types of IP addresses should be extracted.
/// By default (see [`ExtractorBuilder::new`]) it extracts both IPv4 and IPv6 and
/// **includes** private, loopback, and broadcast addresses; use the `ignore_*`
/// methods or `only_public` to filter categories out.
///
/// # Example
///
/// ```no_run
/// use ip_extract::ExtractorBuilder;
///
/// # fn main() -> anyhow::Result<()> {
/// let extractor = ExtractorBuilder::new()
///     .ipv4(true)
///     .ipv6(false)           // Only IPv4
///     .private_ips(true)     // Include private ranges
///     .build()?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ExtractorBuilder {
    /// Extract IPv4 addresses.
    include_ipv4: bool,
    /// Extract IPv6 addresses.
    include_ipv6: bool,
    /// Keep private addresses in the results.
    include_private: bool,
    /// Keep loopback addresses in the results.
    include_loopback: bool,
    /// Keep broadcast (and IPv4 link-local) addresses in the results.
    include_broadcast: bool,
}
550
551impl Default for ExtractorBuilder {
552 fn default() -> Self {
553 Self::new()
554 }
555}
556
557impl ExtractorBuilder {
558 /// Create a new builder with default settings.
559 ///
560 /// By default, **all IP addresses are extracted** (principle of least surprise).
561 /// Use `.only_public()` or `.ignore_*()` methods to filter specific categories.
562 ///
563 /// Defaults:
564 /// - IPv4: enabled
565 /// - IPv6: enabled
566 /// - Private IPs: **enabled** (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fc00::/7)
567 /// - Loopback IPs: **enabled** (127.0.0.0/8, ::1)
568 /// - Broadcast IPs: **enabled** (255.255.255.255, link-local)
569 ///
570 /// # Examples
571 ///
572 /// ```no_run
573 /// use ip_extract::ExtractorBuilder;
574 ///
575 /// # fn main() -> anyhow::Result<()> {
576 /// // Extract all IPs (default)
577 /// let extractor = ExtractorBuilder::new().build()?;
578 ///
579 /// // Extract only public IPs
580 /// let extractor = ExtractorBuilder::new().only_public().build()?;
581 ///
582 /// // Granular control
583 /// let extractor = ExtractorBuilder::new()
584 /// .ignore_private()
585 /// .ignore_loopback()
586 /// .build()?;
587 /// # Ok(())
588 /// # }
589 /// ```
590 #[must_use]
591 pub fn new() -> Self {
592 Self {
593 include_ipv4: true,
594 include_ipv6: true,
595 include_private: true,
596 include_loopback: true,
597 include_broadcast: true,
598 }
599 }
600 /// Enable or disable IPv4 address extraction.
601 ///
602 /// Default: `true`
603 pub fn ipv4(&mut self, include: bool) -> &mut Self {
604 self.include_ipv4 = include;
605 self
606 }
607
608 /// Enable or disable IPv6 address extraction.
609 ///
610 /// Default: `true`
611 pub fn ipv6(&mut self, include: bool) -> &mut Self {
612 self.include_ipv6 = include;
613 self
614 }
615
616 /// Include private IP addresses (RFC 1918 for IPv4, ULA for IPv6).
617 ///
618 /// Private ranges include:
619 /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
620 /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
621 ///
622 /// Default: `true`
623 pub fn private_ips(&mut self, include: bool) -> &mut Self {
624 self.include_private = include;
625 self
626 }
627
628 /// Include loopback addresses.
629 ///
630 /// Loopback ranges:
631 /// - IPv4: 127.0.0.0/8
632 /// - IPv6: ::1
633 ///
634 /// Default: `true`
635 pub fn loopback_ips(&mut self, include: bool) -> &mut Self {
636 self.include_loopback = include;
637 self
638 }
639
640 /// Include broadcast addresses.
641 ///
642 /// Covers:
643 /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
644 /// - IPv6: link-local and other special ranges
645 ///
646 /// Default: `true`
647 pub fn broadcast_ips(&mut self, include: bool) -> &mut Self {
648 self.include_broadcast = include;
649 self
650 }
651
652 /// Ignore private IP addresses (convenience for `.private_ips(false)`).
653 ///
654 /// Excludes:
655 /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
656 /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
657 pub fn ignore_private(&mut self) -> &mut Self {
658 self.include_private = false;
659 self
660 }
661
662 /// Ignore loopback addresses (convenience for `.loopback_ips(false)`).
663 ///
664 /// Excludes:
665 /// - IPv4: 127.0.0.0/8
666 /// - IPv6: ::1
667 pub fn ignore_loopback(&mut self) -> &mut Self {
668 self.include_loopback = false;
669 self
670 }
671
672 /// Ignore broadcast addresses (convenience for `.broadcast_ips(false)`).
673 ///
674 /// Excludes:
675 /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
676 /// - IPv6: link-local and other special ranges
677 pub fn ignore_broadcast(&mut self) -> &mut Self {
678 self.include_broadcast = false;
679 self
680 }
681
682 /// Extract only publicly routable IP addresses.
683 ///
684 /// This is a convenience method equivalent to:
685 /// ```
686 /// # use ip_extract::ExtractorBuilder;
687 /// # let mut builder = ExtractorBuilder::new();
688 /// builder
689 /// .ignore_private()
690 /// .ignore_loopback()
691 /// .ignore_broadcast();
692 /// ```
693 ///
694 /// Excludes:
695 /// - Private: RFC 1918 (IPv4), ULA (IPv6)
696 /// - Loopback: 127.0.0.0/8, ::1
697 /// - Broadcast: 255.255.255.255, link-local ranges
698 ///
699 /// # Example
700 ///
701 /// ```no_run
702 /// use ip_extract::ExtractorBuilder;
703 ///
704 /// # fn main() -> anyhow::Result<()> {
705 /// let extractor = ExtractorBuilder::new()
706 /// .only_public()
707 /// .build()?;
708 /// # Ok(())
709 /// # }
710 /// ```
711 pub fn only_public(&mut self) -> &mut Self {
712 self.include_private = false;
713 self.include_loopback = false;
714 self.include_broadcast = false;
715 self
716 }
717
718 /// Build and return an `Extractor` with the configured settings.
719 ///
720 /// # Errors
721 ///
722 /// Returns an error if no IP version (IPv4 or IPv6) is enabled. At least one
723 /// must be selected.
724 ///
725 /// # Example
726 ///
727 /// ```no_run
728 /// use ip_extract::ExtractorBuilder;
729 ///
730 /// # fn main() -> anyhow::Result<()> {
731 /// let extractor = ExtractorBuilder::new()
732 /// .ipv4(true)
733 /// .ipv6(true)
734 /// .build()?;
735 /// # Ok(())
736 /// # }
737 /// ```
738 pub fn build(&self) -> anyhow::Result<Extractor> {
739 let ipv4 = ValidatorType::IPv4 {
740 include_private: self.include_private,
741 include_loopback: self.include_loopback,
742 include_broadcast: self.include_broadcast,
743 };
744 let ipv6 = ValidatorType::IPv6 {
745 include_private: self.include_private,
746 include_loopback: self.include_loopback,
747 };
748 // Pattern IDs assigned by build_many order: 0 = IPv4, 1 = IPv6.
749 // All DFAs are defang-aware (match both normal and bracket notation).
750 // validators[pid] must stay in sync with build.rs build_many order.
751 let (dfa, validators) = match (self.include_ipv4, self.include_ipv6) {
752 (true, true) => (get_both_dfa(), [ipv4, ipv6]),
753 (true, false) => (get_ipv4_dfa(), [ipv4, ipv6]),
754 // ipv6_only DFA has a single pattern: pid=0 maps to IPv6
755 (false, true) => (get_ipv6_dfa(), [ipv6, ipv4]),
756 _ => anyhow::bail!("No IP address patterns selected"),
757 };
758 Ok(Extractor { dfa, validators })
759 }
760}
761
762/// Validate an IPv4 address from a byte slice, applying filters.
763///
764/// This function uses `parse_ipv4_bytes` for strict validation and then checks
765/// against the provided inclusion filters.
766///
767/// # Arguments
768///
769/// * `bytes` - Candidate byte slice to validate.
770/// * `include_private` - Whether to include RFC 1918 addresses.
771/// * `include_loopback` - Whether to include 127.0.0.0/8 addresses.
772/// * `include_broadcast` - Whether to include broadcast and link-local addresses.
773#[inline]
774fn validate_ipv4(
775 bytes: &[u8],
776 include_private: bool,
777 include_loopback: bool,
778 include_broadcast: bool,
779) -> bool {
780 let Some(ipv4) = parse_ipv4_bytes(bytes) else {
781 return false;
782 };
783
784 if !include_private && ipv4.is_private() {
785 return false;
786 }
787 if !include_loopback && ipv4.is_loopback() {
788 return false;
789 }
790 if !include_broadcast && (ipv4.is_broadcast() || ipv4.is_link_local()) {
791 return false;
792 }
793 true
794}
795
796/// Extract all IPv4 and IPv6 addresses from input, returning them as strings.
797///
798/// This is a convenience function that uses default settings (all IP types included).
799/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
800///
801/// # Errors
802///
803/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
804///
805/// # Example
806///
807/// ```no_run
808/// use ip_extract::extract;
809///
810/// # fn main() -> anyhow::Result<()> {
811/// let ips = extract(b"Server at 192.168.1.1 and 2001:db8::1")?;
812/// assert_eq!(ips, vec!["192.168.1.1", "2001:db8::1"]);
813/// # Ok(())
814/// # }
815/// ```
816pub fn extract(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
817 let extractor = ExtractorBuilder::new().build()?;
818 Ok(extractor
819 .find_iter(haystack)
820 .map(|range| String::from_utf8_lossy(&haystack[range]).to_string())
821 .collect())
822}
823
824/// Extract unique IPv4 and IPv6 addresses from input, returning them as strings.
825///
826/// Maintains order of first observation (not lexicographic order).
827/// This is a convenience function that uses default settings (all IP types included).
828/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
829///
830/// # Errors
831///
832/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
833///
834/// # Example
835///
836/// ```no_run
837/// use ip_extract::extract_unique;
838///
839/// # fn main() -> anyhow::Result<()> {
840/// let ips = extract_unique(b"Server at 192.168.1.1, another at 192.168.1.1")?;
841/// assert_eq!(ips, vec!["192.168.1.1"]);
842/// # Ok(())
843/// # }
844/// ```
845pub fn extract_unique(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
846 use std::collections::HashSet;
847
848 let extractor = ExtractorBuilder::new().build()?;
849 let mut seen = HashSet::new();
850 let mut result = Vec::new();
851
852 for range in extractor.find_iter(haystack) {
853 let ip_str = String::from_utf8_lossy(&haystack[range]).to_string();
854 if seen.insert(ip_str.clone()) {
855 result.push(ip_str);
856 }
857 }
858
859 Ok(result)
860}
861
862/// Extract all IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
863///
864/// This is a convenience function that uses default settings (all IP types included).
865/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
866///
867/// # Errors
868///
869/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
870/// or if an extracted address cannot be parsed (should not happen in practice).
871///
872/// # Example
873///
874/// ```no_run
875/// use ip_extract::extract_parsed;
876///
877/// # fn main() -> anyhow::Result<()> {
878/// let ips = extract_parsed(b"Server at 192.168.1.1 and 2001:db8::1")?;
879/// assert_eq!(ips.len(), 2);
880/// assert!(ips[0].is_ipv4());
881/// assert!(ips[1].is_ipv6());
882/// # Ok(())
883/// # }
884/// ```
885pub fn extract_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
886 let extractor = ExtractorBuilder::new().build()?;
887 extractor
888 .find_iter(haystack)
889 .map(|range| {
890 let s = std::str::from_utf8(&haystack[range])
891 .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
892 s.parse::<IpAddr>()
893 .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))
894 })
895 .collect()
896}
897
898/// Extract unique IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
899///
900/// Maintains order of first observation (not lexicographic order).
901/// This is a convenience function that uses default settings (all IP types included).
902/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
903///
904/// # Errors
905///
906/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
907/// or if an extracted address cannot be parsed (should not happen in practice).
908///
909/// # Example
910///
911/// ```no_run
912/// use ip_extract::extract_unique_parsed;
913///
914/// # fn main() -> anyhow::Result<()> {
915/// let ips = extract_unique_parsed(b"Server at 192.168.1.1, another at 192.168.1.1")?;
916/// assert_eq!(ips.len(), 1);
917/// assert!(ips[0].is_ipv4());
918/// # Ok(())
919/// # }
920/// ```
921pub fn extract_unique_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
922 use std::collections::HashSet;
923
924 let extractor = ExtractorBuilder::new().build()?;
925 let mut seen = HashSet::new();
926 let mut result = Vec::new();
927
928 for range in extractor.find_iter(haystack) {
929 let s = std::str::from_utf8(&haystack[range])
930 .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
931 let addr = s
932 .parse::<IpAddr>()
933 .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))?;
934 if seen.insert(addr) {
935 result.push(addr);
936 }
937 }
938
939 Ok(result)
940}
941
/// Parse a dotted-quad IPv4 address (e.g. `192.168.1.1`) from raw bytes.
///
/// Parsing is strict. `None` is returned when:
/// - the input is shorter than 7 or longer than 15 bytes,
/// - there are not exactly four `.`-separated fields,
/// - a field is empty or contains a non-digit,
/// - a field has a leading zero (e.g. `192.168.001.1`),
/// - a field's value exceeds 255.
///
/// # Example
///
/// ```
/// use ip_extract::parse_ipv4_bytes;
///
/// assert_eq!(parse_ipv4_bytes(b"192.168.1.1"), Some("192.168.1.1".parse().unwrap()));
/// assert_eq!(parse_ipv4_bytes(b"256.1.1.1"), None);  // Out of range
/// assert_eq!(parse_ipv4_bytes(b"192.168.01.1"), None); // Leading zero
/// ```
#[must_use]
#[inline]
pub fn parse_ipv4_bytes(bytes: &[u8]) -> Option<Ipv4Addr> {
    // Decode one decimal field: 1-3 ASCII digits, no leading zero unless the
    // field is exactly "0", value at most 255.
    fn octet(field: &[u8]) -> Option<u8> {
        let well_formed = matches!(field.len(), 1..=3)
            && field.iter().all(u8::is_ascii_digit)
            && (field.len() == 1 || field[0] != b'0');
        if !well_formed {
            return None;
        }
        // At most three digits, so the sum cannot exceed 999 and fits in u16.
        let value = field
            .iter()
            .fold(0u16, |acc, &digit| acc * 10 + u16::from(digit - b'0'));
        u8::try_from(value).ok()
    }

    if !(7..=15).contains(&bytes.len()) {
        return None;
    }

    let mut fields = bytes.split(|&b| b == b'.');
    let mut octets = [0u8; 4];
    for slot in &mut octets {
        *slot = octet(fields.next()?)?;
    }

    // A fifth field (including an empty one after a trailing dot) is an error.
    match fields.next() {
        None => Some(Ipv4Addr::from(octets)),
        Some(_) => None,
    }
}
1006
/// Check if an IPv6 address is a Unique Local Address (ULA) per RFC 4193.
/// ULA addresses are in the fc00::/7 range (fc00:: to fdff::).
#[inline]
fn is_unique_local(ip: &Ipv6Addr) -> bool {
    // A /7 prefix fixes the top seven bits of the first octet; masking off
    // the eighth bit leaves 0xfc for both 0xfc and 0xfd.
    (ip.octets()[0] & 0xfe) == 0xfc
}

/// Validate an IPv6 address from a byte slice, applying filters.
///
/// The bytes are checked to be valid UTF-8 and then parsed as an
/// [`Ipv6Addr`]; anything that fails either step (including text that is an
/// IPv4 address) is rejected. The conversion is checked rather than
/// `unsafe`, so this function is sound for arbitrary input, not only for
/// candidates pre-filtered by the DFA.
///
/// # Arguments
///
/// * `bytes` - Candidate byte slice to validate.
/// * `include_private` - Whether to include ULA and link-local addresses.
/// * `include_loopback` - Whether to include the loopback address (`::1`).
#[inline]
fn validate_ipv6(bytes: &[u8], include_private: bool, include_loopback: bool) -> bool {
    // The shortest IPv6 literal is `::`.
    if bytes.len() < 2 {
        return false;
    }
    let Ok(text) = std::str::from_utf8(bytes) else {
        return false;
    };
    let Ok(ipv6) = text.parse::<Ipv6Addr>() else {
        return false;
    };

    if !include_private && (ipv6.is_unicast_link_local() || is_unique_local(&ipv6)) {
        return false;
    }
    if !include_loopback && ipv6.is_loopback() {
        return false;
    }
    true
}
1048
1049impl std::fmt::Debug for Extractor {
1050 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1051 f.debug_struct("Extractor")
1052 .field("validators", &self.validators)
1053 .finish()
1054 }
1055}