ip_extract/lib.rs
1//! High-performance IP address extraction and tagging engine.
2//!
3//! `ip-extract` provides a blazingly fast, configurable extractor for finding IPv4 and IPv6
4//! addresses in unstructured text. It achieves maximum throughput through:
5//!
6//! - **Compile-time DFA**: IP patterns are converted to dense Forward DFAs during build,
7//! eliminating runtime regex compilation and heap allocation.
8//! - **Zero-overhead scanning**: The DFA scans at O(n) with no backtracking; validation
9//! is performed only on candidates.
10//! - **Strict validation**: Deep checks eliminate false positives (e.g., `1.2.3.4.5` is rejected).
11//!
12//! ## Quick Start
13//!
14//! By default, **all IP addresses are extracted**:
15//!
16//! ```no_run
17//! use ip_extract::ExtractorBuilder;
18//!
19//! # fn main() -> anyhow::Result<()> {
20//! // Extract all IPs (default: includes private, loopback, broadcast)
21//! let extractor = ExtractorBuilder::new().build()?;
22//!
23//! let input = b"Connect from 192.168.1.1 to 2001:db8::1";
24//! for range in extractor.find_iter(input) {
25//! let ip = std::str::from_utf8(&input[range])?;
26//! println!("Found: {}", ip);
27//! }
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! ## Tagging and Output
33//!
34//! For more structured output (e.g., JSON), use the `Tagged` and `Tag` types:
35//!
36//! ```no_run
37//! use ip_extract::{ExtractorBuilder, Tagged, Tag};
38//!
39//! # fn main() -> anyhow::Result<()> {
40//! let extractor = ExtractorBuilder::new().build()?;
41//! let data = b"Server at 8.8.8.8";
42//! let mut tagged = Tagged::new(data);
43//!
44//! for range in extractor.find_iter(data) {
45//! let ip = std::str::from_utf8(&data[range.clone()])?;
46//! let tag = Tag::new(ip, ip).with_range(range);
47//! tagged = tagged.tag(tag);
48//! }
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ## Configuration
54//!
55//! Use `ExtractorBuilder` to filter specific IP categories:
56//!
57//! ```no_run
58//! use ip_extract::ExtractorBuilder;
59//!
60//! # fn main() -> anyhow::Result<()> {
61//! // Extract only publicly routable IPs
62//! let extractor = ExtractorBuilder::new()
63//! .only_public()
64//! .build()?;
65//!
66//! // Or use granular control
67//! let extractor = ExtractorBuilder::new()
68//! .ipv4(true) // Extract IPv4 (default: true)
69//! .ipv6(false) // Skip IPv6
70//! .ignore_private() // Skip RFC 1918 ranges
71//! .ignore_loopback() // Skip loopback (127.0.0.1, ::1)
72//! .build()?;
73//! # Ok(())
74//! # }
75//! ```
76//!
77//! ## Performance
78//!
79//! Typical throughput on modern hardware:
80//! - Dense IPs (mostly IP addresses): **160+ MiB/s**
81//! - Sparse logs (IPs mixed with text): **360+ MiB/s**
82//! - No IPs (pure scanning): **620+ MiB/s**
83//!
84//! See `benches/ip_benchmark.rs` for details.
85
86use std::io;
87use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
88use std::ops::Range;
89use std::sync::OnceLock;
90
91use regex_automata::dfa::dense::DFA;
92use regex_automata::dfa::Automaton;
93use regex_automata::Input;
94
95mod tag;
96pub use tag::{Tag, Tagged, TextData};
97
/// Whether a validated IP match is IPv4 or IPv6.
///
/// Known at zero cost from the DFA pattern ID — no parsing required.
///
/// The type is `Copy` and `Hash`, so it can be used directly as a key in
/// maps and sets (for example, to keep per-version counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum IpKind {
    /// The match was validated as an IPv4 address.
    V4,
    /// The match was validated as an IPv6 address.
    V6,
}
106
/// A validated IP address match within a haystack.
///
/// Provides zero-copy access to the matched bytes and their position within
/// the original haystack, plus the IP version. Parsing to [`IpAddr`] is
/// available via [`ip()`][IpMatch::ip] but not cached — callers who look up
/// the same IP repeatedly should cache at a higher level.
#[derive(Debug, Clone)]
pub struct IpMatch<'a> {
    /// The matched bytes, borrowed from the haystack. For defanged input
    /// these still contain the `[` / `]` characters.
    bytes: &'a [u8],
    /// Half-open byte range of `bytes` within the haystack.
    range: Range<usize>,
    /// IP version, taken from the validator that accepted the match.
    kind: IpKind,
}
119
120impl<'a> IpMatch<'a> {
121 /// The matched IP address as a byte slice.
122 ///
123 /// Zero-copy: this is a slice directly into the haystack.
124 #[inline]
125 pub fn as_bytes(&self) -> &'a [u8] {
126 self.bytes
127 }
128
129 /// The clean IP address as a string, with any defang brackets removed.
130 ///
131 /// For normal (fanged) input this is a zero-copy borrow (`Cow::Borrowed`).
132 /// For defanged input (e.g. `"192.168.1[.]50"`) this allocates and strips
133 /// brackets, returning `Cow::Owned("192.168.1.50")`.
134 ///
135 /// This is the right default for MMDB lookups, deduplication, output, and
136 /// parsing. For the raw matched text (which may contain brackets), use
137 /// [`as_matched_str`][Self::as_matched_str].
138 pub fn as_str(&self) -> std::borrow::Cow<'a, str> {
139 if memchr::memchr(b'[', self.bytes).is_none() {
140 // SAFETY: IP characters and brackets are all ASCII.
141 std::borrow::Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(self.bytes) })
142 } else {
143 let cleaned = strip_brackets(self.bytes);
144 // SAFETY: strip_brackets retains only IP characters (ASCII).
145 std::borrow::Cow::Owned(unsafe { String::from_utf8_unchecked(cleaned) })
146 }
147 }
148
149 /// The raw matched text as a string slice.
150 ///
151 /// Returns the exact bytes matched in the haystack — for defanged input,
152 /// this may include bracket characters (e.g. `"192.168.1[.]50"`). Use
153 /// [`as_str`][Self::as_str] when you need the canonical IP form.
154 ///
155 /// Zero-copy: this is a slice directly into the haystack. Safe without
156 /// UTF-8 validation because all matched characters (digits, hex, `.`, `:`,
157 /// `[`, `]`) are ASCII.
158 #[inline]
159 pub fn as_matched_str(&self) -> &'a str {
160 // SAFETY: IP characters and brackets are all ASCII.
161 unsafe { std::str::from_utf8_unchecked(self.bytes) }
162 }
163
164 /// The byte range of this match within the original haystack.
165 #[inline]
166 pub fn range(&self) -> Range<usize> {
167 self.range.clone()
168 }
169
170 /// Whether this match is IPv4 or IPv6.
171 #[inline]
172 pub fn kind(&self) -> IpKind {
173 self.kind
174 }
175
176 /// Parse the matched bytes into an [`IpAddr`].
177 ///
178 /// Automatically strips defang brackets before parsing — safe to call on
179 /// both normal and defanged matches. Not cached; callers processing the
180 /// same IP repeatedly should cache at a higher level.
181 ///
182 /// # Panics
183 ///
184 /// Panics if the validated bytes cannot be parsed as an IP address.
185 /// This should not happen in practice because matches are validated by the DFA.
186 pub fn ip(&self) -> IpAddr {
187 let s = self.as_str();
188 match self.kind {
189 IpKind::V4 => IpAddr::V4(s.parse::<Ipv4Addr>().expect("validated by DFA")),
190 IpKind::V6 => IpAddr::V6(s.parse::<Ipv6Addr>().expect("validated by DFA")),
191 }
192 }
193}
194
// Alignment wrapper: guarantees u32 alignment for DFA deserialization.
// DFA::from_bytes() requires the byte slice to be u32-aligned; include_bytes!() only
// guarantees byte alignment. Wrapping in repr(C, align(4)) satisfies this at compile time,
// with zero runtime cost: no allocation, no copy, no Box::leak.
//
// `T: ?Sized` lets the statics below be typed as `&AlignedDfa<[u8]>` (an unsized slice)
// even though each one is created from a fixed-size `[u8; N]` array.
#[repr(C, align(4))]
struct AlignedDfa<T: ?Sized>(T);
201
// Serialized DFA images embedded from `OUT_DIR` at compile time, each wrapped in
// `AlignedDfa` so the bytes satisfy the alignment `DFA::from_bytes` needs.
// Judging by the file names: IPv4-only, IPv6-only, and combined pattern sets.
static IPV4_DFA_BYTES: &AlignedDfa<[u8]> =
    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv4.dfa")));
static IPV6_DFA_BYTES: &AlignedDfa<[u8]> =
    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv6.dfa")));
static BOTH_DFA_BYTES: &AlignedDfa<[u8]> =
    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/both.dfa")));

// Lazily deserialized DFAs. Each cell is filled on first use by the matching
// `get_*_dfa` accessor and then shared for the rest of the program.
static DFA_IPV4: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
static DFA_IPV6: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
static DFA_BOTH: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
212
213fn load_dfa(aligned: &'static AlignedDfa<[u8]>) -> DFA<&'static [u32]> {
214 let (dfa, _) = DFA::from_bytes(&aligned.0).expect("valid dfa from build.rs");
215 dfa
216}
217
218fn get_ipv4_dfa() -> &'static DFA<&'static [u32]> {
219 DFA_IPV4.get_or_init(|| load_dfa(IPV4_DFA_BYTES))
220}
221fn get_ipv6_dfa() -> &'static DFA<&'static [u32]> {
222 DFA_IPV6.get_or_init(|| load_dfa(IPV6_DFA_BYTES))
223}
224fn get_both_dfa() -> &'static DFA<&'static [u32]> {
225 DFA_BOTH.get_or_init(|| load_dfa(BOTH_DFA_BYTES))
226}
227
/// Per-pattern validation settings.
///
/// The extractor keeps one of these per DFA pattern ID and uses it to decide
/// whether a candidate is a real address of that version and whether it
/// passes the configured category filters.
#[derive(Clone, Debug)]
enum ValidatorType {
    /// Validate candidates as IPv4 (see `validate_ipv4`).
    IPv4 {
        // Accept addresses for which `Ipv4Addr::is_private` is true.
        include_private: bool,
        // Accept addresses for which `Ipv4Addr::is_loopback` is true.
        include_loopback: bool,
        // Accept broadcast and IPv4 link-local addresses.
        include_broadcast: bool,
    },
    /// Validate candidates as IPv6 (see `validate_ipv6`).
    IPv6 {
        // Accept unique-local and unicast link-local addresses.
        include_private: bool,
        // Accept the loopback address.
        include_loopback: bool,
    },
}
240
241impl ValidatorType {
242 #[inline(always)]
243 fn validate(&self, bytes: &[u8]) -> bool {
244 match *self {
245 ValidatorType::IPv4 {
246 include_private,
247 include_loopback,
248 include_broadcast,
249 } => validate_ipv4(bytes, include_private, include_loopback, include_broadcast),
250 ValidatorType::IPv6 {
251 include_private,
252 include_loopback,
253 } => validate_ipv6(bytes, include_private, include_loopback),
254 }
255 }
256
257 #[inline(always)]
258 fn kind(&self) -> IpKind {
259 match self {
260 ValidatorType::IPv4 { .. } => IpKind::V4,
261 ValidatorType::IPv6 { .. } => IpKind::V6,
262 }
263 }
264}
265
/// The main IP address extractor.
///
/// An `Extractor` scans byte slices for IPv4 and/or IPv6 addresses, applying configurable
/// filters to include or exclude certain address classes (private, loopback, broadcast).
///
/// Extractors are best created via [`ExtractorBuilder`] and are designed to be reused
/// across many calls to `find_iter` for maximum efficiency.
///
/// # Bytes vs. Strings
///
/// This extractor works directly on byte slices rather than strings. This avoids UTF-8
/// validation overhead and enables zero-copy scanning of very large inputs.
///
/// # Performance
///
/// The extractor uses a compile-time DFA (Deterministic Finite Automaton) for O(n)
/// scanning with minimal overhead. See the crate-level documentation for throughput benchmarks.
pub struct Extractor {
    // Shared, lazily loaded DFA chosen by the builder (IPv4-only, IPv6-only, or both).
    dfa: &'static DFA<&'static [u32]>,
    // Validators indexed by DFA pattern ID; `validators[pid]` handles pattern `pid`.
    validators: [ValidatorType; 2],
}
287
288impl Extractor {
289 /// Find all IP addresses in a byte slice.
290 ///
291 /// Returns an iterator of byte ranges `[start, end)` pointing to each IP
292 /// address found. Ranges are guaranteed to be valid indices into `haystack`.
293 ///
294 /// For richer match information (IP version, direct string access), use
295 /// [`match_iter`][Extractor::match_iter] instead.
296 ///
297 /// # Example
298 ///
299 /// ```no_run
300 /// use ip_extract::ExtractorBuilder;
301 ///
302 /// # fn main() -> anyhow::Result<()> {
303 /// let extractor = ExtractorBuilder::new().build()?;
304 /// let data = b"Connecting from 192.168.1.1";
305 ///
306 /// for range in extractor.find_iter(data) {
307 /// let ip = std::str::from_utf8(&data[range])?;
308 /// println!("Found: {ip}");
309 /// }
310 /// # Ok(())
311 /// # }
312 /// ```
313 #[inline]
314 pub fn find_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = Range<usize>> + 'a {
315 self.match_iter(haystack).map(|m| m.range())
316 }
317
318 /// Find all IP addresses in a byte slice, yielding rich [`IpMatch`] values.
319 ///
320 /// Like [`find_iter`][Extractor::find_iter], but each match carries the
321 /// matched bytes, their position in the haystack, and the IP version —
322 /// eliminating the need to re-parse or guess the version at the call site.
323 ///
324 /// # Example
325 ///
326 /// ```no_run
327 /// use ip_extract::ExtractorBuilder;
328 ///
329 /// # fn main() -> anyhow::Result<()> {
330 /// let extractor = ExtractorBuilder::new().build()?;
331 /// let data = b"Log: 192.168.1.1 sent request to 2001:db8::1";
332 ///
333 /// for m in extractor.match_iter(data) {
334 /// println!("{} ({:?})", m.as_matched_str(), m.kind());
335 /// }
336 /// # Ok(())
337 /// # }
338 /// ```
339 #[inline]
340 pub fn match_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = IpMatch<'a>> + 'a {
341 let mut input = Input::new(haystack);
342
343 std::iter::from_fn(move || loop {
344 let Ok(Some(m)) = self.dfa.try_search_fwd(&input) else {
345 return None;
346 };
347
348 let end = m.offset();
349 let pid = m.pattern().as_usize();
350 let validator = &self.validators[pid];
351
352 input.set_start(end);
353
354 // Bracket-aware boundary scan (defang always-on: [.] and [:] are valid IP chars).
355 let floor = end.saturating_sub(55); // wider for bracket notation:
356 // max defanged IPv6 ≈ 53 chars
357 let raw_start = (floor..end)
358 .rev()
359 .find(|&i| i == 0 || !is_ip_or_bracket_char(haystack[i - 1]))
360 .unwrap_or(floor);
361
362 // A lone `[` at the start of the candidate is a surrounding bracket (e.g. "[3.3.3.3]"),
363 // not a defang bracket. Defang brackets always surround a separator character:
364 // `[.]`, `[:]`, or `[::]`. Skip a leading `[` that is followed by a digit or hex
365 // character (not `.` or `:`), since that pattern is never valid defang notation.
366 let start = if raw_start < end
367 && haystack[raw_start] == b'['
368 && raw_start + 1 < end
369 && haystack[raw_start + 1] != b'.'
370 && haystack[raw_start + 1] != b':'
371 {
372 raw_start + 1
373 } else {
374 raw_start
375 };
376
377 let valid_right_boundary = match end.cmp(&haystack.len()) {
378 std::cmp::Ordering::Less => {
379 let next = haystack[end];
380 match validator {
381 ValidatorType::IPv4 { .. } => {
382 !(next.is_ascii_digit()
383 || next == b'.'
384 && end + 1 < haystack.len()
385 && haystack[end + 1].is_ascii_digit())
386 }
387 ValidatorType::IPv6 { .. } => !is_ip_or_bracket_char(next),
388 }
389 }
390 _ => true,
391 };
392
393 if !valid_right_boundary {
394 continue;
395 }
396
397 let candidate = &haystack[start..end];
398
399 // Strip brackets before validation (handles both fanged and defanged input).
400 // On normal (fanged) input, memchr scans ~7-15 bytes per match and finds
401 // nothing — falling straight to the else branch with no allocation. The
402 // strip_brackets path only runs when brackets are actually present.
403 if memchr::memchr(b'[', candidate).is_some() {
404 let cleaned = strip_brackets(candidate);
405 if validator.validate(&cleaned) {
406 return Some(IpMatch {
407 bytes: candidate,
408 range: start..end,
409 kind: validator.kind(),
410 });
411 }
412 } else if validator.validate(candidate) {
413 return Some(IpMatch {
414 bytes: candidate,
415 range: start..end,
416 kind: validator.kind(),
417 });
418 }
419 })
420 }
421
422 /// Scan `haystack` for IP addresses, writing non-IP text to `wtr` and
423 /// calling `replacer` for each match.
424 ///
425 /// This is the efficient single-pass decoration primitive: the caller
426 /// never needs to track byte offsets or manage gap writes. The replacer
427 /// writes the substitution directly to `wtr` — no intermediate allocation.
428 ///
429 /// Returns the number of IP addresses found.
430 ///
431 /// # Errors
432 ///
433 /// Returns the first `io::Error` from either a gap write or the replacer.
434 ///
435 /// # Example
436 ///
437 /// ```no_run
438 /// use ip_extract::ExtractorBuilder;
439 /// use std::io::Write;
440 ///
441 /// # fn main() -> anyhow::Result<()> {
442 /// let extractor = ExtractorBuilder::new().build()?;
443 /// let data = b"Server 192.168.1.1 is up";
444 /// let mut out = Vec::new();
445 ///
446 /// let count = extractor.replace_iter(data, &mut out, |m, w| {
447 /// write!(w, "[{}]", m.as_matched_str())
448 /// })?;
449 ///
450 /// assert_eq!(count, 1);
451 /// assert_eq!(out, b"Server [192.168.1.1] is up");
452 /// # Ok(())
453 /// # }
454 /// ```
455 pub fn replace_iter<W, F>(
456 &self,
457 haystack: &[u8],
458 wtr: &mut W,
459 mut replacer: F,
460 ) -> io::Result<usize>
461 where
462 W: io::Write,
463 F: FnMut(&IpMatch, &mut W) -> io::Result<()>,
464 {
465 let mut last = 0;
466 let mut count = 0;
467
468 for m in self.match_iter(haystack) {
469 let range = m.range();
470 wtr.write_all(&haystack[last..range.start])?;
471 replacer(&m, wtr)?;
472 last = range.end;
473 count += 1;
474 }
475
476 wtr.write_all(&haystack[last..])?;
477 Ok(count)
478 }
479}
480
/// Returns `true` for bytes that may appear inside a (possibly defanged) IP
/// token: ASCII hex digits, `.`, `:`, and the defang brackets `[` / `]`.
#[inline(always)]
fn is_ip_or_bracket_char(b: u8) -> bool {
    b.is_ascii_hexdigit() || matches!(b, b'.' | b':' | b'[' | b']')
}
486
/// Return a copy of `bytes` with every `[` and `]` removed.
///
/// This turns defanged text such as `192[.]168[.]1[.]1` into `192.168.1.1`
/// so it can be handed to the regular validators. All other bytes are kept
/// in their original order.
fn strip_brackets(bytes: &[u8]) -> Vec<u8> {
    bytes
        .iter()
        .copied()
        .filter(|&b| b != b'[' && b != b']')
        .collect()
}
500
/// A builder for configuring IP extraction behavior.
///
/// Use `ExtractorBuilder` to specify which types of IP addresses should be extracted.
/// By default, it extracts both IPv4 and IPv6 and **includes** private, loopback, and
/// broadcast addresses; use the `ignore_*` methods or `only_public()` to filter them out.
///
/// # Example
///
/// ```no_run
/// use ip_extract::ExtractorBuilder;
///
/// # fn main() -> anyhow::Result<()> {
/// let extractor = ExtractorBuilder::new()
///     .ipv4(true)
///     .ipv6(false)  // Only IPv4
///     .private_ips(true)  // Include private ranges
///     .build()?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ExtractorBuilder {
    // Scan for IPv4 addresses.
    include_ipv4: bool,
    // Scan for IPv6 addresses.
    include_ipv6: bool,
    // Keep private addresses (IPv4 private ranges; IPv6 unique-local and link-local).
    include_private: bool,
    // Keep loopback addresses.
    include_loopback: bool,
    // Keep IPv4 broadcast and IPv4 link-local addresses.
    include_broadcast: bool,
}
528
529impl Default for ExtractorBuilder {
530 fn default() -> Self {
531 Self::new()
532 }
533}
534
535impl ExtractorBuilder {
536 /// Create a new builder with default settings.
537 ///
538 /// By default, **all IP addresses are extracted** (principle of least surprise).
539 /// Use `.only_public()` or `.ignore_*()` methods to filter specific categories.
540 ///
541 /// Defaults:
542 /// - IPv4: enabled
543 /// - IPv6: enabled
544 /// - Private IPs: **enabled** (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fc00::/7)
545 /// - Loopback IPs: **enabled** (127.0.0.0/8, ::1)
546 /// - Broadcast IPs: **enabled** (255.255.255.255, link-local)
547 ///
548 /// # Examples
549 ///
550 /// ```no_run
551 /// use ip_extract::ExtractorBuilder;
552 ///
553 /// # fn main() -> anyhow::Result<()> {
554 /// // Extract all IPs (default)
555 /// let extractor = ExtractorBuilder::new().build()?;
556 ///
557 /// // Extract only public IPs
558 /// let extractor = ExtractorBuilder::new().only_public().build()?;
559 ///
560 /// // Granular control
561 /// let extractor = ExtractorBuilder::new()
562 /// .ignore_private()
563 /// .ignore_loopback()
564 /// .build()?;
565 /// # Ok(())
566 /// # }
567 /// ```
568 #[must_use]
569 pub fn new() -> Self {
570 Self {
571 include_ipv4: true,
572 include_ipv6: true,
573 include_private: true,
574 include_loopback: true,
575 include_broadcast: true,
576 }
577 }
578 /// Enable or disable IPv4 address extraction.
579 ///
580 /// Default: `true`
581 pub fn ipv4(&mut self, include: bool) -> &mut Self {
582 self.include_ipv4 = include;
583 self
584 }
585
586 /// Enable or disable IPv6 address extraction.
587 ///
588 /// Default: `true`
589 pub fn ipv6(&mut self, include: bool) -> &mut Self {
590 self.include_ipv6 = include;
591 self
592 }
593
594 /// Include private IP addresses (RFC 1918 for IPv4, ULA for IPv6).
595 ///
596 /// Private ranges include:
597 /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
598 /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
599 ///
600 /// Default: `true`
601 pub fn private_ips(&mut self, include: bool) -> &mut Self {
602 self.include_private = include;
603 self
604 }
605
606 /// Include loopback addresses.
607 ///
608 /// Loopback ranges:
609 /// - IPv4: 127.0.0.0/8
610 /// - IPv6: ::1
611 ///
612 /// Default: `true`
613 pub fn loopback_ips(&mut self, include: bool) -> &mut Self {
614 self.include_loopback = include;
615 self
616 }
617
618 /// Include broadcast addresses.
619 ///
620 /// Covers:
621 /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
622 /// - IPv6: link-local and other special ranges
623 ///
624 /// Default: `true`
625 pub fn broadcast_ips(&mut self, include: bool) -> &mut Self {
626 self.include_broadcast = include;
627 self
628 }
629
630 /// Ignore private IP addresses (convenience for `.private_ips(false)`).
631 ///
632 /// Excludes:
633 /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
634 /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
635 pub fn ignore_private(&mut self) -> &mut Self {
636 self.include_private = false;
637 self
638 }
639
640 /// Ignore loopback addresses (convenience for `.loopback_ips(false)`).
641 ///
642 /// Excludes:
643 /// - IPv4: 127.0.0.0/8
644 /// - IPv6: ::1
645 pub fn ignore_loopback(&mut self) -> &mut Self {
646 self.include_loopback = false;
647 self
648 }
649
650 /// Ignore broadcast addresses (convenience for `.broadcast_ips(false)`).
651 ///
652 /// Excludes:
653 /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
654 /// - IPv6: link-local and other special ranges
655 pub fn ignore_broadcast(&mut self) -> &mut Self {
656 self.include_broadcast = false;
657 self
658 }
659
660 /// Extract only publicly routable IP addresses.
661 ///
662 /// This is a convenience method equivalent to:
663 /// ```
664 /// # use ip_extract::ExtractorBuilder;
665 /// # let mut builder = ExtractorBuilder::new();
666 /// builder
667 /// .ignore_private()
668 /// .ignore_loopback()
669 /// .ignore_broadcast();
670 /// ```
671 ///
672 /// Excludes:
673 /// - Private: RFC 1918 (IPv4), ULA (IPv6)
674 /// - Loopback: 127.0.0.0/8, ::1
675 /// - Broadcast: 255.255.255.255, link-local ranges
676 ///
677 /// # Example
678 ///
679 /// ```no_run
680 /// use ip_extract::ExtractorBuilder;
681 ///
682 /// # fn main() -> anyhow::Result<()> {
683 /// let extractor = ExtractorBuilder::new()
684 /// .only_public()
685 /// .build()?;
686 /// # Ok(())
687 /// # }
688 /// ```
689 pub fn only_public(&mut self) -> &mut Self {
690 self.include_private = false;
691 self.include_loopback = false;
692 self.include_broadcast = false;
693 self
694 }
695
696 /// Build and return an `Extractor` with the configured settings.
697 ///
698 /// # Errors
699 ///
700 /// Returns an error if no IP version (IPv4 or IPv6) is enabled. At least one
701 /// must be selected.
702 ///
703 /// # Example
704 ///
705 /// ```no_run
706 /// use ip_extract::ExtractorBuilder;
707 ///
708 /// # fn main() -> anyhow::Result<()> {
709 /// let extractor = ExtractorBuilder::new()
710 /// .ipv4(true)
711 /// .ipv6(true)
712 /// .build()?;
713 /// # Ok(())
714 /// # }
715 /// ```
716 pub fn build(&self) -> anyhow::Result<Extractor> {
717 let ipv4 = ValidatorType::IPv4 {
718 include_private: self.include_private,
719 include_loopback: self.include_loopback,
720 include_broadcast: self.include_broadcast,
721 };
722 let ipv6 = ValidatorType::IPv6 {
723 include_private: self.include_private,
724 include_loopback: self.include_loopback,
725 };
726 // Pattern IDs assigned by build_many order: 0 = IPv4, 1 = IPv6.
727 // All DFAs are defang-aware (match both normal and bracket notation).
728 // validators[pid] must stay in sync with build.rs build_many order.
729 let (dfa, validators) = match (self.include_ipv4, self.include_ipv6) {
730 (true, true) => (get_both_dfa(), [ipv4, ipv6]),
731 (true, false) => (get_ipv4_dfa(), [ipv4, ipv6]),
732 // ipv6_only DFA has a single pattern: pid=0 maps to IPv6
733 (false, true) => (get_ipv6_dfa(), [ipv6, ipv4]),
734 _ => anyhow::bail!("No IP address patterns selected"),
735 };
736 Ok(Extractor { dfa, validators })
737 }
738}
739
740/// Validate an IPv4 address from a byte slice, applying filters.
741///
742/// This function uses `parse_ipv4_bytes` for strict validation and then checks
743/// against the provided inclusion filters.
744///
745/// # Arguments
746///
747/// * `bytes` - Candidate byte slice to validate.
748/// * `include_private` - Whether to include RFC 1918 addresses.
749/// * `include_loopback` - Whether to include 127.0.0.0/8 addresses.
750/// * `include_broadcast` - Whether to include broadcast and link-local addresses.
751#[inline]
752fn validate_ipv4(
753 bytes: &[u8],
754 include_private: bool,
755 include_loopback: bool,
756 include_broadcast: bool,
757) -> bool {
758 let Some(ipv4) = parse_ipv4_bytes(bytes) else {
759 return false;
760 };
761
762 if !include_private && ipv4.is_private() {
763 return false;
764 }
765 if !include_loopback && ipv4.is_loopback() {
766 return false;
767 }
768 if !include_broadcast && (ipv4.is_broadcast() || ipv4.is_link_local()) {
769 return false;
770 }
771 true
772}
773
774/// Extract all IPv4 and IPv6 addresses from input, returning them as strings.
775///
776/// This is a convenience function that uses default settings (all IP types included).
777/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
778///
779/// # Errors
780///
781/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
782///
783/// # Example
784///
785/// ```no_run
786/// use ip_extract::extract;
787///
788/// # fn main() -> anyhow::Result<()> {
789/// let ips = extract(b"Server at 192.168.1.1 and 2001:db8::1")?;
790/// assert_eq!(ips, vec!["192.168.1.1", "2001:db8::1"]);
791/// # Ok(())
792/// # }
793/// ```
794pub fn extract(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
795 let extractor = ExtractorBuilder::new().build()?;
796 Ok(extractor
797 .find_iter(haystack)
798 .map(|range| String::from_utf8_lossy(&haystack[range]).to_string())
799 .collect())
800}
801
802/// Extract unique IPv4 and IPv6 addresses from input, returning them as strings.
803///
804/// Maintains order of first observation (not lexicographic order).
805/// This is a convenience function that uses default settings (all IP types included).
806/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
807///
808/// # Errors
809///
810/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
811///
812/// # Example
813///
814/// ```no_run
815/// use ip_extract::extract_unique;
816///
817/// # fn main() -> anyhow::Result<()> {
818/// let ips = extract_unique(b"Server at 192.168.1.1, another at 192.168.1.1")?;
819/// assert_eq!(ips, vec!["192.168.1.1"]);
820/// # Ok(())
821/// # }
822/// ```
823pub fn extract_unique(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
824 use std::collections::HashSet;
825
826 let extractor = ExtractorBuilder::new().build()?;
827 let mut seen = HashSet::new();
828 let mut result = Vec::new();
829
830 for range in extractor.find_iter(haystack) {
831 let ip_str = String::from_utf8_lossy(&haystack[range]).to_string();
832 if seen.insert(ip_str.clone()) {
833 result.push(ip_str);
834 }
835 }
836
837 Ok(result)
838}
839
840/// Extract all IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
841///
842/// This is a convenience function that uses default settings (all IP types included).
843/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
844///
845/// # Errors
846///
847/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
848/// or if an extracted address cannot be parsed (should not happen in practice).
849///
850/// # Example
851///
852/// ```no_run
853/// use ip_extract::extract_parsed;
854///
855/// # fn main() -> anyhow::Result<()> {
856/// let ips = extract_parsed(b"Server at 192.168.1.1 and 2001:db8::1")?;
857/// assert_eq!(ips.len(), 2);
858/// assert!(ips[0].is_ipv4());
859/// assert!(ips[1].is_ipv6());
860/// # Ok(())
861/// # }
862/// ```
863pub fn extract_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
864 let extractor = ExtractorBuilder::new().build()?;
865 extractor
866 .find_iter(haystack)
867 .map(|range| {
868 let s = std::str::from_utf8(&haystack[range])
869 .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
870 s.parse::<IpAddr>()
871 .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))
872 })
873 .collect()
874}
875
876/// Extract unique IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
877///
878/// Maintains order of first observation (not lexicographic order).
879/// This is a convenience function that uses default settings (all IP types included).
880/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
881///
882/// # Errors
883///
884/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
885/// or if an extracted address cannot be parsed (should not happen in practice).
886///
887/// # Example
888///
889/// ```no_run
890/// use ip_extract::extract_unique_parsed;
891///
892/// # fn main() -> anyhow::Result<()> {
893/// let ips = extract_unique_parsed(b"Server at 192.168.1.1, another at 192.168.1.1")?;
894/// assert_eq!(ips.len(), 1);
895/// assert!(ips[0].is_ipv4());
896/// # Ok(())
897/// # }
898/// ```
899pub fn extract_unique_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
900 use std::collections::HashSet;
901
902 let extractor = ExtractorBuilder::new().build()?;
903 let mut seen = HashSet::new();
904 let mut result = Vec::new();
905
906 for range in extractor.find_iter(haystack) {
907 let s = std::str::from_utf8(&haystack[range])
908 .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
909 let addr = s
910 .parse::<IpAddr>()
911 .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))?;
912 if seen.insert(addr) {
913 result.push(addr);
914 }
915 }
916
917 Ok(result)
918}
919
/// Strictly parse a dotted-quad IPv4 address (e.g. `192.168.1.1`) from bytes.
///
/// The input must be exactly four decimal fields separated by single dots.
/// `None` is returned for:
/// - Octet values > 255
/// - Leading zeros (e.g., `192.168.001.1`)
/// - Any other malformed input (wrong field count, empty field, non-digit byte)
///
/// # Example
///
/// ```
/// use ip_extract::parse_ipv4_bytes;
///
/// assert_eq!(parse_ipv4_bytes(b"192.168.1.1"), Some("192.168.1.1".parse().unwrap()));
/// assert_eq!(parse_ipv4_bytes(b"256.1.1.1"), None);  // Out of range
/// assert_eq!(parse_ipv4_bytes(b"192.168.01.1"), None);  // Leading zero
/// ```
#[must_use]
#[inline]
pub fn parse_ipv4_bytes(bytes: &[u8]) -> Option<Ipv4Addr> {
    // Shortest valid form is "0.0.0.0" (7 bytes), longest "255.255.255.255" (15).
    if !(7..=15).contains(&bytes.len()) {
        return None;
    }

    let mut fields = bytes.split(|&b| b == b'.');
    let mut octets = [0u8; 4];

    for slot in &mut octets {
        // Fewer than four fields -> not an address.
        let field = fields.next()?;

        let well_formed = match field {
            // Empty field: leading, trailing, or doubled dot.
            [] => false,
            // More than one character with a leading zero.
            [b'0', _, ..] => false,
            // At most three characters, all decimal digits.
            _ => field.len() <= 3 && field.iter().all(u8::is_ascii_digit),
        };
        if !well_formed {
            return None;
        }

        // At most three digits, so the value fits comfortably in a u16.
        let value = field
            .iter()
            .fold(0u16, |acc, &digit| acc * 10 + u16::from(digit - b'0'));
        // Values above 255 do not fit in an octet.
        *slot = u8::try_from(value).ok()?;
    }

    // A fifth field means there were too many dots.
    if fields.next().is_some() {
        return None;
    }

    Some(Ipv4Addr::from(octets))
}
984
/// Check if an IPv6 address is a Unique Local Address (ULA) per RFC 4193.
/// ULA addresses are in the fc00::/7 range (fc00:: to fdff::).
#[inline]
fn is_unique_local(ip: &Ipv6Addr) -> bool {
    // fc00::/7 fixes the top seven bits, so the first octet is 0xfc or 0xfd.
    matches!(ip.octets()[0], 0xfc | 0xfd)
}

/// Validate an IPv6 address from a byte slice, applying filters.
///
/// The bytes are converted with a checked UTF-8 conversion and parsed as an
/// `Ipv6Addr`; anything that is not valid UTF-8 or not a valid IPv6 address
/// (including a plain IPv4 dotted quad) is rejected. The checked conversion
/// keeps the function sound for any input, not only DFA-filtered candidates.
///
/// # Arguments
///
/// * `bytes` - Candidate byte slice to validate.
/// * `include_private` - Whether to include ULA and link-local addresses.
/// * `include_loopback` - Whether to include the loopback address (`::1`).
#[inline]
fn validate_ipv6(bytes: &[u8], include_private: bool, include_loopback: bool) -> bool {
    // The shortest IPv6 literal is "::" (two bytes).
    if bytes.len() < 2 {
        return false;
    }
    let Ok(text) = std::str::from_utf8(bytes) else {
        return false;
    };
    let Ok(ipv6) = text.parse::<Ipv6Addr>() else {
        return false;
    };

    if !include_private && (ipv6.is_unicast_link_local() || is_unique_local(&ipv6)) {
        return false;
    }
    if !include_loopback && ipv6.is_loopback() {
        return false;
    }
    true
}
1026
1027impl std::fmt::Debug for Extractor {
1028 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1029 f.debug_struct("Extractor")
1030 .field("validators", &self.validators)
1031 .finish()
1032 }
1033}