Skip to main content

ip_extract/
lib.rs

1//! High-performance IP address extraction and tagging engine.
2//!
3//! `ip-extract` provides a blazingly fast, configurable extractor for finding IPv4 and IPv6
4//! addresses in unstructured text. It achieves maximum throughput through:
5//!
6//! - **Compile-time DFA**: IP patterns are converted to dense Forward DFAs during build,
7//!   eliminating runtime regex compilation and heap allocation.
8//! - **Zero-overhead scanning**: The DFA scans at O(n) with no backtracking; validation
9//!   is performed only on candidates.
10//! - **Strict validation**: Deep checks eliminate false positives (e.g., `1.2.3.4.5` is rejected).
11//!
12//! ## Quick Start
13//!
14//! By default, **all IP addresses are extracted**:
15//!
16//! ```no_run
17//! use ip_extract::ExtractorBuilder;
18//!
19//! # fn main() -> anyhow::Result<()> {
20//! // Extract all IPs (default: includes private, loopback, broadcast)
21//! let extractor = ExtractorBuilder::new().build()?;
22//!
23//! let input = b"Connect from 192.168.1.1 to 2001:db8::1";
24//! for range in extractor.find_iter(input) {
25//!     let ip = std::str::from_utf8(&input[range])?;
26//!     println!("Found: {}", ip);
27//! }
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! ## Tagging and Output
33//!
34//! For more structured output (e.g., JSON), use the `Tagged` and `Tag` types:
35//!
36//! ```no_run
37//! use ip_extract::{ExtractorBuilder, Tagged, Tag};
38//!
39//! # fn main() -> anyhow::Result<()> {
40//! let extractor = ExtractorBuilder::new().build()?;
41//! let data = b"Server at 8.8.8.8";
42//! let mut tagged = Tagged::new(data);
43//!
44//! for range in extractor.find_iter(data) {
45//!     let ip = std::str::from_utf8(&data[range.clone()])?;
46//!     let tag = Tag::new(ip, ip).with_range(range);
47//!     tagged = tagged.tag(tag);
48//! }
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ## Configuration
54//!
55//! Use `ExtractorBuilder` to filter specific IP categories:
56//!
57//! ```no_run
58//! use ip_extract::ExtractorBuilder;
59//!
60//! # fn main() -> anyhow::Result<()> {
61//! // Extract only publicly routable IPs
62//! let extractor = ExtractorBuilder::new()
63//!     .only_public()
64//!     .build()?;
65//!
66//! // Or use granular control
67//! let extractor = ExtractorBuilder::new()
68//!     .ipv4(true)            // Extract IPv4 (default: true)
69//!     .ipv6(false)           // Skip IPv6
70//!     .ignore_private()      // Skip RFC 1918 ranges
71//!     .ignore_loopback()     // Skip loopback (127.0.0.1, ::1)
72//!     .build()?;
73//! # Ok(())
74//! # }
75//! ```
76//!
77//! ## Performance
78//!
79//! Typical throughput on modern hardware:
80//! - Dense IPs (mostly IP addresses): **160+ MiB/s**
81//! - Sparse logs (IPs mixed with text): **360+ MiB/s**
82//! - No IPs (pure scanning): **620+ MiB/s**
83//!
84//! See `benches/ip_benchmark.rs` for details.
85
86use std::io;
87use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
88use std::ops::Range;
89use std::sync::OnceLock;
90
91use regex_automata::dfa::dense::DFA;
92use regex_automata::dfa::Automaton;
93use regex_automata::Input;
94
95mod tag;
96pub use tag::{Tag, Tagged, TextData};
97
/// Whether a validated IP match is IPv4 or IPv6.
///
/// The extractor takes this from the validator selected by the DFA pattern ID, so the
/// address family is known without parsing the matched text.
///
/// The type is `Copy` and hashable, so it can be used directly as a map key or set
/// member (for example, per-family counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum IpKind {
    /// The match is an IPv4 address.
    V4,
    /// The match is an IPv6 address.
    V6,
}
106
107/// A validated IP address match within a haystack.
108///
109/// Provides zero-copy access to the matched bytes and their position within
110/// the original haystack, plus the IP version. Parsing to [`IpAddr`] is
111/// available via [`ip()`][IpMatch::ip] but not cached — callers who look up
112/// the same IP repeatedly should cache at a higher level.
113#[derive(Debug, Clone)]
114pub struct IpMatch<'a> {
115    bytes: &'a [u8],
116    range: Range<usize>,
117    kind: IpKind,
118}
119
120impl<'a> IpMatch<'a> {
121    /// The matched IP address as a byte slice.
122    ///
123    /// Zero-copy: this is a slice directly into the haystack.
124    #[inline]
125    pub fn as_bytes(&self) -> &'a [u8] {
126        self.bytes
127    }
128
129    /// The clean IP address as a string, with any defang brackets removed.
130    ///
131    /// For normal (fanged) input this is a zero-copy borrow (`Cow::Borrowed`).
132    /// For defanged input (e.g. `"192.168.1[.]50"`) this allocates and strips
133    /// brackets, returning `Cow::Owned("192.168.1.50")`.
134    ///
135    /// This is the right default for MMDB lookups, deduplication, output, and
136    /// parsing. For the raw matched text (which may contain brackets), use
137    /// [`as_matched_str`][Self::as_matched_str].
138    pub fn as_str(&self) -> std::borrow::Cow<'a, str> {
139        if memchr::memchr(b'[', self.bytes).is_none() {
140            // SAFETY: IP characters and brackets are all ASCII.
141            std::borrow::Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(self.bytes) })
142        } else {
143            let cleaned = strip_brackets(self.bytes);
144            // SAFETY: strip_brackets retains only IP characters (ASCII).
145            std::borrow::Cow::Owned(unsafe { String::from_utf8_unchecked(cleaned) })
146        }
147    }
148
149    /// The raw matched text as a string slice.
150    ///
151    /// Returns the exact bytes matched in the haystack — for defanged input,
152    /// this may include bracket characters (e.g. `"192.168.1[.]50"`). Use
153    /// [`as_str`][Self::as_str] when you need the canonical IP form.
154    ///
155    /// Zero-copy: this is a slice directly into the haystack. Safe without
156    /// UTF-8 validation because all matched characters (digits, hex, `.`, `:`,
157    /// `[`, `]`) are ASCII.
158    #[inline]
159    pub fn as_matched_str(&self) -> &'a str {
160        // SAFETY: IP characters and brackets are all ASCII.
161        unsafe { std::str::from_utf8_unchecked(self.bytes) }
162    }
163
164    /// The byte range of this match within the original haystack.
165    #[inline]
166    pub fn range(&self) -> Range<usize> {
167        self.range.clone()
168    }
169
170    /// Whether this match is IPv4 or IPv6.
171    #[inline]
172    pub fn kind(&self) -> IpKind {
173        self.kind
174    }
175
176    /// Parse the matched bytes into an [`IpAddr`].
177    ///
178    /// Automatically strips defang brackets before parsing — safe to call on
179    /// both normal and defanged matches. Not cached; callers processing the
180    /// same IP repeatedly should cache at a higher level.
181    ///
182    /// # Panics
183    ///
184    /// Panics if the validated bytes cannot be parsed as an IP address.
185    /// This should not happen in practice because matches are validated by the DFA.
186    pub fn ip(&self) -> IpAddr {
187        let s = self.as_str();
188        match self.kind {
189            IpKind::V4 => IpAddr::V4(s.parse::<Ipv4Addr>().expect("validated by DFA")),
190            IpKind::V6 => IpAddr::V6(s.parse::<Ipv6Addr>().expect("validated by DFA")),
191        }
192    }
193}
194
195// Alignment wrapper: guarantees u32 alignment for DFA deserialization.
196// DFA::from_bytes() requires the byte slice to be u32-aligned; include_bytes!() only
197// guarantees byte alignment. Wrapping in repr(C, align(4)) satisfies this at compile time,
198// with zero runtime cost: no allocation, no copy, no Box::leak.
199#[repr(C, align(4))]
200struct AlignedDfa<T: ?Sized>(T);
201
202static IPV4_DFA_BYTES: &AlignedDfa<[u8]> =
203    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv4.dfa")));
204static IPV6_DFA_BYTES: &AlignedDfa<[u8]> =
205    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv6.dfa")));
206static BOTH_DFA_BYTES: &AlignedDfa<[u8]> =
207    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/both.dfa")));
208
209static DFA_IPV4: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
210static DFA_IPV6: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
211static DFA_BOTH: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
212
213fn load_dfa(aligned: &'static AlignedDfa<[u8]>) -> DFA<&'static [u32]> {
214    let (dfa, _) = DFA::from_bytes(&aligned.0).expect("valid dfa from build.rs");
215    dfa
216}
217
218fn get_ipv4_dfa() -> &'static DFA<&'static [u32]> {
219    DFA_IPV4.get_or_init(|| load_dfa(IPV4_DFA_BYTES))
220}
221fn get_ipv6_dfa() -> &'static DFA<&'static [u32]> {
222    DFA_IPV6.get_or_init(|| load_dfa(IPV6_DFA_BYTES))
223}
224fn get_both_dfa() -> &'static DFA<&'static [u32]> {
225    DFA_BOTH.get_or_init(|| load_dfa(BOTH_DFA_BYTES))
226}
227
228#[derive(Clone, Debug)]
229enum ValidatorType {
230    IPv4 {
231        include_private: bool,
232        include_loopback: bool,
233        include_broadcast: bool,
234    },
235    IPv6 {
236        include_private: bool,
237        include_loopback: bool,
238    },
239}
240
241impl ValidatorType {
242    #[inline(always)]
243    fn validate(&self, bytes: &[u8]) -> bool {
244        match *self {
245            ValidatorType::IPv4 {
246                include_private,
247                include_loopback,
248                include_broadcast,
249            } => validate_ipv4(bytes, include_private, include_loopback, include_broadcast),
250            ValidatorType::IPv6 {
251                include_private,
252                include_loopback,
253            } => validate_ipv6(bytes, include_private, include_loopback),
254        }
255    }
256
257    #[inline(always)]
258    fn kind(&self) -> IpKind {
259        match self {
260            ValidatorType::IPv4 { .. } => IpKind::V4,
261            ValidatorType::IPv6 { .. } => IpKind::V6,
262        }
263    }
264}
265
266/// The main IP address extractor.
267///
268/// An `Extractor` scans byte slices for IPv4 and/or IPv6 addresses, applying configurable
269/// filters to include or exclude certain address classes (private, loopback, broadcast).
270///
271/// Extractors are best created via [`ExtractorBuilder`] and are designed to be reused
272/// across many calls to `find_iter` for maximum efficiency.
273///
274/// # Bytes vs. Strings
275///
276/// This extractor works directly on byte slices rather than strings. This avoids UTF-8
277/// validation overhead and enables zero-copy scanning of very large inputs.
278///
279/// # Performance
280///
281/// The extractor uses a compile-time DFA (Deterministic Finite Automaton) for O(n)
282/// scanning with minimal overhead. See the crate-level documentation for throughput benchmarks.
283pub struct Extractor {
284    dfa: &'static DFA<&'static [u32]>,
285    validators: [ValidatorType; 2],
286}
287
288impl Extractor {
289    /// Find all IP addresses in a byte slice.
290    ///
291    /// Returns an iterator of byte ranges `[start, end)` pointing to each IP
292    /// address found. Ranges are guaranteed to be valid indices into `haystack`.
293    ///
294    /// For richer match information (IP version, direct string access), use
295    /// [`match_iter`][Extractor::match_iter] instead.
296    ///
297    /// # Example
298    ///
299    /// ```no_run
300    /// use ip_extract::ExtractorBuilder;
301    ///
302    /// # fn main() -> anyhow::Result<()> {
303    /// let extractor = ExtractorBuilder::new().build()?;
304    /// let data = b"Connecting from 192.168.1.1";
305    ///
306    /// for range in extractor.find_iter(data) {
307    ///     let ip = std::str::from_utf8(&data[range])?;
308    ///     println!("Found: {ip}");
309    /// }
310    /// # Ok(())
311    /// # }
312    /// ```
313    #[inline]
314    pub fn find_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = Range<usize>> + 'a {
315        self.match_iter(haystack).map(|m| m.range())
316    }
317
318    /// Find all IP addresses in a byte slice, yielding rich [`IpMatch`] values.
319    ///
320    /// Like [`find_iter`][Extractor::find_iter], but each match carries the
321    /// matched bytes, their position in the haystack, and the IP version —
322    /// eliminating the need to re-parse or guess the version at the call site.
323    ///
324    /// # Example
325    ///
326    /// ```no_run
327    /// use ip_extract::ExtractorBuilder;
328    ///
329    /// # fn main() -> anyhow::Result<()> {
330    /// let extractor = ExtractorBuilder::new().build()?;
331    /// let data = b"Log: 192.168.1.1 sent request to 2001:db8::1";
332    ///
333    /// for m in extractor.match_iter(data) {
334    ///     println!("{} ({:?})", m.as_matched_str(), m.kind());
335    /// }
336    /// # Ok(())
337    /// # }
338    /// ```
339    #[inline]
340    pub fn match_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = IpMatch<'a>> + 'a {
341        let mut input = Input::new(haystack);
342
343        std::iter::from_fn(move || loop {
344            let Ok(Some(m)) = self.dfa.try_search_fwd(&input) else {
345                return None;
346            };
347
348            let end = m.offset();
349            let pid = m.pattern().as_usize();
350            let validator = &self.validators[pid];
351
352            input.set_start(end);
353
354            // Bracket-aware boundary scan (defang always-on: [.] and [:] are valid IP chars).
355            let floor = end.saturating_sub(55); // wider for bracket notation:
356                                                // max defanged IPv6 ≈ 53 chars
357            let raw_start = (floor..end)
358                .rev()
359                .find(|&i| i == 0 || !is_ip_or_bracket_char(haystack[i - 1]))
360                .unwrap_or(floor);
361
362            // A lone `[` at the start of the candidate is a surrounding bracket (e.g. "[3.3.3.3]"),
363            // not a defang bracket. Defang brackets always surround a separator character:
364            // `[.]`, `[:]`, or `[::]`. Skip a leading `[` that is followed by a digit or hex
365            // character (not `.` or `:`), since that pattern is never valid defang notation.
366            let start = if raw_start < end
367                && haystack[raw_start] == b'['
368                && raw_start + 1 < end
369                && haystack[raw_start + 1] != b'.'
370                && haystack[raw_start + 1] != b':'
371            {
372                raw_start + 1
373            } else {
374                raw_start
375            };
376
377            let valid_right_boundary = match end.cmp(&haystack.len()) {
378                std::cmp::Ordering::Less => {
379                    let next = haystack[end];
380                    match validator {
381                        ValidatorType::IPv4 { .. } => {
382                            !(next.is_ascii_digit()
383                                || next == b'.'
384                                    && end + 1 < haystack.len()
385                                    && haystack[end + 1].is_ascii_digit())
386                        }
387                        ValidatorType::IPv6 { .. } => !is_ip_or_bracket_char(next),
388                    }
389                }
390                _ => true,
391            };
392
393            if !valid_right_boundary {
394                continue;
395            }
396
397            let candidate = &haystack[start..end];
398
399            // Strip brackets before validation (handles both fanged and defanged input).
400            // On normal (fanged) input, memchr scans ~7-15 bytes per match and finds
401            // nothing — falling straight to the else branch with no allocation. The
402            // strip_brackets path only runs when brackets are actually present.
403            if memchr::memchr(b'[', candidate).is_some() {
404                let cleaned = strip_brackets(candidate);
405                if validator.validate(&cleaned) {
406                    return Some(IpMatch {
407                        bytes: candidate,
408                        range: start..end,
409                        kind: validator.kind(),
410                    });
411                }
412            } else if validator.validate(candidate) {
413                return Some(IpMatch {
414                    bytes: candidate,
415                    range: start..end,
416                    kind: validator.kind(),
417                });
418            }
419        })
420    }
421
422    /// Scan `haystack` for IP addresses, writing non-IP text to `wtr` and
423    /// calling `replacer` for each match.
424    ///
425    /// This is the efficient single-pass decoration primitive: the caller
426    /// never needs to track byte offsets or manage gap writes. The replacer
427    /// writes the substitution directly to `wtr` — no intermediate allocation.
428    ///
429    /// Returns the number of IP addresses found.
430    ///
431    /// # Errors
432    ///
433    /// Returns the first `io::Error` from either a gap write or the replacer.
434    ///
435    /// # Example
436    ///
437    /// ```no_run
438    /// use ip_extract::ExtractorBuilder;
439    /// use std::io::Write;
440    ///
441    /// # fn main() -> anyhow::Result<()> {
442    /// let extractor = ExtractorBuilder::new().build()?;
443    /// let data = b"Server 192.168.1.1 is up";
444    /// let mut out = Vec::new();
445    ///
446    /// let count = extractor.replace_iter(data, &mut out, |m, w| {
447    ///     write!(w, "[{}]", m.as_matched_str())
448    /// })?;
449    ///
450    /// assert_eq!(count, 1);
451    /// assert_eq!(out, b"Server [192.168.1.1] is up");
452    /// # Ok(())
453    /// # }
454    /// ```
455    pub fn replace_iter<W, F>(
456        &self,
457        haystack: &[u8],
458        wtr: &mut W,
459        mut replacer: F,
460    ) -> io::Result<usize>
461    where
462        W: io::Write,
463        F: FnMut(&IpMatch, &mut W) -> io::Result<()>,
464    {
465        let mut last = 0;
466        let mut count = 0;
467
468        for m in self.match_iter(haystack) {
469            let range = m.range();
470            wtr.write_all(&haystack[last..range.start])?;
471            replacer(&m, wtr)?;
472            last = range.end;
473            count += 1;
474        }
475
476        wtr.write_all(&haystack[last..])?;
477        Ok(count)
478    }
479}
480
/// Returns `true` for bytes that may appear inside a (possibly defanged) IP address:
/// hex digits of either case, `.`, `:`, and the defang brackets `[` and `]`.
#[inline(always)]
fn is_ip_or_bracket_char(b: u8) -> bool {
    b.is_ascii_hexdigit() || matches!(b, b'.' | b':' | b'[' | b']')
}
486
/// Copy `bytes` into a new buffer, leaving out every `[` and `]`.
///
/// This turns defanged text such as `192[.]168[.]1[.]1` into `192.168.1.1` so the
/// ordinary validators and parsers can be applied to it.
fn strip_brackets(bytes: &[u8]) -> Vec<u8> {
    // The result can never be longer than the input, so reserve that much up front.
    let mut cleaned = Vec::with_capacity(bytes.len());
    cleaned.extend(
        bytes
            .iter()
            .copied()
            .filter(|byte| !matches!(byte, b'[' | b']')),
    );
    cleaned
}
500
/// A builder for configuring IP extraction behavior.
///
/// Use `ExtractorBuilder` to specify which types of IP addresses should be extracted.
/// By default it extracts both IPv4 and IPv6 and **includes** private, loopback, and
/// broadcast addresses; use the setter methods to narrow that down.
///
/// # Example
///
/// ```no_run
/// use ip_extract::ExtractorBuilder;
///
/// # fn main() -> anyhow::Result<()> {
/// let extractor = ExtractorBuilder::new()
///     .ipv4(true)
///     .ipv6(false)  // Only IPv4
///     .private_ips(true)  // Include private ranges
///     .build()?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ExtractorBuilder {
    /// Scan for IPv4 addresses.
    include_ipv4: bool,
    /// Scan for IPv6 addresses.
    include_ipv6: bool,
    /// Let private addresses through the validators.
    include_private: bool,
    /// Let loopback addresses through the validators.
    include_loopback: bool,
    /// Let broadcast/link-local addresses through the IPv4 validator.
    include_broadcast: bool,
}
528
529impl Default for ExtractorBuilder {
530    fn default() -> Self {
531        Self::new()
532    }
533}
534
535impl ExtractorBuilder {
536    /// Create a new builder with default settings.
537    ///
538    /// By default, **all IP addresses are extracted** (principle of least surprise).
539    /// Use `.only_public()` or `.ignore_*()` methods to filter specific categories.
540    ///
541    /// Defaults:
542    /// - IPv4: enabled
543    /// - IPv6: enabled
544    /// - Private IPs: **enabled** (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fc00::/7)
545    /// - Loopback IPs: **enabled** (127.0.0.0/8, ::1)
546    /// - Broadcast IPs: **enabled** (255.255.255.255, link-local)
547    ///
548    /// # Examples
549    ///
550    /// ```no_run
551    /// use ip_extract::ExtractorBuilder;
552    ///
553    /// # fn main() -> anyhow::Result<()> {
554    /// // Extract all IPs (default)
555    /// let extractor = ExtractorBuilder::new().build()?;
556    ///
557    /// // Extract only public IPs
558    /// let extractor = ExtractorBuilder::new().only_public().build()?;
559    ///
560    /// // Granular control
561    /// let extractor = ExtractorBuilder::new()
562    ///     .ignore_private()
563    ///     .ignore_loopback()
564    ///     .build()?;
565    /// # Ok(())
566    /// # }
567    /// ```
568    #[must_use]
569    pub fn new() -> Self {
570        Self {
571            include_ipv4: true,
572            include_ipv6: true,
573            include_private: true,
574            include_loopback: true,
575            include_broadcast: true,
576        }
577    }
578    /// Enable or disable IPv4 address extraction.
579    ///
580    /// Default: `true`
581    pub fn ipv4(&mut self, include: bool) -> &mut Self {
582        self.include_ipv4 = include;
583        self
584    }
585
586    /// Enable or disable IPv6 address extraction.
587    ///
588    /// Default: `true`
589    pub fn ipv6(&mut self, include: bool) -> &mut Self {
590        self.include_ipv6 = include;
591        self
592    }
593
594    /// Include private IP addresses (RFC 1918 for IPv4, ULA for IPv6).
595    ///
596    /// Private ranges include:
597    /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
598    /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
599    ///
600    /// Default: `true`
601    pub fn private_ips(&mut self, include: bool) -> &mut Self {
602        self.include_private = include;
603        self
604    }
605
606    /// Include loopback addresses.
607    ///
608    /// Loopback ranges:
609    /// - IPv4: 127.0.0.0/8
610    /// - IPv6: ::1
611    ///
612    /// Default: `true`
613    pub fn loopback_ips(&mut self, include: bool) -> &mut Self {
614        self.include_loopback = include;
615        self
616    }
617
618    /// Include broadcast addresses.
619    ///
620    /// Covers:
621    /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
622    /// - IPv6: link-local and other special ranges
623    ///
624    /// Default: `true`
625    pub fn broadcast_ips(&mut self, include: bool) -> &mut Self {
626        self.include_broadcast = include;
627        self
628    }
629
630    /// Ignore private IP addresses (convenience for `.private_ips(false)`).
631    ///
632    /// Excludes:
633    /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
634    /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
635    pub fn ignore_private(&mut self) -> &mut Self {
636        self.include_private = false;
637        self
638    }
639
640    /// Ignore loopback addresses (convenience for `.loopback_ips(false)`).
641    ///
642    /// Excludes:
643    /// - IPv4: 127.0.0.0/8
644    /// - IPv6: ::1
645    pub fn ignore_loopback(&mut self) -> &mut Self {
646        self.include_loopback = false;
647        self
648    }
649
650    /// Ignore broadcast addresses (convenience for `.broadcast_ips(false)`).
651    ///
652    /// Excludes:
653    /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
654    /// - IPv6: link-local and other special ranges
655    pub fn ignore_broadcast(&mut self) -> &mut Self {
656        self.include_broadcast = false;
657        self
658    }
659
660    /// Extract only publicly routable IP addresses.
661    ///
662    /// This is a convenience method equivalent to:
663    /// ```
664    /// # use ip_extract::ExtractorBuilder;
665    /// # let mut builder = ExtractorBuilder::new();
666    /// builder
667    ///     .ignore_private()
668    ///     .ignore_loopback()
669    ///     .ignore_broadcast();
670    /// ```
671    ///
672    /// Excludes:
673    /// - Private: RFC 1918 (IPv4), ULA (IPv6)
674    /// - Loopback: 127.0.0.0/8, ::1
675    /// - Broadcast: 255.255.255.255, link-local ranges
676    ///
677    /// # Example
678    ///
679    /// ```no_run
680    /// use ip_extract::ExtractorBuilder;
681    ///
682    /// # fn main() -> anyhow::Result<()> {
683    /// let extractor = ExtractorBuilder::new()
684    ///     .only_public()
685    ///     .build()?;
686    /// # Ok(())
687    /// # }
688    /// ```
689    pub fn only_public(&mut self) -> &mut Self {
690        self.include_private = false;
691        self.include_loopback = false;
692        self.include_broadcast = false;
693        self
694    }
695
696    /// Build and return an `Extractor` with the configured settings.
697    ///
698    /// # Errors
699    ///
700    /// Returns an error if no IP version (IPv4 or IPv6) is enabled. At least one
701    /// must be selected.
702    ///
703    /// # Example
704    ///
705    /// ```no_run
706    /// use ip_extract::ExtractorBuilder;
707    ///
708    /// # fn main() -> anyhow::Result<()> {
709    /// let extractor = ExtractorBuilder::new()
710    ///     .ipv4(true)
711    ///     .ipv6(true)
712    ///     .build()?;
713    /// # Ok(())
714    /// # }
715    /// ```
716    pub fn build(&self) -> anyhow::Result<Extractor> {
717        let ipv4 = ValidatorType::IPv4 {
718            include_private: self.include_private,
719            include_loopback: self.include_loopback,
720            include_broadcast: self.include_broadcast,
721        };
722        let ipv6 = ValidatorType::IPv6 {
723            include_private: self.include_private,
724            include_loopback: self.include_loopback,
725        };
726        // Pattern IDs assigned by build_many order: 0 = IPv4, 1 = IPv6.
727        // All DFAs are defang-aware (match both normal and bracket notation).
728        // validators[pid] must stay in sync with build.rs build_many order.
729        let (dfa, validators) = match (self.include_ipv4, self.include_ipv6) {
730            (true, true) => (get_both_dfa(), [ipv4, ipv6]),
731            (true, false) => (get_ipv4_dfa(), [ipv4, ipv6]),
732            // ipv6_only DFA has a single pattern: pid=0 maps to IPv6
733            (false, true) => (get_ipv6_dfa(), [ipv6, ipv4]),
734            _ => anyhow::bail!("No IP address patterns selected"),
735        };
736        Ok(Extractor { dfa, validators })
737    }
738}
739
740/// Validate an IPv4 address from a byte slice, applying filters.
741///
742/// This function uses `parse_ipv4_bytes` for strict validation and then checks
743/// against the provided inclusion filters.
744///
745/// # Arguments
746///
747/// * `bytes` - Candidate byte slice to validate.
748/// * `include_private` - Whether to include RFC 1918 addresses.
749/// * `include_loopback` - Whether to include 127.0.0.0/8 addresses.
750/// * `include_broadcast` - Whether to include broadcast and link-local addresses.
751#[inline]
752fn validate_ipv4(
753    bytes: &[u8],
754    include_private: bool,
755    include_loopback: bool,
756    include_broadcast: bool,
757) -> bool {
758    let Some(ipv4) = parse_ipv4_bytes(bytes) else {
759        return false;
760    };
761
762    if !include_private && ipv4.is_private() {
763        return false;
764    }
765    if !include_loopback && ipv4.is_loopback() {
766        return false;
767    }
768    if !include_broadcast && (ipv4.is_broadcast() || ipv4.is_link_local()) {
769        return false;
770    }
771    true
772}
773
774/// Extract all IPv4 and IPv6 addresses from input, returning them as strings.
775///
776/// This is a convenience function that uses default settings (all IP types included).
777/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
778///
779/// # Errors
780///
781/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
782///
783/// # Example
784///
785/// ```no_run
786/// use ip_extract::extract;
787///
788/// # fn main() -> anyhow::Result<()> {
789/// let ips = extract(b"Server at 192.168.1.1 and 2001:db8::1")?;
790/// assert_eq!(ips, vec!["192.168.1.1", "2001:db8::1"]);
791/// # Ok(())
792/// # }
793/// ```
794pub fn extract(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
795    let extractor = ExtractorBuilder::new().build()?;
796    Ok(extractor
797        .find_iter(haystack)
798        .map(|range| String::from_utf8_lossy(&haystack[range]).to_string())
799        .collect())
800}
801
802/// Extract unique IPv4 and IPv6 addresses from input, returning them as strings.
803///
804/// Maintains order of first observation (not lexicographic order).
805/// This is a convenience function that uses default settings (all IP types included).
806/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
807///
808/// # Errors
809///
810/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
811///
812/// # Example
813///
814/// ```no_run
815/// use ip_extract::extract_unique;
816///
817/// # fn main() -> anyhow::Result<()> {
818/// let ips = extract_unique(b"Server at 192.168.1.1, another at 192.168.1.1")?;
819/// assert_eq!(ips, vec!["192.168.1.1"]);
820/// # Ok(())
821/// # }
822/// ```
823pub fn extract_unique(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
824    use std::collections::HashSet;
825
826    let extractor = ExtractorBuilder::new().build()?;
827    let mut seen = HashSet::new();
828    let mut result = Vec::new();
829
830    for range in extractor.find_iter(haystack) {
831        let ip_str = String::from_utf8_lossy(&haystack[range]).to_string();
832        if seen.insert(ip_str.clone()) {
833            result.push(ip_str);
834        }
835    }
836
837    Ok(result)
838}
839
840/// Extract all IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
841///
842/// This is a convenience function that uses default settings (all IP types included).
843/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
844///
845/// # Errors
846///
847/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
848/// or if an extracted address cannot be parsed (should not happen in practice).
849///
850/// # Example
851///
852/// ```no_run
853/// use ip_extract::extract_parsed;
854///
855/// # fn main() -> anyhow::Result<()> {
856/// let ips = extract_parsed(b"Server at 192.168.1.1 and 2001:db8::1")?;
857/// assert_eq!(ips.len(), 2);
858/// assert!(ips[0].is_ipv4());
859/// assert!(ips[1].is_ipv6());
860/// # Ok(())
861/// # }
862/// ```
863pub fn extract_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
864    let extractor = ExtractorBuilder::new().build()?;
865    extractor
866        .find_iter(haystack)
867        .map(|range| {
868            let s = std::str::from_utf8(&haystack[range])
869                .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
870            s.parse::<IpAddr>()
871                .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))
872        })
873        .collect()
874}
875
876/// Extract unique IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
877///
878/// Maintains order of first observation (not lexicographic order).
879/// This is a convenience function that uses default settings (all IP types included).
880/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
881///
882/// # Errors
883///
884/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
885/// or if an extracted address cannot be parsed (should not happen in practice).
886///
887/// # Example
888///
889/// ```no_run
890/// use ip_extract::extract_unique_parsed;
891///
892/// # fn main() -> anyhow::Result<()> {
893/// let ips = extract_unique_parsed(b"Server at 192.168.1.1, another at 192.168.1.1")?;
894/// assert_eq!(ips.len(), 1);
895/// assert!(ips[0].is_ipv4());
896/// # Ok(())
897/// # }
898/// ```
899pub fn extract_unique_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
900    use std::collections::HashSet;
901
902    let extractor = ExtractorBuilder::new().build()?;
903    let mut seen = HashSet::new();
904    let mut result = Vec::new();
905
906    for range in extractor.find_iter(haystack) {
907        let s = std::str::from_utf8(&haystack[range])
908            .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
909        let addr = s
910            .parse::<IpAddr>()
911            .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))?;
912        if seen.insert(addr) {
913            result.push(addr);
914        }
915    }
916
917    Ok(result)
918}
919
/// Strictly parse a dotted-quad IPv4 address (e.g., `192.168.1.1`) from raw bytes.
///
/// The whole slice must be exactly four decimal octets separated by single `.`
/// characters. `None` is returned for:
/// - inputs shorter than 7 or longer than 15 bytes
/// - a wrong number of octets, or an empty octet
/// - octet values > 255
/// - leading zeros (e.g., `192.168.001.1`)
/// - any byte other than an ASCII digit or `.`
///
/// # Example
///
/// ```
/// use ip_extract::parse_ipv4_bytes;
///
/// assert_eq!(parse_ipv4_bytes(b"192.168.1.1"), Some("192.168.1.1".parse().unwrap()));
/// assert_eq!(parse_ipv4_bytes(b"256.1.1.1"), None);  // Out of range
/// assert_eq!(parse_ipv4_bytes(b"192.168.01.1"), None);  // Leading zero
/// ```
#[must_use]
#[inline]
pub fn parse_ipv4_bytes(bytes: &[u8]) -> Option<Ipv4Addr> {
    /// Decode a single octet field: ASCII digits only, no leading zero, value <= 255.
    fn octet(digits: &[u8]) -> Option<u8> {
        match digits {
            // An empty field, or a multi-digit field that starts with `0`.
            [] | [b'0', _, ..] => None,
            // Checked `u8` arithmetic overflows exactly when the value exceeds 255.
            _ => digits.iter().try_fold(0u8, |acc, &d| {
                if d.is_ascii_digit() {
                    acc.checked_mul(10)?.checked_add(d - b'0')
                } else {
                    None
                }
            }),
        }
    }

    // "0.0.0.0" (7 bytes) and "255.255.255.255" (15 bytes) bound every valid form.
    if !(7..=15).contains(&bytes.len()) {
        return None;
    }

    let mut fields = bytes.split(|&b| b == b'.');
    let mut octets = [0u8; 4];
    for slot in &mut octets {
        *slot = octet(fields.next()?)?;
    }
    // A fifth field means the input contained more than three separators.
    if fields.next().is_some() {
        return None;
    }
    Some(Ipv4Addr::from(octets))
}
984
/// Report whether `ip` falls inside the Unique Local Address block `fc00::/7` (RFC 4193).
///
/// Membership depends only on the top seven bits of the address, i.e. the first
/// octet is either `0xfc` or `0xfd`.
#[inline]
fn is_unique_local(ip: &Ipv6Addr) -> bool {
    let [first, ..] = ip.octets();
    // Masking off the lowest bit folds 0xfd onto 0xfc.
    (first & 0xfe) == 0xfc
}
991
992/// Validate an IPv6 address from a byte slice, applying filters.
993///
994/// This function performs parsing and category-based filtering. It uses
995/// `unsafe` `from_utf8_unchecked` for performance, as the candidates are
996/// already filtered by the DFA for IP-like characters.
997///
998/// # Arguments
999///
1000/// * `bytes` - Candidate byte slice to validate.
1001/// * `include_private` - Whether to include ULA and link-local addresses.
1002/// * `include_loopback` - Whether to include the loopback address (`::1`).
1003#[inline]
1004fn validate_ipv6(bytes: &[u8], include_private: bool, include_loopback: bool) -> bool {
1005    if bytes.len() < 2 {
1006        return false;
1007    }
1008    let s = unsafe { std::str::from_utf8_unchecked(bytes) };
1009    let Ok(ip) = s.parse::<IpAddr>() else {
1010        return false;
1011    };
1012
1013    match ip {
1014        IpAddr::V6(ipv6) => {
1015            if !include_private && (ipv6.is_unicast_link_local() || is_unique_local(&ipv6)) {
1016                return false;
1017            }
1018            if !include_loopback && ipv6.is_loopback() {
1019                return false;
1020            }
1021            true
1022        }
1023        IpAddr::V4(_) => false,
1024    }
1025}
1026
1027impl std::fmt::Debug for Extractor {
1028    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1029        f.debug_struct("Extractor")
1030            .field("validators", &self.validators)
1031            .finish()
1032    }
1033}