Skip to main content

ip_extract/
lib.rs

1//! High-performance IP address extraction and tagging engine.
2//!
3//! `ip-extract` provides a blazingly fast, configurable extractor for finding IPv4 and IPv6
4//! addresses in unstructured text. It achieves maximum throughput through:
5//!
6//! - **Compile-time DFA**: IP patterns are converted to dense Forward DFAs during build,
7//!   eliminating runtime regex compilation and heap allocation.
8//! - **Zero-overhead scanning**: The DFA scans at O(n) with no backtracking; validation
9//!   is performed only on candidates.
10//! - **Strict validation**: Deep checks eliminate false positives (e.g., `1.2.3.4.5` is rejected).
11//!
12//! ## Quick Start
13//!
14//! By default, **all IP addresses are extracted**:
15//!
16//! ```no_run
17//! use ip_extract::ExtractorBuilder;
18//!
19//! # fn main() -> anyhow::Result<()> {
20//! // Extract all IPs (default: includes private, loopback, broadcast)
21//! let extractor = ExtractorBuilder::new().build()?;
22//!
23//! let input = b"Connect from 192.168.1.1 to 2001:db8::1";
24//! for range in extractor.find_iter(input) {
25//!     let ip = std::str::from_utf8(&input[range])?;
26//!     println!("Found: {}", ip);
27//! }
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! ## Tagging and Output
33//!
34//! For more structured output (e.g., JSON), use the `Tagged` and `Tag` types:
35//!
36//! ```no_run
37//! use ip_extract::{ExtractorBuilder, Tagged, Tag};
38//!
39//! # fn main() -> anyhow::Result<()> {
40//! let extractor = ExtractorBuilder::new().build()?;
41//! let data = b"Server at 8.8.8.8";
42//! let mut tagged = Tagged::new(data);
43//!
44//! for range in extractor.find_iter(data) {
45//!     let ip = std::str::from_utf8(&data[range.clone()])?;
46//!     let tag = Tag::new(ip, ip).with_range(range);
47//!     tagged = tagged.tag(tag);
48//! }
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ## Configuration
54//!
55//! Use `ExtractorBuilder` to filter specific IP categories:
56//!
57//! ```no_run
58//! use ip_extract::ExtractorBuilder;
59//!
60//! # fn main() -> anyhow::Result<()> {
61//! // Extract only publicly routable IPs
62//! let extractor = ExtractorBuilder::new()
63//!     .only_public()
64//!     .build()?;
65//!
66//! // Or use granular control
67//! let extractor = ExtractorBuilder::new()
68//!     .ipv4(true)            // Extract IPv4 (default: true)
69//!     .ipv6(false)           // Skip IPv6
70//!     .ignore_private()      // Skip RFC 1918 ranges
71//!     .ignore_loopback()     // Skip loopback (127.0.0.1, ::1)
72//!     .build()?;
73//! # Ok(())
74//! # }
75//! ```
76//!
77//! ## Performance
78//!
79//! Typical throughput on modern hardware:
80//! - Dense IPs (mostly IP addresses): **160+ MiB/s**
81//! - Sparse logs (IPs mixed with text): **360+ MiB/s**
82//! - No IPs (pure scanning): **620+ MiB/s**
83//!
84//! See `benches/ip_benchmark.rs` for details.
85
86use std::io;
87use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
88use std::ops::Range;
89use std::sync::OnceLock;
90
91use regex_automata::dfa::dense::DFA;
92use regex_automata::dfa::Automaton;
93use regex_automata::Input;
94
95mod tag;
96pub use tag::{Tag, Tagged, TextData};
97
/// The IP version of a validated match.
///
/// The version is taken from the ID of the DFA pattern that produced the
/// match, so obtaining it never requires parsing the address text.
///
/// The type is `Copy` and hashable, so it can be used directly as a map or
/// set key (for example to count matches per IP version).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum IpKind {
    /// The match is an IPv4 address.
    V4,
    /// The match is an IPv6 address.
    V6,
}
106
107/// A validated IP address match within a haystack.
108///
109/// Provides zero-copy access to the matched bytes and their position within
110/// the original haystack, plus the IP version. Parsing to [`IpAddr`] is
111/// available via [`ip()`][IpMatch::ip] but not cached — callers who look up
112/// the same IP repeatedly should cache at a higher level.
113#[derive(Debug, Clone)]
114pub struct IpMatch<'a> {
115    bytes: &'a [u8],
116    range: Range<usize>,
117    kind: IpKind,
118}
119
120impl<'a> IpMatch<'a> {
121    /// The matched IP address as a byte slice.
122    ///
123    /// Zero-copy: this is a slice directly into the haystack.
124    #[inline]
125    pub fn as_bytes(&self) -> &'a [u8] {
126        self.bytes
127    }
128
129    /// The clean IP address as a string, with any defang brackets removed.
130    ///
131    /// For normal (fanged) input this is a zero-copy borrow (`Cow::Borrowed`).
132    /// For defanged input (e.g. `"192.168.1[.]50"`) this allocates and strips
133    /// brackets, returning `Cow::Owned("192.168.1.50")`.
134    ///
135    /// This is the right default for MMDB lookups, deduplication, output, and
136    /// parsing. For the raw matched text (which may contain brackets), use
137    /// [`as_matched_str`][Self::as_matched_str].
138    pub fn as_str(&self) -> std::borrow::Cow<'a, str> {
139        if memchr::memchr(b'[', self.bytes).is_none() {
140            // SAFETY: IP characters and brackets are all ASCII.
141            std::borrow::Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(self.bytes) })
142        } else {
143            let cleaned = strip_brackets(self.bytes);
144            // SAFETY: strip_brackets retains only IP characters (ASCII).
145            std::borrow::Cow::Owned(unsafe { String::from_utf8_unchecked(cleaned) })
146        }
147    }
148
149    /// The raw matched text as a string slice.
150    ///
151    /// Returns the exact bytes matched in the haystack — for defanged input,
152    /// this may include bracket characters (e.g. `"192.168.1[.]50"`). Use
153    /// [`as_str`][Self::as_str] when you need the canonical IP form.
154    ///
155    /// Zero-copy: this is a slice directly into the haystack. Safe without
156    /// UTF-8 validation because all matched characters (digits, hex, `.`, `:`,
157    /// `[`, `]`) are ASCII.
158    #[inline]
159    pub fn as_matched_str(&self) -> &'a str {
160        // SAFETY: IP characters and brackets are all ASCII.
161        unsafe { std::str::from_utf8_unchecked(self.bytes) }
162    }
163
164    /// The byte range of this match within the original haystack.
165    #[inline]
166    pub fn range(&self) -> Range<usize> {
167        self.range.clone()
168    }
169
170    /// Whether this match is IPv4 or IPv6.
171    #[inline]
172    pub fn kind(&self) -> IpKind {
173        self.kind
174    }
175
176    /// Parse the matched bytes into an [`IpAddr`].
177    ///
178    /// Automatically strips defang brackets before parsing — safe to call on
179    /// both normal and defanged matches. Not cached; callers processing the
180    /// same IP repeatedly should cache at a higher level.
181    ///
182    /// # Panics
183    ///
184    /// Panics if the validated bytes cannot be parsed as an IP address.
185    /// This should not happen in practice because matches are validated by the DFA.
186    pub fn ip(&self) -> IpAddr {
187        let s = self.as_str();
188        match self.kind {
189            IpKind::V4 => IpAddr::V4(s.parse::<Ipv4Addr>().expect("validated by DFA")),
190            IpKind::V6 => IpAddr::V6(s.parse::<Ipv6Addr>().expect("validated by DFA")),
191        }
192    }
193}
194
// Wrapper that forces 4-byte alignment on the data it holds.
//
// `DFA::from_bytes()` needs a u32-aligned byte slice, whereas `include_bytes!()`
// promises nothing beyond byte alignment. Storing the embedded bytes inside a
// `repr(C, align(4))` wrapper settles the alignment at compile time, so loading
// needs no allocation, no copy, and no `Box::leak`.
#[repr(C, align(4))]
struct AlignedDfa<T: ?Sized>(T);
201
202static IPV4_DFA_BYTES: &AlignedDfa<[u8]> =
203    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv4.dfa")));
204static IPV6_DFA_BYTES: &AlignedDfa<[u8]> =
205    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/ipv6.dfa")));
206static BOTH_DFA_BYTES: &AlignedDfa<[u8]> =
207    &AlignedDfa(*include_bytes!(concat!(env!("OUT_DIR"), "/both.dfa")));
208
209static DFA_IPV4: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
210static DFA_IPV6: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
211static DFA_BOTH: OnceLock<DFA<&'static [u32]>> = OnceLock::new();
212
213fn load_dfa(aligned: &'static AlignedDfa<[u8]>) -> DFA<&'static [u32]> {
214    let (dfa, _) = DFA::from_bytes(&aligned.0).expect("valid dfa from build.rs");
215    dfa
216}
217
218fn get_ipv4_dfa() -> &'static DFA<&'static [u32]> {
219    DFA_IPV4.get_or_init(|| load_dfa(IPV4_DFA_BYTES))
220}
221fn get_ipv6_dfa() -> &'static DFA<&'static [u32]> {
222    DFA_IPV6.get_or_init(|| load_dfa(IPV6_DFA_BYTES))
223}
224fn get_both_dfa() -> &'static DFA<&'static [u32]> {
225    DFA_BOTH.get_or_init(|| load_dfa(BOTH_DFA_BYTES))
226}
227
/// Per-version validation settings applied to each DFA candidate.
///
/// Each variant carries the inclusion flags that are forwarded to the
/// matching `validate_ipv4` / `validate_ipv6` function.
#[derive(Clone, Debug)]
enum ValidatorType {
    /// Validate the candidate as IPv4.
    IPv4 {
        // Keep private addresses when true.
        include_private: bool,
        // Keep loopback addresses when true.
        include_loopback: bool,
        // Keep broadcast (and IPv4 link-local) addresses when true.
        include_broadcast: bool,
    },
    /// Validate the candidate as IPv6.
    IPv6 {
        // Keep private addresses when true.
        include_private: bool,
        // Keep loopback addresses when true.
        include_loopback: bool,
    },
}
240
241impl ValidatorType {
242    #[inline(always)]
243    fn validate(&self, bytes: &[u8]) -> bool {
244        match *self {
245            ValidatorType::IPv4 {
246                include_private,
247                include_loopback,
248                include_broadcast,
249            } => validate_ipv4(bytes, include_private, include_loopback, include_broadcast),
250            ValidatorType::IPv6 {
251                include_private,
252                include_loopback,
253            } => validate_ipv6(bytes, include_private, include_loopback),
254        }
255    }
256
257    #[inline(always)]
258    fn kind(&self) -> IpKind {
259        match self {
260            ValidatorType::IPv4 { .. } => IpKind::V4,
261            ValidatorType::IPv6 { .. } => IpKind::V6,
262        }
263    }
264}
265
266/// The main IP address extractor.
267///
268/// An `Extractor` scans byte slices for IPv4 and/or IPv6 addresses, applying configurable
269/// filters to include or exclude certain address classes (private, loopback, broadcast).
270///
271/// Extractors are best created via [`ExtractorBuilder`] and are designed to be reused
272/// across many calls to `find_iter` for maximum efficiency.
273///
274/// # Bytes vs. Strings
275///
276/// This extractor works directly on byte slices rather than strings. This avoids UTF-8
277/// validation overhead and enables zero-copy scanning of very large inputs.
278///
279/// # Performance
280///
281/// The extractor uses a compile-time DFA (Deterministic Finite Automaton) for O(n)
282/// scanning with minimal overhead. See the crate-level documentation for throughput benchmarks.
283pub struct Extractor {
284    dfa: &'static DFA<&'static [u32]>,
285    validators: [ValidatorType; 2],
286}
287
288impl Extractor {
289    /// Find all IP addresses in a byte slice.
290    ///
291    /// Returns an iterator of byte ranges `[start, end)` pointing to each IP
292    /// address found. Ranges are guaranteed to be valid indices into `haystack`.
293    ///
294    /// For richer match information (IP version, direct string access), use
295    /// [`match_iter`][Extractor::match_iter] instead.
296    ///
297    /// # Example
298    ///
299    /// ```no_run
300    /// use ip_extract::ExtractorBuilder;
301    ///
302    /// # fn main() -> anyhow::Result<()> {
303    /// let extractor = ExtractorBuilder::new().build()?;
304    /// let data = b"Connecting from 192.168.1.1";
305    ///
306    /// for range in extractor.find_iter(data) {
307    ///     let ip = std::str::from_utf8(&data[range])?;
308    ///     println!("Found: {ip}");
309    /// }
310    /// # Ok(())
311    /// # }
312    /// ```
313    #[inline]
314    pub fn find_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = Range<usize>> + 'a {
315        self.match_iter(haystack).map(|m| m.range())
316    }
317
318    /// Find all IP addresses in a byte slice, yielding rich [`IpMatch`] values.
319    ///
320    /// Like [`find_iter`][Extractor::find_iter], but each match carries the
321    /// matched bytes, their position in the haystack, and the IP version —
322    /// eliminating the need to re-parse or guess the version at the call site.
323    ///
324    /// # Example
325    ///
326    /// ```no_run
327    /// use ip_extract::ExtractorBuilder;
328    ///
329    /// # fn main() -> anyhow::Result<()> {
330    /// let extractor = ExtractorBuilder::new().build()?;
331    /// let data = b"Log: 192.168.1.1 sent request to 2001:db8::1";
332    ///
333    /// for m in extractor.match_iter(data) {
334    ///     println!("{} ({:?})", m.as_matched_str(), m.kind());
335    /// }
336    /// # Ok(())
337    /// # }
338    /// ```
339    #[inline]
340    pub fn match_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator<Item = IpMatch<'a>> + 'a {
341        let mut input = Input::new(haystack);
342
343        std::iter::from_fn(move || loop {
344            let Ok(Some(m)) = self.dfa.try_search_fwd(&input) else {
345                return None;
346            };
347
348            let end = m.offset();
349            let pid = m.pattern().as_usize();
350            let validator = &self.validators[pid];
351
352            input.set_start(end);
353
354            // Bracket-aware boundary scan (defang always-on: [.] and [:] are valid IP chars).
355            let floor = end.saturating_sub(55); // wider for bracket notation:
356                                                // max defanged IPv6 ≈ 53 chars
357            let raw_start = (floor..end)
358                .rev()
359                .find(|&i| i == 0 || !is_ip_or_bracket_char(haystack[i - 1]))
360                .unwrap_or(floor);
361
362            // A lone `[` at the start of the candidate is a surrounding bracket (e.g. "[3.3.3.3]"),
363            // not a defang bracket. Defang brackets always surround a separator character:
364            // `[.]`, `[:]`, or `[::]`. Skip a leading `[` that is followed by a digit or hex
365            // character (not `.` or `:`), since that pattern is never valid defang notation.
366            //
367            // Additionally, handle the RFC 5321 SMTP `IPv6:` tag prefix. MTAs write IPv6
368            // addresses as `[IPv6:2001:db8::1]`. The lookback stops at `v` (non-hex letter)
369            // leaving the candidate as `6:2001:db8::1`. Detect this via the heuristic: if the
370            // candidate starts with a single hex digit immediately followed by `:`, and the
371            // character immediately before the candidate in the haystack is a non-hex ASCII
372            // letter, skip that `x:` prefix. This specifically targets the `IPv6:` suffix
373            // pattern (`…v6:` → skip `6:`).
374            let start = if raw_start < end
375                && haystack[raw_start] == b'['
376                && raw_start + 1 < end
377                && haystack[raw_start + 1] != b'.'
378                && haystack[raw_start + 1] != b':'
379            {
380                raw_start + 1
381            } else if raw_start > 0 && raw_start + 1 < end && haystack[raw_start + 1] == b':' && {
382                let prev = haystack[raw_start - 1];
383                prev.is_ascii_alphabetic() && !matches!(prev, b'a'..=b'f' | b'A'..=b'F')
384            } {
385                // Skip the single-hex-char + `:` prefix (e.g. `6:` from `IPv6:`).
386                raw_start + 2
387            } else {
388                raw_start
389            };
390
391            let valid_right_boundary = match end.cmp(&haystack.len()) {
392                std::cmp::Ordering::Less => {
393                    let next = haystack[end];
394                    match validator {
395                        ValidatorType::IPv4 { .. } => {
396                            !(next.is_ascii_digit()
397                                || next == b'.'
398                                    && end + 1 < haystack.len()
399                                    && haystack[end + 1].is_ascii_digit())
400                        }
401                        // `]` is allowed as a right boundary for IPv6. In SMTP literal
402                        // notation (RFC 5321) addresses appear as `[2001:db8::1]` or
403                        // `[IPv6:2001:db8::1]`. In defang notation brackets only appear
404                        // in the middle of the address (`[:]`), never at the very end of
405                        // the DFA match, so a trailing `]` is always a closing bracket.
406                        ValidatorType::IPv6 { .. } => {
407                            !matches!(next, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F'
408                                | b'.' | b':' | b'[')
409                        }
410                    }
411                }
412                _ => true,
413            };
414
415            if !valid_right_boundary {
416                continue;
417            }
418
419            let candidate = &haystack[start..end];
420
421            // Strip brackets before validation (handles both fanged and defanged input).
422            // On normal (fanged) input, memchr scans ~7-15 bytes per match and finds
423            // nothing — falling straight to the else branch with no allocation. The
424            // strip_brackets path only runs when brackets are actually present.
425            if memchr::memchr(b'[', candidate).is_some() {
426                let cleaned = strip_brackets(candidate);
427                if validator.validate(&cleaned) {
428                    return Some(IpMatch {
429                        bytes: candidate,
430                        range: start..end,
431                        kind: validator.kind(),
432                    });
433                }
434            } else if validator.validate(candidate) {
435                return Some(IpMatch {
436                    bytes: candidate,
437                    range: start..end,
438                    kind: validator.kind(),
439                });
440            }
441        })
442    }
443
444    /// Scan `haystack` for IP addresses, writing non-IP text to `wtr` and
445    /// calling `replacer` for each match.
446    ///
447    /// This is the efficient single-pass decoration primitive: the caller
448    /// never needs to track byte offsets or manage gap writes. The replacer
449    /// writes the substitution directly to `wtr` — no intermediate allocation.
450    ///
451    /// Returns the number of IP addresses found.
452    ///
453    /// # Errors
454    ///
455    /// Returns the first `io::Error` from either a gap write or the replacer.
456    ///
457    /// # Example
458    ///
459    /// ```no_run
460    /// use ip_extract::ExtractorBuilder;
461    /// use std::io::Write;
462    ///
463    /// # fn main() -> anyhow::Result<()> {
464    /// let extractor = ExtractorBuilder::new().build()?;
465    /// let data = b"Server 192.168.1.1 is up";
466    /// let mut out = Vec::new();
467    ///
468    /// let count = extractor.replace_iter(data, &mut out, |m, w| {
469    ///     write!(w, "[{}]", m.as_matched_str())
470    /// })?;
471    ///
472    /// assert_eq!(count, 1);
473    /// assert_eq!(out, b"Server [192.168.1.1] is up");
474    /// # Ok(())
475    /// # }
476    /// ```
477    pub fn replace_iter<W, F>(
478        &self,
479        haystack: &[u8],
480        wtr: &mut W,
481        mut replacer: F,
482    ) -> io::Result<usize>
483    where
484        W: io::Write,
485        F: FnMut(&IpMatch, &mut W) -> io::Result<()>,
486    {
487        let mut last = 0;
488        let mut count = 0;
489
490        for m in self.match_iter(haystack) {
491            let range = m.range();
492            wtr.write_all(&haystack[last..range.start])?;
493            replacer(&m, wtr)?;
494            last = range.end;
495            count += 1;
496        }
497
498        wtr.write_all(&haystack[last..])?;
499        Ok(count)
500    }
501}
502
/// Returns `true` for every byte that may appear inside a (possibly defanged)
/// IP address: hex digits, `.`, `:`, and the defang brackets `[` and `]`.
///
/// Used when looking for the left boundary of a candidate.
#[inline(always)]
fn is_ip_or_bracket_char(b: u8) -> bool {
    let is_separator_or_bracket = matches!(b, b'.' | b':' | b'[' | b']');
    b.is_ascii_hexdigit() || is_separator_or_bracket
}
508
/// Return a copy of `bytes` with every `[` and `]` removed.
///
/// This normalizes defanged notation (`192[.]168[.]1[.]1` → `192.168.1.1`)
/// so the result can be handed to the standard validator.
fn strip_brackets(bytes: &[u8]) -> Vec<u8> {
    // The output can never be longer than the input, so reserve once up front.
    let mut cleaned = Vec::with_capacity(bytes.len());
    cleaned.extend(
        bytes
            .iter()
            .copied()
            .filter(|&byte| byte != b'[' && byte != b']'),
    );
    cleaned
}
522
/// A builder for configuring IP extraction behavior.
///
/// Use `ExtractorBuilder` to specify which types of IP addresses should be extracted.
/// By default, it extracts both IPv4 and IPv6 and **includes** private, loopback, and
/// broadcast addresses; use `only_public()` or the `ignore_*()` methods to narrow
/// the selection.
///
/// # Example
///
/// ```no_run
/// use ip_extract::ExtractorBuilder;
///
/// # fn main() -> anyhow::Result<()> {
/// let extractor = ExtractorBuilder::new()
///     .ipv4(true)
///     .ipv6(false)  // Only IPv4
///     .private_ips(true)  // Include private ranges
///     .build()?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ExtractorBuilder {
    // Scan for IPv4 addresses.
    include_ipv4: bool,
    // Scan for IPv6 addresses.
    include_ipv6: bool,
    // Report private addresses.
    include_private: bool,
    // Report loopback addresses.
    include_loopback: bool,
    // Report broadcast addresses.
    include_broadcast: bool,
}
550
551impl Default for ExtractorBuilder {
552    fn default() -> Self {
553        Self::new()
554    }
555}
556
557impl ExtractorBuilder {
558    /// Create a new builder with default settings.
559    ///
560    /// By default, **all IP addresses are extracted** (principle of least surprise).
561    /// Use `.only_public()` or `.ignore_*()` methods to filter specific categories.
562    ///
563    /// Defaults:
564    /// - IPv4: enabled
565    /// - IPv6: enabled
566    /// - Private IPs: **enabled** (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fc00::/7)
567    /// - Loopback IPs: **enabled** (127.0.0.0/8, ::1)
568    /// - Broadcast IPs: **enabled** (255.255.255.255, link-local)
569    ///
570    /// # Examples
571    ///
572    /// ```no_run
573    /// use ip_extract::ExtractorBuilder;
574    ///
575    /// # fn main() -> anyhow::Result<()> {
576    /// // Extract all IPs (default)
577    /// let extractor = ExtractorBuilder::new().build()?;
578    ///
579    /// // Extract only public IPs
580    /// let extractor = ExtractorBuilder::new().only_public().build()?;
581    ///
582    /// // Granular control
583    /// let extractor = ExtractorBuilder::new()
584    ///     .ignore_private()
585    ///     .ignore_loopback()
586    ///     .build()?;
587    /// # Ok(())
588    /// # }
589    /// ```
590    #[must_use]
591    pub fn new() -> Self {
592        Self {
593            include_ipv4: true,
594            include_ipv6: true,
595            include_private: true,
596            include_loopback: true,
597            include_broadcast: true,
598        }
599    }
600    /// Enable or disable IPv4 address extraction.
601    ///
602    /// Default: `true`
603    pub fn ipv4(&mut self, include: bool) -> &mut Self {
604        self.include_ipv4 = include;
605        self
606    }
607
608    /// Enable or disable IPv6 address extraction.
609    ///
610    /// Default: `true`
611    pub fn ipv6(&mut self, include: bool) -> &mut Self {
612        self.include_ipv6 = include;
613        self
614    }
615
616    /// Include private IP addresses (RFC 1918 for IPv4, ULA for IPv6).
617    ///
618    /// Private ranges include:
619    /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
620    /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
621    ///
622    /// Default: `true`
623    pub fn private_ips(&mut self, include: bool) -> &mut Self {
624        self.include_private = include;
625        self
626    }
627
628    /// Include loopback addresses.
629    ///
630    /// Loopback ranges:
631    /// - IPv4: 127.0.0.0/8
632    /// - IPv6: ::1
633    ///
634    /// Default: `true`
635    pub fn loopback_ips(&mut self, include: bool) -> &mut Self {
636        self.include_loopback = include;
637        self
638    }
639
640    /// Include broadcast addresses.
641    ///
642    /// Covers:
643    /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
644    /// - IPv6: link-local and other special ranges
645    ///
646    /// Default: `true`
647    pub fn broadcast_ips(&mut self, include: bool) -> &mut Self {
648        self.include_broadcast = include;
649        self
650    }
651
652    /// Ignore private IP addresses (convenience for `.private_ips(false)`).
653    ///
654    /// Excludes:
655    /// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
656    /// - IPv6: fc00::/7 (ULA), fe80::/10 (link-local)
657    pub fn ignore_private(&mut self) -> &mut Self {
658        self.include_private = false;
659        self
660    }
661
662    /// Ignore loopback addresses (convenience for `.loopback_ips(false)`).
663    ///
664    /// Excludes:
665    /// - IPv4: 127.0.0.0/8
666    /// - IPv6: ::1
667    pub fn ignore_loopback(&mut self) -> &mut Self {
668        self.include_loopback = false;
669        self
670    }
671
672    /// Ignore broadcast addresses (convenience for `.broadcast_ips(false)`).
673    ///
674    /// Excludes:
675    /// - IPv4: 255.255.255.255 and link-local (169.254.0.0/16)
676    /// - IPv6: link-local and other special ranges
677    pub fn ignore_broadcast(&mut self) -> &mut Self {
678        self.include_broadcast = false;
679        self
680    }
681
682    /// Extract only publicly routable IP addresses.
683    ///
684    /// This is a convenience method equivalent to:
685    /// ```
686    /// # use ip_extract::ExtractorBuilder;
687    /// # let mut builder = ExtractorBuilder::new();
688    /// builder
689    ///     .ignore_private()
690    ///     .ignore_loopback()
691    ///     .ignore_broadcast();
692    /// ```
693    ///
694    /// Excludes:
695    /// - Private: RFC 1918 (IPv4), ULA (IPv6)
696    /// - Loopback: 127.0.0.0/8, ::1
697    /// - Broadcast: 255.255.255.255, link-local ranges
698    ///
699    /// # Example
700    ///
701    /// ```no_run
702    /// use ip_extract::ExtractorBuilder;
703    ///
704    /// # fn main() -> anyhow::Result<()> {
705    /// let extractor = ExtractorBuilder::new()
706    ///     .only_public()
707    ///     .build()?;
708    /// # Ok(())
709    /// # }
710    /// ```
711    pub fn only_public(&mut self) -> &mut Self {
712        self.include_private = false;
713        self.include_loopback = false;
714        self.include_broadcast = false;
715        self
716    }
717
718    /// Build and return an `Extractor` with the configured settings.
719    ///
720    /// # Errors
721    ///
722    /// Returns an error if no IP version (IPv4 or IPv6) is enabled. At least one
723    /// must be selected.
724    ///
725    /// # Example
726    ///
727    /// ```no_run
728    /// use ip_extract::ExtractorBuilder;
729    ///
730    /// # fn main() -> anyhow::Result<()> {
731    /// let extractor = ExtractorBuilder::new()
732    ///     .ipv4(true)
733    ///     .ipv6(true)
734    ///     .build()?;
735    /// # Ok(())
736    /// # }
737    /// ```
738    pub fn build(&self) -> anyhow::Result<Extractor> {
739        let ipv4 = ValidatorType::IPv4 {
740            include_private: self.include_private,
741            include_loopback: self.include_loopback,
742            include_broadcast: self.include_broadcast,
743        };
744        let ipv6 = ValidatorType::IPv6 {
745            include_private: self.include_private,
746            include_loopback: self.include_loopback,
747        };
748        // Pattern IDs assigned by build_many order: 0 = IPv4, 1 = IPv6.
749        // All DFAs are defang-aware (match both normal and bracket notation).
750        // validators[pid] must stay in sync with build.rs build_many order.
751        let (dfa, validators) = match (self.include_ipv4, self.include_ipv6) {
752            (true, true) => (get_both_dfa(), [ipv4, ipv6]),
753            (true, false) => (get_ipv4_dfa(), [ipv4, ipv6]),
754            // ipv6_only DFA has a single pattern: pid=0 maps to IPv6
755            (false, true) => (get_ipv6_dfa(), [ipv6, ipv4]),
756            _ => anyhow::bail!("No IP address patterns selected"),
757        };
758        Ok(Extractor { dfa, validators })
759    }
760}
761
762/// Validate an IPv4 address from a byte slice, applying filters.
763///
764/// This function uses `parse_ipv4_bytes` for strict validation and then checks
765/// against the provided inclusion filters.
766///
767/// # Arguments
768///
769/// * `bytes` - Candidate byte slice to validate.
770/// * `include_private` - Whether to include RFC 1918 addresses.
771/// * `include_loopback` - Whether to include 127.0.0.0/8 addresses.
772/// * `include_broadcast` - Whether to include broadcast and link-local addresses.
773#[inline]
774fn validate_ipv4(
775    bytes: &[u8],
776    include_private: bool,
777    include_loopback: bool,
778    include_broadcast: bool,
779) -> bool {
780    let Some(ipv4) = parse_ipv4_bytes(bytes) else {
781        return false;
782    };
783
784    if !include_private && ipv4.is_private() {
785        return false;
786    }
787    if !include_loopback && ipv4.is_loopback() {
788        return false;
789    }
790    if !include_broadcast && (ipv4.is_broadcast() || ipv4.is_link_local()) {
791        return false;
792    }
793    true
794}
795
796/// Extract all IPv4 and IPv6 addresses from input, returning them as strings.
797///
798/// This is a convenience function that uses default settings (all IP types included).
799/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
800///
801/// # Errors
802///
803/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
804///
805/// # Example
806///
807/// ```no_run
808/// use ip_extract::extract;
809///
810/// # fn main() -> anyhow::Result<()> {
811/// let ips = extract(b"Server at 192.168.1.1 and 2001:db8::1")?;
812/// assert_eq!(ips, vec!["192.168.1.1", "2001:db8::1"]);
813/// # Ok(())
814/// # }
815/// ```
816pub fn extract(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
817    let extractor = ExtractorBuilder::new().build()?;
818    Ok(extractor
819        .find_iter(haystack)
820        .map(|range| String::from_utf8_lossy(&haystack[range]).to_string())
821        .collect())
822}
823
824/// Extract unique IPv4 and IPv6 addresses from input, returning them as strings.
825///
826/// Maintains order of first observation (not lexicographic order).
827/// This is a convenience function that uses default settings (all IP types included).
828/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
829///
830/// # Errors
831///
832/// Returns an error if the builder fails to initialize (e.g., no IP types selected).
833///
834/// # Example
835///
836/// ```no_run
837/// use ip_extract::extract_unique;
838///
839/// # fn main() -> anyhow::Result<()> {
840/// let ips = extract_unique(b"Server at 192.168.1.1, another at 192.168.1.1")?;
841/// assert_eq!(ips, vec!["192.168.1.1"]);
842/// # Ok(())
843/// # }
844/// ```
845pub fn extract_unique(haystack: &[u8]) -> anyhow::Result<Vec<String>> {
846    use std::collections::HashSet;
847
848    let extractor = ExtractorBuilder::new().build()?;
849    let mut seen = HashSet::new();
850    let mut result = Vec::new();
851
852    for range in extractor.find_iter(haystack) {
853        let ip_str = String::from_utf8_lossy(&haystack[range]).to_string();
854        if seen.insert(ip_str.clone()) {
855            result.push(ip_str);
856        }
857    }
858
859    Ok(result)
860}
861
862/// Extract all IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
863///
864/// This is a convenience function that uses default settings (all IP types included).
865/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
866///
867/// # Errors
868///
869/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
870/// or if an extracted address cannot be parsed (should not happen in practice).
871///
872/// # Example
873///
874/// ```no_run
875/// use ip_extract::extract_parsed;
876///
877/// # fn main() -> anyhow::Result<()> {
878/// let ips = extract_parsed(b"Server at 192.168.1.1 and 2001:db8::1")?;
879/// assert_eq!(ips.len(), 2);
880/// assert!(ips[0].is_ipv4());
881/// assert!(ips[1].is_ipv6());
882/// # Ok(())
883/// # }
884/// ```
885pub fn extract_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
886    let extractor = ExtractorBuilder::new().build()?;
887    extractor
888        .find_iter(haystack)
889        .map(|range| {
890            let s = std::str::from_utf8(&haystack[range])
891                .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
892            s.parse::<IpAddr>()
893                .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))
894        })
895        .collect()
896}
897
898/// Extract unique IPv4 and IPv6 addresses from input, returning them as parsed `IpAddr` objects.
899///
900/// Maintains order of first observation (not lexicographic order).
901/// This is a convenience function that uses default settings (all IP types included).
902/// For more control, use `ExtractorBuilder` and `Extractor::find_iter()`.
903///
904/// # Errors
905///
906/// Returns an error if the builder fails to initialize (e.g., no IP types selected),
907/// or if an extracted address cannot be parsed (should not happen in practice).
908///
909/// # Example
910///
911/// ```no_run
912/// use ip_extract::extract_unique_parsed;
913///
914/// # fn main() -> anyhow::Result<()> {
915/// let ips = extract_unique_parsed(b"Server at 192.168.1.1, another at 192.168.1.1")?;
916/// assert_eq!(ips.len(), 1);
917/// assert!(ips[0].is_ipv4());
918/// # Ok(())
919/// # }
920/// ```
921pub fn extract_unique_parsed(haystack: &[u8]) -> anyhow::Result<Vec<IpAddr>> {
922    use std::collections::HashSet;
923
924    let extractor = ExtractorBuilder::new().build()?;
925    let mut seen = HashSet::new();
926    let mut result = Vec::new();
927
928    for range in extractor.find_iter(haystack) {
929        let s = std::str::from_utf8(&haystack[range])
930            .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in IP: {e}"))?;
931        let addr = s
932            .parse::<IpAddr>()
933            .map_err(|e| anyhow::anyhow!("Failed to parse IP '{s}': {e}"))?;
934        if seen.insert(addr) {
935            result.push(addr);
936        }
937    }
938
939    Ok(result)
940}
941
/// Parse a dotted-quad IPv4 address (e.g., `192.168.1.1`) from raw bytes.
///
/// Validation is strict. The input must consist of exactly four decimal
/// octets separated by single dots, and is rejected (`None`) when:
/// - an octet value exceeds 255,
/// - an octet has a leading zero (e.g., `192.168.001.1`),
/// - an octet is empty, or there are too few / too many octets,
/// - any byte other than an ASCII digit or `.` is present.
///
/// # Example
///
/// ```
/// use ip_extract::parse_ipv4_bytes;
///
/// assert_eq!(parse_ipv4_bytes(b"192.168.1.1"), Some("192.168.1.1".parse().unwrap()));
/// assert_eq!(parse_ipv4_bytes(b"256.1.1.1"), None);  // Out of range
/// assert_eq!(parse_ipv4_bytes(b"192.168.01.1"), None);  // Leading zero
/// ```
#[must_use]
#[inline]
pub fn parse_ipv4_bytes(bytes: &[u8]) -> Option<Ipv4Addr> {
    /// Parse one octet: 1-3 ASCII digits, no leading zero, value <= 255.
    fn octet(digits: &[u8]) -> Option<u8> {
        match digits {
            // Empty component, e.g. the gap in "1..2.3".
            [] => None,
            // A '0' followed by anything is either a leading zero or garbage.
            [b'0', _, ..] => None,
            // Without a leading zero, four or more digits always exceed 255.
            _ if digits.len() > 3 => None,
            _ => {
                let mut value = 0u16;
                for &b in digits {
                    if !b.is_ascii_digit() {
                        return None;
                    }
                    value = value * 10 + u16::from(b - b'0');
                }
                // Fails (-> None) exactly when the value is above 255.
                u8::try_from(value).ok()
            }
        }
    }

    // Shortest form is "0.0.0.0" (7 bytes), longest is "255.255.255.255" (15 bytes).
    if !(7..=15).contains(&bytes.len()) {
        return None;
    }

    let mut parts = bytes.split(|&b| b == b'.');
    let mut octets = [0u8; 4];
    for slot in &mut octets {
        *slot = octet(parts.next()?)?;
    }

    // A fifth component means the input contained more than three dots.
    match parts.next() {
        None => Some(Ipv4Addr::from(octets)),
        Some(_) => None,
    }
}
1006
/// Report whether `ip` is a Unique Local Address (ULA) as defined by RFC 4193.
///
/// ULAs occupy `fc00::/7`, i.e. every address whose first seven bits are
/// `1111 110` (first byte `0xfc` or `0xfd`).
#[inline]
fn is_unique_local(ip: &Ipv6Addr) -> bool {
    // Mask off everything but the top 7 bits of the first 16-bit segment.
    (ip.segments()[0] & 0xfe00) == 0xfc00
}
1013
1014/// Validate an IPv6 address from a byte slice, applying filters.
1015///
1016/// This function performs parsing and category-based filtering. It uses
1017/// `unsafe` `from_utf8_unchecked` for performance, as the candidates are
1018/// already filtered by the DFA for IP-like characters.
1019///
1020/// # Arguments
1021///
1022/// * `bytes` - Candidate byte slice to validate.
1023/// * `include_private` - Whether to include ULA and link-local addresses.
1024/// * `include_loopback` - Whether to include the loopback address (`::1`).
1025#[inline]
1026fn validate_ipv6(bytes: &[u8], include_private: bool, include_loopback: bool) -> bool {
1027    if bytes.len() < 2 {
1028        return false;
1029    }
1030    let s = unsafe { std::str::from_utf8_unchecked(bytes) };
1031    let Ok(ip) = s.parse::<IpAddr>() else {
1032        return false;
1033    };
1034
1035    match ip {
1036        IpAddr::V6(ipv6) => {
1037            if !include_private && (ipv6.is_unicast_link_local() || is_unique_local(&ipv6)) {
1038                return false;
1039            }
1040            if !include_loopback && ipv6.is_loopback() {
1041                return false;
1042            }
1043            true
1044        }
1045        IpAddr::V4(_) => false,
1046    }
1047}
1048
1049impl std::fmt::Debug for Extractor {
1050    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1051        f.debug_struct("Extractor")
1052            .field("validators", &self.validators)
1053            .finish()
1054    }
1055}