wordpress_audit/
scanner.rs

1//! WordPress website scanner
2//!
3//! Detects WordPress version, plugins, and themes by analyzing the website.
4
5use crate::error::{Error, Result};
6use regex::Regex;
7use reqwest::Client;
8use scraper::{Html, Selector};
9use serde::Deserialize;
10use std::collections::HashSet;
11use std::net::{IpAddr, ToSocketAddrs};
12use std::time::Duration;
13use url::Url;
14
/// User agent sent with every request made by the scanner's HTTP client
/// (a standard Chrome-on-Windows string)
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/// Request timeout in seconds, applied to the HTTP client when it is built
const TIMEOUT_SECS: u64 = 30;

/// WordPress.org API base URL, used for the "latest version" lookups
/// (core, plugins and themes)
const WP_API_BASE: &str = "https://api.wordpress.org";

/// WordPress detection paths, each joined onto the scanned site's base URL
// REST API root, probed to confirm that the site runs WordPress
const WP_JSON_PATH: &str = "/wp-json/";
// RSS feed, searched for a `wordpress.org/?v=X.Y.Z` generator string
const WP_FEED_PATH: &str = "/feed/";
// Bundled readme page, searched for a "Version X.Y.Z" string
const WP_README_PATH: &str = "/readme.html";

/// WordPress cookie prefixes: a cookie whose name starts with one of these
/// counts as evidence of WordPress
const WP_COOKIE_PREFIXES: &[&str] = &["wordpress_", "wp-"];
// Exact cookie name that also counts as evidence of WordPress
const WP_LANG_COOKIE: &str = "wp_lang";

/// Directory names under the plugins folder that are ignored when
/// collecting plugin slugs from the page
const SKIP_PLUGIN_SLUGS: &[&str] = &["index", "cache"];

/// Allowed URL schemes; any other scheme is rejected when a scanner is built
const ALLOWED_SCHEMES: &[&str] = &["http", "https"];
38
/// Scan results from analyzing a WordPress site
///
/// Returned by [`Scanner::scan`]. Every detection is best-effort, so each
/// optional field is `None` when the corresponding probe found nothing.
#[derive(Debug, Clone)]
pub struct ScanResult {
    /// Target URL (a copy of the scanner's normalized base URL)
    pub url: Url,
    /// Whether WordPress was detected (even without version): true when a
    /// version was found, or when the REST API or cookie probe succeeded
    pub wordpress_detected: bool,
    /// WordPress version if detected
    pub wordpress_version: Option<String>,
    /// Latest WordPress version as reported by the WordPress.org API,
    /// if that lookup succeeded
    pub wordpress_latest: Option<String>,
    /// Main theme if detected
    pub theme: Option<ThemeInfo>,
    /// Detected plugins (empty when none were found on the homepage)
    pub plugins: Vec<PluginInfo>,
}
55
/// Theme information
///
/// Describes the theme whose directory name was found in a
/// `/wp-content/themes/<slug>/` URL on the scanned page.
#[derive(Debug, Clone)]
pub struct ThemeInfo {
    /// Theme slug (the directory name under `/wp-content/themes/`)
    pub slug: String,
    /// Theme version if detected (taken from a `ver=` value in the asset URL;
    /// may be a `(timestamp:…)` or `(hash:…)` marker instead of a real version)
    pub version: Option<String>,
    /// Latest version from WordPress.org, `None` if the lookup failed
    pub latest_version: Option<String>,
}
66
/// Plugin information
///
/// Describes one plugin whose directory name was found in a
/// `/wp-content/plugins/<slug>/` or `/wp-content/mu-plugins/<slug>/` URL.
#[derive(Debug, Clone)]
pub struct PluginInfo {
    /// Plugin slug (the directory name under the plugins folder)
    pub slug: String,
    /// Plugin version if detected (taken from a `ver=` value in an asset URL;
    /// may be a `(timestamp:…)` or `(hash:…)` marker instead of a real version)
    pub version: Option<String>,
    /// Latest version from WordPress.org, `None` if the lookup failed
    pub latest_version: Option<String>,
}
77
/// WordPress.org plugin API response
///
/// Only the field the scanner needs is declared.
#[derive(Debug, Deserialize)]
struct PluginApiResponse {
    /// Version reported by the API; optional so that a response without
    /// a `version` key still deserializes
    version: Option<String>,
}
83
/// WordPress.org theme API response
///
/// Only the field the scanner needs is declared.
#[derive(Debug, Deserialize)]
struct ThemeApiResponse {
    /// Version reported by the API; optional so that a response without
    /// a `version` key still deserializes
    version: Option<String>,
}
89
/// WordPress version check API response
#[derive(Debug, Deserialize)]
struct WpVersionResponse {
    /// Version offers; the scanner treats the first entry as the latest release
    offers: Vec<WpVersionOffer>,
}
95
/// A single entry of the `offers` list in the version check API response
#[derive(Debug, Deserialize)]
struct WpVersionOffer {
    /// Version string carried by this offer
    version: String,
}
100
/// WordPress REST API root response
///
/// All fields are optional; the scanner only checks which of them are present
/// to decide whether the endpoint looks like WordPress.
#[derive(Debug, Deserialize)]
struct WpJsonResponse {
    /// Site name
    name: Option<String>,
    /// Site URL
    url: Option<String>,
    /// Available namespaces (e.g., ["wp/v2", "oembed/1.0"]); an entry starting
    /// with "wp/" is treated as proof of WordPress
    namespaces: Option<Vec<String>>,
}
111
/// WordPress scanner
///
/// Holds the HTTP client and the normalized target URL. Create one with
/// [`Scanner::new`] or [`Scanner::builder`], then call [`Scanner::scan`].
#[derive(Debug)]
pub struct Scanner {
    // HTTP client used for every request (site probes and WordPress.org lookups)
    client: Client,
    // Parsed target URL; detection paths are joined onto it
    base_url: Url,
}
118
/// Builder for configuring a Scanner with options
///
/// Obtained from [`Scanner::builder`] or [`ScannerBuilder::new`].
#[derive(Debug)]
pub struct ScannerBuilder {
    // Raw URL or domain as given by the caller; parsed only in `build`
    url: String,
    // When true, the internal/private host check is skipped
    allow_private: bool,
}
125
126impl ScannerBuilder {
127    /// Create a new builder for the given URL or domain
128    pub fn new(url: &str) -> Self {
129        Self {
130            url: url.to_string(),
131            allow_private: false,
132        }
133    }
134
135    /// Allow scanning private/internal IP addresses (localhost, 192.168.x.x, etc.)
136    ///
137    /// By default, SSRF protection blocks requests to internal networks.
138    /// Enable this to scan local WordPress installations.
139    pub fn allow_private(mut self, allow: bool) -> Self {
140        self.allow_private = allow;
141        self
142    }
143
144    /// Build the Scanner with the configured options
145    pub fn build(self) -> Result<Scanner> {
146        Scanner::build_internal(&self.url, self.allow_private)
147    }
148}
149
150impl Scanner {
151    /// Create a new scanner for the given URL or domain
152    ///
153    /// Uses default settings with SSRF protection enabled.
154    /// For more options, use [`Scanner::builder()`].
155    pub fn new(url: &str) -> Result<Self> {
156        Self::build_internal(url, false)
157    }
158
159    /// Create a builder for configuring scanner options
160    ///
161    /// # Example
162    ///
163    /// ```no_run
164    /// use wordpress_audit::Scanner;
165    ///
166    /// let scanner = Scanner::builder("localhost:8080")
167    ///     .allow_private(true)
168    ///     .build()?;
169    /// # Ok::<(), wordpress_audit::Error>(())
170    /// ```
171    pub fn builder(url: &str) -> ScannerBuilder {
172        ScannerBuilder::new(url)
173    }
174
175    /// Internal builder function
176    fn build_internal(url: &str, allow_private: bool) -> Result<Self> {
177        // Auto-add https:// if no scheme provided
178        let url_with_scheme = if !url.contains("://") {
179            format!("https://{}", url)
180        } else {
181            url.to_string()
182        };
183
184        let base_url =
185            Url::parse(&url_with_scheme).map_err(|e| Error::InvalidUrl(e.to_string()))?;
186
187        // Validate URL scheme (SSRF protection)
188        if !ALLOWED_SCHEMES.contains(&base_url.scheme()) {
189            return Err(Error::InvalidUrl(format!(
190                "scheme '{}' not allowed (use http or https)",
191                base_url.scheme()
192            )));
193        }
194
195        // Validate host is not internal/private (SSRF protection)
196        if !allow_private {
197            Self::validate_host(&base_url)?;
198        }
199
200        let client = Client::builder()
201            .user_agent(USER_AGENT)
202            .timeout(Duration::from_secs(TIMEOUT_SECS))
203            .danger_accept_invalid_certs(false)
204            .build()
205            .map_err(|e| Error::HttpClient(e.to_string()))?;
206
207        Ok(Self { client, base_url })
208    }
209
210    /// Validate that the host is not an internal/private address (SSRF protection)
211    fn validate_host(url: &Url) -> Result<()> {
212        let host = url
213            .host_str()
214            .ok_or_else(|| Error::InvalidUrl("missing host".to_string()))?;
215
216        // Block localhost variants
217        if host == "localhost" || host.ends_with(".localhost") {
218            return Err(Error::InvalidUrl("localhost not allowed".to_string()));
219        }
220
221        // Resolve hostname to IP and check if it's internal
222        let port = url
223            .port()
224            .unwrap_or(if url.scheme() == "https" { 443 } else { 80 });
225        let socket_addr = format!("{}:{}", host, port);
226
227        if let Ok(addrs) = socket_addr.to_socket_addrs() {
228            for addr in addrs {
229                if Self::is_internal_ip(addr.ip()) {
230                    return Err(Error::InvalidUrl(format!(
231                        "internal/private IP address not allowed: {}",
232                        addr.ip()
233                    )));
234                }
235            }
236        }
237
238        Ok(())
239    }
240
241    /// Check if an IP address is internal/private (RFC 1918, link-local, loopback, etc.)
242    fn is_internal_ip(ip: IpAddr) -> bool {
243        match ip {
244            IpAddr::V4(ipv4) => {
245                ipv4.is_loopback()                      // 127.0.0.0/8
246                    || ipv4.is_private()                // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
247                    || ipv4.is_link_local()             // 169.254.0.0/16
248                    || ipv4.is_broadcast()              // 255.255.255.255
249                    || ipv4.is_unspecified()            // 0.0.0.0
250                    || ipv4.octets()[0] == 100          // Shared address space 100.64.0.0/10
251                        && ipv4.octets()[1] >= 64
252                        && ipv4.octets()[1] <= 127
253                    || ipv4.octets() == [169, 254, 169, 254]  // AWS metadata
254                    || ipv4.octets()[..2] == [192, 0] // Documentation/test ranges
255            }
256            IpAddr::V6(ipv6) => {
257                ipv6.is_loopback()                      // ::1
258                    || ipv6.is_unspecified()            // ::
259                    // Unique local addresses (fc00::/7)
260                    || (ipv6.segments()[0] & 0xfe00) == 0xfc00
261                    // Link-local (fe80::/10)
262                    || (ipv6.segments()[0] & 0xffc0) == 0xfe80
263            }
264        }
265    }
266
267    /// Scan the WordPress site
268    pub async fn scan(&self) -> Result<ScanResult> {
269        // Fetch homepage
270        let homepage_html = self.fetch_page(&self.base_url).await?;
271        let document = Html::parse_document(&homepage_html);
272
273        // Detect WordPress version
274        let wordpress_version = self.detect_wp_version(&document).await;
275
276        // If version not found, try alternative detection methods
277        let wordpress_detected = wordpress_version.is_some()
278            || self.detect_wp_from_rest_api().await.is_some()
279            || self.detect_wp_from_cookies().await.is_some();
280
281        // Fetch latest WordPress version
282        let wordpress_latest = self.fetch_wp_latest_version().await;
283
284        // Detect theme and fetch latest version
285        let theme = self.detect_theme(&document).await;
286
287        // Detect plugins and fetch latest versions
288        let plugins = self.detect_plugins(&document).await;
289
290        Ok(ScanResult {
291            url: self.base_url.clone(),
292            wordpress_detected,
293            wordpress_version,
294            wordpress_latest,
295            theme,
296            plugins,
297        })
298    }
299
300    /// Fetch latest WordPress version from API
301    async fn fetch_wp_latest_version(&self) -> Option<String> {
302        let url = format!("{}/core/version-check/1.7/", WP_API_BASE);
303        let response: WpVersionResponse =
304            self.client.get(&url).send().await.ok()?.json().await.ok()?;
305        response.offers.first().map(|o| o.version.clone())
306    }
307
308    /// Fetch latest plugin version from WordPress.org API
309    async fn fetch_plugin_latest_version(&self, slug: &str) -> Option<String> {
310        let url = format!(
311            "{}/plugins/info/1.2/?action=plugin_information&slug={}",
312            WP_API_BASE, slug
313        );
314        let response: PluginApiResponse =
315            self.client.get(&url).send().await.ok()?.json().await.ok()?;
316        response.version
317    }
318
319    /// Fetch latest theme version from WordPress.org API
320    async fn fetch_theme_latest_version(&self, slug: &str) -> Option<String> {
321        let url = format!(
322            "{}/themes/info/1.2/?action=theme_information&slug={}",
323            WP_API_BASE, slug
324        );
325        let response: ThemeApiResponse =
326            self.client.get(&url).send().await.ok()?.json().await.ok()?;
327        response.version
328    }
329
330    /// Fetch a page and return its HTML
331    async fn fetch_page(&self, url: &Url) -> Result<String> {
332        let response = self
333            .client
334            .get(url.as_str())
335            .send()
336            .await
337            .map_err(|e| Error::HttpRequest(e.to_string()))?;
338
339        if !response.status().is_success() {
340            return Err(Error::HttpStatus(response.status().as_u16()));
341        }
342
343        response
344            .text()
345            .await
346            .map_err(|e| Error::HttpRequest(e.to_string()))
347    }
348
349    /// Detect WordPress version from various sources
350    async fn detect_wp_version(&self, document: &Html) -> Option<String> {
351        // Try meta generator tag first
352        if let Some(version) = self.detect_version_from_meta(document) {
353            return Some(version);
354        }
355
356        // Try RSS feed
357        if let Some(version) = self.detect_version_from_feed().await {
358            return Some(version);
359        }
360
361        // Try readme.html
362        self.detect_version_from_readme().await
363    }
364
365    /// Detect version from meta generator tag
366    fn detect_version_from_meta(&self, document: &Html) -> Option<String> {
367        let selector = Selector::parse("meta[name='generator']").ok()?;
368
369        for element in document.select(&selector) {
370            if let Some(content) = element.value().attr("content")
371                && content.starts_with("WordPress")
372            {
373                // Extract version from "WordPress X.Y.Z"
374                let version = content.strip_prefix("WordPress ")?.trim();
375                if !version.is_empty() {
376                    return Some(version.to_string());
377                }
378            }
379        }
380        None
381    }
382
383    /// Detect version from RSS feed
384    async fn detect_version_from_feed(&self) -> Option<String> {
385        let feed_url = self.base_url.join(WP_FEED_PATH).ok()?;
386        let html = self.fetch_page(&feed_url).await.ok()?;
387
388        // Look for <generator>https://wordpress.org/?v=X.Y.Z</generator>
389        let re = Regex::new(r"wordpress\.org/\?v=([0-9.]+)").ok()?;
390        re.captures(&html)?.get(1).map(|m| m.as_str().to_string())
391    }
392
393    /// Detect version from readme.html
394    async fn detect_version_from_readme(&self) -> Option<String> {
395        let readme_url = self.base_url.join(WP_README_PATH).ok()?;
396        let html = self.fetch_page(&readme_url).await.ok()?;
397
398        // Look for "Version X.Y.Z" in readme
399        let re = Regex::new(r"Version\s+([0-9.]+)").ok()?;
400        re.captures(&html)?.get(1).map(|m| m.as_str().to_string())
401    }
402
403    /// Detect WordPress via wp-json REST API endpoint
404    async fn detect_wp_from_rest_api(&self) -> Option<()> {
405        let api_url = self.base_url.join(WP_JSON_PATH).ok()?;
406
407        let response = self.client.get(api_url.as_str()).send().await.ok()?;
408
409        if !response.status().is_success() {
410            return None;
411        }
412
413        // Try to parse as WordPress REST API response
414        let api_response: WpJsonResponse = response.json().await.ok()?;
415
416        // Check for WordPress-specific namespaces
417        if let Some(namespaces) = &api_response.namespaces
418            && namespaces.iter().any(|ns| ns.starts_with("wp/"))
419        {
420            return Some(());
421        }
422
423        // If we got a valid response with expected fields, it's likely WordPress
424        if api_response.name.is_some() || api_response.url.is_some() {
425            return Some(());
426        }
427
428        None
429    }
430
431    /// Check for WordPress cookies in response headers
432    async fn detect_wp_from_cookies(&self) -> Option<()> {
433        let response = self.client.get(self.base_url.as_str()).send().await.ok()?;
434
435        // Check for WordPress-specific cookies
436        for cookie in response.cookies() {
437            let name = cookie.name();
438            let is_wp_cookie =
439                WP_COOKIE_PREFIXES.iter().any(|p| name.starts_with(p)) || name == WP_LANG_COOKIE;
440            if is_wp_cookie {
441                return Some(());
442            }
443        }
444
445        // Also check Set-Cookie headers for WordPress patterns
446        if let Some(set_cookie) = response.headers().get("set-cookie")
447            && let Ok(cookie_str) = set_cookie.to_str()
448            && WP_COOKIE_PREFIXES.iter().any(|p| cookie_str.contains(p))
449        {
450            return Some(());
451        }
452
453        None
454    }
455
456    /// Detect the main theme
457    async fn detect_theme(&self, document: &Html) -> Option<ThemeInfo> {
458        // Look for theme in stylesheet URLs
459        let link_selector = Selector::parse("link[rel='stylesheet']").ok()?;
460
461        for element in document.select(&link_selector) {
462            if let Some(href) = element.value().attr("href")
463                && let Some(mut theme) = self.extract_theme_from_url(href)
464            {
465                // Fetch latest version from WordPress.org
466                theme.latest_version = self.fetch_theme_latest_version(&theme.slug).await;
467                return Some(theme);
468            }
469        }
470
471        // Also check style tags and other sources
472        let style_re = Regex::new(r"/wp-content/themes/([^/]+)/").ok()?;
473
474        let html = document.html();
475        if let Some(caps) = style_re.captures(&html) {
476            let slug = caps.get(1)?.as_str().to_string();
477            let latest_version = self.fetch_theme_latest_version(&slug).await;
478            return Some(ThemeInfo {
479                slug,
480                version: None,
481                latest_version,
482            });
483        }
484
485        None
486    }
487
488    /// Extract theme info from a URL
489    fn extract_theme_from_url(&self, url: &str) -> Option<ThemeInfo> {
490        // Match /wp-content/themes/theme-name/
491        let re = Regex::new(r"/wp-content/themes/([^/]+)/").ok()?;
492        let caps = re.captures(url)?;
493        let slug = caps.get(1)?.as_str().to_string();
494
495        // Try to extract version from URL query params
496        let version = if let Some(v_pos) = url.find("ver=") {
497            let v_start = v_pos + 4;
498            let v_end = url[v_start..]
499                .find(|c: char| !c.is_ascii_alphanumeric() && c != '.' && c != '-' && c != '_')
500                .map(|i| v_start + i)
501                .unwrap_or(url.len());
502            let raw_version = url[v_start..v_end].to_string();
503            Some(Self::normalize_version(&raw_version))
504        } else {
505            None
506        };
507
508        Some(ThemeInfo {
509            slug,
510            version,
511            latest_version: None,
512        })
513    }
514
515    /// Detect plugins from the page (includes mu-plugins)
516    async fn detect_plugins(&self, document: &Html) -> Vec<PluginInfo> {
517        let mut plugin_slugs = HashSet::new();
518        let html = document.html();
519
520        // Regex to find plugin paths - includes both plugins and mu-plugins
521        let plugin_re = Regex::new(r"/wp-content/(?:mu-)?plugins/([a-zA-Z0-9_-]+)/").unwrap();
522
523        for caps in plugin_re.captures_iter(&html) {
524            if let Some(slug) = caps.get(1) {
525                let slug_str = slug.as_str().to_string();
526                if !SKIP_PLUGIN_SLUGS.contains(&slug_str.as_str()) {
527                    plugin_slugs.insert(slug_str);
528                }
529            }
530        }
531
532        // Convert to PluginInfo, fetching latest versions
533        let mut plugins = Vec::new();
534        for slug in plugin_slugs {
535            let version = self.find_plugin_version(&html, &slug);
536            let latest_version = self.fetch_plugin_latest_version(&slug).await;
537            plugins.push(PluginInfo {
538                slug,
539                version,
540                latest_version,
541            });
542        }
543        plugins
544    }
545
546    /// Find plugin version from HTML
547    fn find_plugin_version(&self, html: &str, slug: &str) -> Option<String> {
548        // Look for ver= parameter in plugin URLs (supports both plugins and mu-plugins)
549        let pattern = format!(
550            r#"/wp-content/(?:mu-)?plugins/{}/[^'"]*\?[^'"]*ver=([0-9a-zA-Z._-]+)"#,
551            regex::escape(slug)
552        );
553        let re = Regex::new(&pattern).ok()?;
554        let caps = re.captures(html)?;
555        let version = caps.get(1)?.as_str().to_string();
556
557        // Filter out Unix timestamps (10-digit numbers) and hash-like versions
558        Some(Self::normalize_version(&version))
559    }
560
561    /// Normalize version string - detect timestamps and hashes
562    fn normalize_version(version: &str) -> String {
563        // Unix timestamp detection (10 digits, starts with 1 or 2, reasonable range)
564        if version.len() == 10
565            && version.chars().all(|c| c.is_ascii_digit())
566            && version.starts_with(['1', '2'])
567        {
568            return format!("(timestamp:{})", version);
569        }
570
571        // Git commit hash detection (40 hex chars or 7+ hex abbreviation)
572        if (version.len() == 40 || version.len() >= 7)
573            && version.chars().all(|c| c.is_ascii_hexdigit())
574            && !version.chars().all(|c| c.is_ascii_digit())
575        {
576            let short = if version.len() > 7 {
577                &version[..7]
578            } else {
579                version
580            };
581            return format!("(hash:{})", short);
582        }
583
584        version.to_string()
585    }
586}
587
#[cfg(test)]
mod tests {
    use super::*;
    use std::net::Ipv4Addr;

    /// Classify a dotted-quad address with the scanner's internal-IP check.
    fn v4_is_internal(a: u8, b: u8, c: u8, d: u8) -> bool {
        Scanner::is_internal_ip(IpAddr::V4(Ipv4Addr::new(a, b, c, d)))
    }

    #[test]
    fn parse_valid_url() {
        // Note: This may fail if example.com resolves to an internal IP in test environment
        assert!(Scanner::new("https://example.com").is_ok());
    }

    #[test]
    fn parse_invalid_url() {
        assert!(Scanner::new("not a url").is_err());
    }

    #[test]
    fn reject_localhost() {
        let err = Scanner::new("http://localhost").expect_err("localhost must be rejected");
        assert!(err.to_string().contains("localhost"));
    }

    #[test]
    fn reject_localhost_subdomain() {
        assert!(Scanner::new("http://foo.localhost").is_err());
    }

    #[test]
    fn reject_file_scheme() {
        let err = Scanner::new("file:///etc/passwd").expect_err("file scheme must be rejected");
        assert!(err.to_string().contains("scheme"));
    }

    #[test]
    fn reject_ftp_scheme() {
        let err = Scanner::new("ftp://example.com").expect_err("ftp scheme must be rejected");
        assert!(err.to_string().contains("scheme"));
    }

    #[test]
    fn internal_ip_detection() {
        // Private ranges
        assert!(v4_is_internal(10, 0, 0, 1));
        assert!(v4_is_internal(172, 16, 0, 1));
        assert!(v4_is_internal(192, 168, 1, 1));

        // Loopback
        assert!(v4_is_internal(127, 0, 0, 1));

        // Link-local
        assert!(v4_is_internal(169, 254, 1, 1));

        // Public IP should pass
        assert!(!v4_is_internal(8, 8, 8, 8));
        assert!(!v4_is_internal(93, 184, 216, 34));
    }

    #[test]
    fn normalize_semantic_version() {
        // Ordinary versions come back unchanged
        for version in ["1.2.3", "22.0.0", "7.0-alpha"] {
            assert_eq!(Scanner::normalize_version(version), version);
        }
    }

    #[test]
    fn normalize_timestamp_version() {
        // Unix timestamps should be marked
        let cases = [
            ("1748271784", "(timestamp:1748271784)"),
            ("1748268723", "(timestamp:1748268723)"),
        ];
        for (input, expected) in cases {
            assert_eq!(Scanner::normalize_version(input), expected);
        }
    }

    #[test]
    fn normalize_hash_version() {
        // Git hashes should be shortened and marked
        let cases = [
            ("569ab5664387d06c16a234c9771d3d57fb15720a", "(hash:569ab56)"),
            ("abcdef1", "(hash:abcdef1)"),
        ];
        for (input, expected) in cases {
            assert_eq!(Scanner::normalize_version(input), expected);
        }
    }

    #[test]
    fn normalize_date_version() {
        // Date-like versions (8 digits) should pass through
        assert_eq!(Scanner::normalize_version("20200121"), "20200121");
    }
}
701}