linkedin_profile_validator/
lib.rs

1//! `LinkedIn` profile URL validation library.
2//!
3//! This crate provides tools to validate `LinkedIn` profile URLs by checking both
4//! format correctness and profile existence through HTTP requests.
5//!
6//! # Features
7//!
8//! - Format validation without network calls
9//! - Profile existence verification
10//! - Async and sync APIs
11//! - Rate limiting awareness
12//!
13//! # Examples
14//!
15//! ## Basic usage
16//!
17//! ```no_run
18//! use linkedin_profile_validator::{LinkedInValidator, LinkedInUrlError};
19//!
20//! let validator = LinkedInValidator::new();
21//! match validator.is_valid_linkedin_profile_url("https://www.linkedin.com/in/johndoe") {
22//!     Ok(_) => println!("Profile exists!"),
23//!     Err(LinkedInUrlError::ProfileNotFound) => println!("Profile not found"),
24//!     Err(LinkedInUrlError::AuthenticationRequired) => println!("LinkedIn requires auth"),
25//!     Err(e) => println!("Error: {}", e),
26//! }
27//! ```
28//!
29//! ## Format validation only
30//!
31//! ```
32//! use linkedin_profile_validator::is_valid_linkedin_profile_format;
33//!
34//! if is_valid_linkedin_profile_format("https://www.linkedin.com/in/johndoe") {
35//!     println!("Valid LinkedIn profile URL format");
36//! }
37//! ```
38
39use regex::Regex;
40use thiserror::Error;
41use url::Url;
42
43/// Errors that can occur during `LinkedIn` URL validation.
44#[derive(Error, Debug)]
45pub enum LinkedInUrlError {
46    /// The provided URL has invalid format.
47    #[error("Invalid URL format: {0}")]
48    InvalidUrl(String),
49
50    /// The URL is not from `LinkedIn` domain.
51    #[error("Not a LinkedIn URL")]
52    NotLinkedInUrl,
53
54    /// The URL is from `LinkedIn` but not a profile URL.
55    #[error("Not a LinkedIn profile URL")]
56    NotProfileUrl,
57
58    /// Network error occurred during validation.
59    #[error("Network error: {0}")]
60    NetworkError(#[from] reqwest::Error),
61
62    /// The `LinkedIn` profile was not found (404).
63    #[error("Profile not found (404)")]
64    ProfileNotFound,
65
66    /// `LinkedIn` requires authentication to verify the profile.
67    #[error("Unable to verify - LinkedIn requires authentication")]
68    AuthenticationRequired,
69}
70
71/// A `LinkedIn` profile validator that performs HTTP requests to verify profile existence.
72///
73/// # Example
74///
75/// ```no_run
76/// use linkedin_profile_validator::LinkedInValidator;
77///
78/// let validator = LinkedInValidator::new();
79/// let result = validator.is_valid_linkedin_profile_url("https://www.linkedin.com/in/johndoe");
80/// ```
81pub struct LinkedInValidator {
82    client: reqwest::blocking::Client,
83}
84
85impl LinkedInValidator {
86    /// Creates a new `LinkedIn` validator instance.
87    ///
88    /// # Panics
89    ///
90    /// Panics if the HTTP client cannot be built.
91    #[must_use]
92    pub fn new() -> Self {
93        let client = reqwest::blocking::Client::builder()
94            .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
95            .timeout(std::time::Duration::from_secs(10))
96            .build()
97            .unwrap();
98
99        Self { client }
100    }
101
102    /// Validates a `LinkedIn` profile URL by checking format and existence.
103    ///
104    /// This method performs an HTTP request to verify if the profile actually exists.
105    ///
106    /// # Arguments
107    ///
108    /// * `url_str` - The `LinkedIn` profile URL to validate
109    ///
110    /// # Returns
111    ///
112    /// * `Ok(true)` - If the profile exists
113    /// * `Err(LinkedInUrlError)` - If validation fails
114    ///
115    /// # Errors
116    ///
117    /// Returns an error if:
118    /// - The URL format is invalid
119    /// - The URL is not from `LinkedIn` domain
120    /// - The URL is not a profile URL
121    /// - Network request fails
122    /// - The profile doesn't exist (404)
123    /// - `LinkedIn` requires authentication
124    ///
125    /// # Example
126    ///
127    /// ```no_run
128    /// use linkedin_profile_validator::LinkedInValidator;
129    ///
130    /// let validator = LinkedInValidator::new();
131    /// match validator.is_valid_linkedin_profile_url("https://www.linkedin.com/in/johndoe") {
132    ///     Ok(_) => println!("Valid profile"),
133    ///     Err(e) => println!("Invalid: {}", e),
134    /// }
135    /// ```
136    pub fn is_valid_linkedin_profile_url(&self, url_str: &str) -> Result<bool, LinkedInUrlError> {
137        let url = Url::parse(url_str).map_err(|e| LinkedInUrlError::InvalidUrl(e.to_string()))?;
138
139        if !is_linkedin_domain(&url) {
140            return Err(LinkedInUrlError::NotLinkedInUrl);
141        }
142
143        if !is_profile_path(&url) {
144            return Err(LinkedInUrlError::NotProfileUrl);
145        }
146
147        self.check_profile_exists(url_str)?;
148
149        Ok(true)
150    }
151
152    fn check_profile_exists(&self, url: &str) -> Result<(), LinkedInUrlError> {
153        let mut response = self.client.get(url).send()?;
154
155        // LinkedIn returns 999 status for bot detection/rate limiting
156        // In this case, we need to follow redirects manually
157        if response.status().as_u16() == 999 {
158            // Try with cookie header to bypass authwall
159            response = self.client.get(url).header("Cookie", "sl=v=1&1").send()?;
160        }
161
162        // Check if redirected to 404 page
163        let final_url = response.url().to_string();
164        if final_url.contains("/404/") || final_url.contains("linkedin.com/404") {
165            return Err(LinkedInUrlError::ProfileNotFound);
166        }
167
168        // Get response body
169        let body = response.text()?;
170
171        // Check for authwall (indicates we're being blocked)
172        if body.contains("/authwall") || body.contains("sessionRedirect") {
173            // When we hit authwall, we can't determine if profile exists
174            return Err(LinkedInUrlError::AuthenticationRequired);
175        }
176
177        // Check for common error page indicators
178        if body.contains("This page doesn't exist")
179            || body.contains("This page doesn't exist")
180            || body.contains("Page not found")
181            || body.contains("Check the URL or return to LinkedIn home")
182            || body.contains("return to LinkedIn home")
183            || body.contains("Go to your feed") && body.contains("doesn't exist")
184        {
185            return Err(LinkedInUrlError::ProfileNotFound);
186        }
187
188        Ok(())
189    }
190}
191
192fn is_linkedin_domain(url: &Url) -> bool {
193    matches!(url.domain(), Some(domain) if domain == "linkedin.com" || domain == "www.linkedin.com")
194}
195
196fn is_profile_path(url: &Url) -> bool {
197    let path = url.path();
198    let profile_regex = Regex::new(r"^/in/[a-zA-Z0-9\-]+/?$").unwrap();
199    profile_regex.is_match(path)
200}
201
202impl Default for LinkedInValidator {
203    fn default() -> Self {
204        Self::new()
205    }
206}
207
208/// Validates a `LinkedIn` profile URL asynchronously.
209///
210/// This function performs an HTTP request to verify if the profile actually exists.
211/// Use this for async contexts like web servers.
212///
213/// # Arguments
214///
215/// * `url` - The `LinkedIn` profile URL to validate
216///
217/// # Returns
218///
219/// * `Ok(true)` - If the profile exists
220/// * `Err(LinkedInUrlError)` - If validation fails
221///
222/// # Errors
223///
224/// Returns an error if:
225/// - The URL format is invalid
226/// - The URL is not from `LinkedIn` domain
227/// - The URL is not a profile URL
228/// - Network request fails
229/// - The profile doesn't exist (404)
230/// - `LinkedIn` requires authentication
231///
232/// # Example
233///
234/// ```no_run
235/// use linkedin_profile_validator::validate_linkedin_url_async;
236///
237/// # async fn example() {
238/// match validate_linkedin_url_async("https://www.linkedin.com/in/johndoe").await {
239///     Ok(_) => println!("Valid profile"),
240///     Err(e) => println!("Invalid: {}", e),
241/// }
242/// # }
243/// ```
244pub async fn validate_linkedin_url_async(url: &str) -> Result<bool, LinkedInUrlError> {
245    let url_parsed = Url::parse(url).map_err(|e| LinkedInUrlError::InvalidUrl(e.to_string()))?;
246
247    if !is_linkedin_domain(&url_parsed) {
248        return Err(LinkedInUrlError::NotLinkedInUrl);
249    }
250
251    if !is_profile_path(&url_parsed) {
252        return Err(LinkedInUrlError::NotProfileUrl);
253    }
254
255    let client = reqwest::Client::builder()
256        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
257        .timeout(std::time::Duration::from_secs(10))
258        .build()?;
259
260    let mut response = client.get(url).send().await?;
261
262    // LinkedIn returns 999 status for bot detection/rate limiting
263    if response.status().as_u16() == 999 {
264        // Try with cookie header to bypass authwall
265        response = client.get(url).header("Cookie", "sl=v=1&1").send().await?;
266    }
267
268    // Check if redirected to 404 page
269    let final_url = response.url().to_string();
270    if final_url.contains("/404/") || final_url.contains("linkedin.com/404") {
271        return Err(LinkedInUrlError::ProfileNotFound);
272    }
273
274    // Get response body
275    let body = response.text().await?;
276
277    // Check for authwall (indicates we're being blocked)
278    if body.contains("/authwall") || body.contains("sessionRedirect") {
279        return Err(LinkedInUrlError::AuthenticationRequired);
280    }
281
282    // Check for common error page indicators
283    if body.contains("This page doesn't exist")
284        || body.contains("This page doesn't exist")
285        || body.contains("Page not found")
286        || body.contains("Check the URL or return to LinkedIn home")
287        || body.contains("return to LinkedIn home")
288        || body.contains("Go to your feed") && body.contains("doesn't exist")
289    {
290        return Err(LinkedInUrlError::ProfileNotFound);
291    }
292
293    Ok(true)
294}
295
296/// Checks if a URL has valid `LinkedIn` profile format without making network calls.
297///
298/// This function only validates the URL format and does not check if the profile exists.
299/// Use this for quick validation without network overhead.
300///
301/// # Arguments
302///
303/// * `url` - The URL to validate
304///
305/// # Returns
306///
307/// * `true` - If the URL has valid `LinkedIn` profile format
308/// * `false` - If the URL is invalid or not a `LinkedIn` profile URL
309///
310/// # Example
311///
312/// ```
313/// use linkedin_profile_validator::is_valid_linkedin_profile_format;
314///
315/// assert!(is_valid_linkedin_profile_format("https://www.linkedin.com/in/johndoe"));
316/// assert!(!is_valid_linkedin_profile_format("https://www.google.com/in/johndoe"));
317/// assert!(!is_valid_linkedin_profile_format("https://linkedin.com/company/microsoft"));
318/// ```
319#[must_use]
320pub fn is_valid_linkedin_profile_format(url: &str) -> bool {
321    let Ok(url_parsed) = Url::parse(url) else {
322        return false;
323    };
324
325    is_linkedin_domain(&url_parsed) && is_profile_path(&url_parsed)
326}
327
#[cfg(test)]
mod tests {
    use super::*;

    /// Profile URL the network tests expect to exist.
    const KNOWN_PROFILE: &str = "https://www.linkedin.com/in/hamze/";
    /// Profile URL the network tests expect to be missing.
    const UNKNOWN_PROFILE: &str = "https://www.linkedin.com/in/hamzeghalebi/";

    /// Accepts "profile exists" or "blocked by the authwall"; panics on anything else.
    #[track_caller]
    fn expect_found_or_blocked(outcome: Result<bool, LinkedInUrlError>) {
        match outcome {
            Ok(true) => (),
            Ok(false) => panic!("Expected profile to be valid"),
            Err(LinkedInUrlError::AuthenticationRequired) => {
                println!("LinkedIn requires authentication - cannot verify profile existence");
            }
            Err(e) => panic!("Expected profile to be valid or require auth, got error: {e}"),
        }
    }

    /// Accepts "not found", "blocked by the authwall" or (with a warning) success,
    /// because LinkedIn answers inconsistently; panics on any other error.
    #[track_caller]
    fn expect_missing_or_unverifiable(outcome: Result<bool, LinkedInUrlError>) {
        match outcome {
            Ok(_) => {
                // LinkedIn sometimes serves the page anyway, especially after
                // several requests, so success cannot be treated as a failure.
                println!("Warning: LinkedIn allowed access to profile page - cannot determine if profile actually exists");
            }
            Err(LinkedInUrlError::ProfileNotFound) => (),
            Err(LinkedInUrlError::AuthenticationRequired) => {
                println!("LinkedIn requires authentication - cannot verify profile existence");
            }
            Err(e) => panic!("Expected ProfileNotFound or AuthenticationRequired error, got: {e}"),
        }
    }

    #[test]
    fn test_valid_profile_format() {
        // Format checks are offline, so existence of these profiles is irrelevant.
        let well_formed = [
            "https://www.linkedin.com/in/hamze/",
            "https://www.linkedin.com/in/hamzeghalebi/",
            "https://www.linkedin.com/in/johndoe",
            "https://linkedin.com/in/jane-doe",
            "https://www.linkedin.com/in/john-doe-123/",
        ];

        for url in well_formed {
            assert!(
                is_valid_linkedin_profile_format(url),
                "expected valid profile format: {url}"
            );
        }
    }

    #[test]
    fn test_invalid_profile_format() {
        let malformed = [
            "https://www.google.com/in/johndoe",
            "https://linkedin.com/company/microsoft",
            "https://linkedin.com/",
            "not-a-url",
        ];

        for url in malformed {
            assert!(
                !is_valid_linkedin_profile_format(url),
                "expected invalid profile format: {url}"
            );
        }
    }

    #[test]
    fn test_real_valid_profile() {
        let validator = LinkedInValidator::new();
        expect_found_or_blocked(validator.is_valid_linkedin_profile_url(KNOWN_PROFILE));
    }

    #[test]
    fn test_real_invalid_profile() {
        let validator = LinkedInValidator::new();
        expect_missing_or_unverifiable(validator.is_valid_linkedin_profile_url(UNKNOWN_PROFILE));
    }

    #[tokio::test]
    async fn test_async_valid_profile() {
        expect_found_or_blocked(validate_linkedin_url_async(KNOWN_PROFILE).await);
    }

    #[tokio::test]
    async fn test_async_invalid_profile() {
        expect_missing_or_unverifiable(validate_linkedin_url_async(UNKNOWN_PROFILE).await);
    }

    /// Manual aid: dumps status, final URL and the start of the body of a live response.
    #[test]
    #[ignore = "Debug test to inspect LinkedIn response"]
    fn debug_linkedin_response() {
        let client = reqwest::blocking::Client::builder()
            .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
            .timeout(std::time::Duration::from_secs(10))
            .build()
            .unwrap();

        let response = client.get(UNKNOWN_PROFILE).send().unwrap();

        println!("Status: {}", response.status());
        println!("Final URL: {}", response.url());

        let body = response.text().unwrap();
        println!("Body length: {}", body.len());

        // Counted in characters, not bytes, so the cut never splits a code point.
        let preview: String = body.chars().take(2000).collect();
        println!("First 2000 chars:\n{}", &preview);
    }
}