ruvector_data_framework/
patent_clients.rs

1//! Patent database API integrations for USPTO PatentsView and EPO
2//!
3//! This module provides async clients for fetching patent data from:
4//! - USPTO PatentsView API (Free, no authentication required)
5//! - EPO Open Patent Services (Free tier available)
6//!
7//! Converts patent data to SemanticVector format for RuVector discovery.
8
9use std::collections::HashMap;
10use std::sync::Arc;
11use std::time::Duration;
12
13use chrono::{NaiveDate, Utc};
14use reqwest::{Client, StatusCode};
15use serde::Deserialize;
16use tokio::time::sleep;
17
18use crate::api_clients::SimpleEmbedder;
19use crate::ruvector_native::{Domain, SemanticVector};
20use crate::{FrameworkError, Result};
21
// Rate limiting and retry configuration shared by the clients in this module.

/// Delay in milliseconds used as the USPTO client's rate-limit delay (~5 requests/second).
const USPTO_RATE_LIMIT_MS: u64 = 200; // ~5 requests/second
/// Delay in milliseconds used as the EPO client's rate-limit delay (1 request/second).
const EPO_RATE_LIMIT_MS: u64 = 1000; // Conservative 1 request/second
/// Maximum number of retries performed by the `*_with_retry` helpers.
const MAX_RETRIES: u32 = 3;
/// Base retry delay in milliseconds; the helpers multiply it by the retry count.
const RETRY_DELAY_MS: u64 = 1000;
27
28// ============================================================================
29// USPTO PatentsView API Client
30// ============================================================================
31
/// USPTO PatentsView API response
///
/// Top-level envelope deserialized from the JSON body of the search requests.
/// Every field falls back to its `Default` value when the key is absent.
#[derive(Debug, Deserialize)]
struct UsptoPatentsResponse {
    /// Patent records in this response; empty when the key is missing.
    #[serde(default)]
    patents: Vec<UsptoPatent>,
    /// Presumably the number of records in this response — TODO confirm against the API docs.
    #[serde(default)]
    count: i32,
    /// Presumably the total number of matching patents — TODO confirm against the API docs.
    #[serde(default)]
    total_patent_count: Option<i32>,
}
42
43/// USPTO Patent record
44#[derive(Debug, Deserialize)]
45struct UsptoPatent {
46    /// Patent number
47    patent_number: String,
48    /// Patent title
49    #[serde(default)]
50    patent_title: Option<String>,
51    /// Patent abstract
52    #[serde(default)]
53    patent_abstract: Option<String>,
54    /// Grant date (YYYY-MM-DD)
55    #[serde(default)]
56    patent_date: Option<String>,
57    /// Application filing date
58    #[serde(default)]
59    app_date: Option<String>,
60    /// Assignees (organizations/companies)
61    #[serde(default)]
62    assignees: Vec<UsptoAssignee>,
63    /// Inventors
64    #[serde(default)]
65    inventors: Vec<UsptoInventor>,
66    /// CPC classifications
67    #[serde(default)]
68    cpcs: Vec<UsptoCpc>,
69    /// Citation counts
70    #[serde(default)]
71    cited_patent_count: Option<i32>,
72    #[serde(default)]
73    citedby_patent_count: Option<i32>,
74}
75
/// Assignee entry nested inside a [`UsptoPatent`].
///
/// Carries an organization name and/or the first and last name of an individual;
/// every field is optional and defaults to `None` when absent.
#[derive(Debug, Deserialize)]
struct UsptoAssignee {
    /// Organization name, when the assignee is an organization.
    #[serde(default)]
    assignee_organization: Option<String>,
    /// First name, when the assignee is an individual.
    #[serde(default)]
    assignee_individual_name_first: Option<String>,
    /// Last name, when the assignee is an individual.
    #[serde(default)]
    assignee_individual_name_last: Option<String>,
}
85
/// Inventor entry nested inside a [`UsptoPatent`].
///
/// Both name parts are optional and default to `None` when absent.
#[derive(Debug, Deserialize)]
struct UsptoInventor {
    /// Inventor first name.
    #[serde(default)]
    inventor_name_first: Option<String>,
    /// Inventor last name.
    #[serde(default)]
    inventor_name_last: Option<String>,
}
93
/// CPC classification entry nested inside a [`UsptoPatent`].
///
/// Holds up to three identifiers of increasing specificity; each is optional and
/// defaults to `None` when absent.
/// NOTE(review): the example values are carried over from the original comments —
/// confirm they match the level names the API actually uses.
#[derive(Debug, Deserialize)]
struct UsptoCpc {
    /// CPC section (e.g., "Y02")
    #[serde(default)]
    cpc_section_id: Option<String>,
    /// CPC subclass (e.g., "Y02E")
    #[serde(default)]
    cpc_subclass_id: Option<String>,
    /// CPC group (e.g., "Y02E10/50")
    #[serde(default)]
    cpc_group_id: Option<String>,
}
106
/// USPTO citation response
///
/// Envelope for a list of citation records. Nothing in the visible part of this file
/// constructs or deserializes it yet.
#[derive(Debug, Deserialize)]
struct UsptoCitationsResponse {
    /// Citation records; empty when the key is missing.
    #[serde(default)]
    patents: Vec<UsptoCitation>,
}
113
/// Single citation record inside a [`UsptoCitationsResponse`].
#[derive(Debug, Deserialize)]
struct UsptoCitation {
    /// Patent number of the cited/citing patent (required key).
    patent_number: String,
    /// Title of that patent; `None` when absent.
    #[serde(default)]
    patent_title: Option<String>,
}
120
/// Client for USPTO PatentsView API (PatentSearch API v2)
///
/// PatentsView provides free access to USPTO patent data with no authentication required.
/// Uses the new PatentSearch API (ElasticSearch-based) as of May 2025.
/// API documentation: https://search.patentsview.org/docs/
pub struct UsptoPatentClient {
    /// Shared HTTP client used for every request.
    client: Client,
    /// Base URL that endpoint paths such as `/patent/` are appended to.
    base_url: String,
    /// Delay awaited before each request issued by the public methods.
    rate_limit_delay: Duration,
    /// Embedder that turns "title + abstract" text into the vector embedding.
    embedder: Arc<SimpleEmbedder>,
}
132
133impl UsptoPatentClient {
134    /// Create a new USPTO PatentsView client
135    ///
136    /// No authentication required for the PatentsView API.
137    /// Uses the new PatentSearch API at search.patentsview.org
138    pub fn new() -> Result<Self> {
139        let client = Client::builder()
140            .timeout(Duration::from_secs(30))
141            .user_agent("RuVector-Discovery/1.0")
142            .build()
143            .map_err(FrameworkError::Network)?;
144
145        Ok(Self {
146            client,
147            base_url: "https://search.patentsview.org/api/v1".to_string(),
148            rate_limit_delay: Duration::from_millis(USPTO_RATE_LIMIT_MS),
149            embedder: Arc::new(SimpleEmbedder::new(512)), // Higher dimension for technical text
150        })
151    }
152
153    /// Search patents by keyword query
154    ///
155    /// # Arguments
156    /// * `query` - Search keywords (e.g., "artificial intelligence", "solar cell")
157    /// * `max_results` - Maximum number of results to return (max 1000 per page)
158    ///
159    /// # Example
160    /// ```ignore
161    /// let client = UsptoPatentClient::new()?;
162    /// let patents = client.search_patents("quantum computing", 50).await?;
163    /// ```
164    pub async fn search_patents(
165        &self,
166        query: &str,
167        max_results: usize,
168    ) -> Result<Vec<SemanticVector>> {
169        let per_page = max_results.min(100);
170        let encoded_query = urlencoding::encode(query);
171
172        // New PatentSearch API uses GET with query parameters
173        // Query format: q=patent_title:*query* OR patent_abstract:*query*
174        let url = format!(
175            "{}/patent/?q=patent_title:*{}*%20OR%20patent_abstract:*{}*&f=patent_id,patent_title,patent_abstract,patent_date,assignees,inventors,cpcs&o={{\"size\":{},\"matched_subentities_only\":true}}",
176            self.base_url, encoded_query, encoded_query, per_page
177        );
178
179        sleep(self.rate_limit_delay).await;
180
181        let response = self.fetch_with_retry(&url).await?;
182        let uspto_response: UsptoPatentsResponse = response.json().await?;
183
184        self.convert_patents_to_vectors(uspto_response.patents)
185    }
186
187    /// Search patents by assignee (company/organization name)
188    ///
189    /// # Arguments
190    /// * `company_name` - Company or organization name (e.g., "IBM", "Google")
191    /// * `max_results` - Maximum number of results to return
192    ///
193    /// # Example
194    /// ```ignore
195    /// let patents = client.search_by_assignee("Tesla Inc", 100).await?;
196    /// ```
197    pub async fn search_by_assignee(
198        &self,
199        company_name: &str,
200        max_results: usize,
201    ) -> Result<Vec<SemanticVector>> {
202        let per_page = max_results.min(100);
203        let encoded_name = urlencoding::encode(company_name);
204
205        // New PatentSearch API format
206        let url = format!(
207            "{}/patent/?q=assignees.assignee_organization:*{}*&f=patent_id,patent_title,patent_abstract,patent_date,assignees,inventors,cpcs&o={{\"size\":{},\"matched_subentities_only\":true}}",
208            self.base_url, encoded_name, per_page
209        );
210
211        sleep(self.rate_limit_delay).await;
212
213        let response = self.fetch_with_retry(&url).await?;
214        let uspto_response: UsptoPatentsResponse = response.json().await?;
215
216        self.convert_patents_to_vectors(uspto_response.patents)
217    }
218
219    /// Search patents by CPC classification code
220    ///
221    /// # Arguments
222    /// * `cpc_class` - CPC classification (e.g., "Y02E" for climate tech energy, "G06N" for AI)
223    /// * `max_results` - Maximum number of results to return
224    ///
225    /// # Example - Climate Change Mitigation Technologies
226    /// ```ignore
227    /// let climate_patents = client.search_by_cpc("Y02", 200).await?;
228    /// ```
229    ///
230    /// # Common CPC Classes
231    /// * `Y02` - Climate change mitigation technologies
232    /// * `Y02E` - Climate tech - Energy generation/transmission/distribution
233    /// * `G06N` - Computing arrangements based on AI/ML/neural networks
234    /// * `A61` - Medical or veterinary science
235    /// * `H01` - Electric elements (batteries, solar cells, etc.)
236    pub async fn search_by_cpc(
237        &self,
238        cpc_class: &str,
239        max_results: usize,
240    ) -> Result<Vec<SemanticVector>> {
241        let per_page = max_results.min(100);
242        let encoded_cpc = urlencoding::encode(cpc_class);
243
244        // New PatentSearch API - query cpcs.cpc_group field
245        let url = format!(
246            "{}/patent/?q=cpcs.cpc_group:{}*&f=patent_id,patent_title,patent_abstract,patent_date,assignees,inventors,cpcs&o={{\"size\":{},\"matched_subentities_only\":true}}",
247            self.base_url, encoded_cpc, per_page
248        );
249
250        sleep(self.rate_limit_delay).await;
251
252        let response = self.fetch_with_retry(&url).await?;
253        let uspto_response: UsptoPatentsResponse = response.json().await?;
254
255        self.convert_patents_to_vectors(uspto_response.patents)
256    }
257
258    /// Get detailed information for a specific patent
259    ///
260    /// # Arguments
261    /// * `patent_number` - USPTO patent number (e.g., "10000000")
262    ///
263    /// # Example
264    /// ```ignore
265    /// let patent = client.get_patent("10123456").await?;
266    /// ```
267    pub async fn get_patent(&self, patent_number: &str) -> Result<Option<SemanticVector>> {
268        // New PatentSearch API - direct patent lookup
269        let url = format!(
270            "{}/patent/?q=patent_id:{}&f=patent_id,patent_title,patent_abstract,patent_date,assignees,inventors,cpcs&o={{\"size\":1}}",
271            self.base_url, patent_number
272        );
273
274        sleep(self.rate_limit_delay).await;
275
276        let response = self.fetch_with_retry(&url).await?;
277        let uspto_response: UsptoPatentsResponse = response.json().await?;
278
279        let mut vectors = self.convert_patents_to_vectors(uspto_response.patents)?;
280        Ok(vectors.pop())
281    }
282
283    /// Get citations for a patent (both citing and cited patents)
284    ///
285    /// # Arguments
286    /// * `patent_number` - USPTO patent number
287    ///
288    /// # Returns
289    /// Tuple of (patents that cite this patent, patents cited by this patent)
290    pub async fn get_citations(
291        &self,
292        patent_number: &str,
293    ) -> Result<(Vec<SemanticVector>, Vec<SemanticVector>)> {
294        // Get patents that cite this one (forward citations)
295        let citing = self.get_citing_patents(patent_number).await?;
296
297        // Get patents cited by this one (backward citations)
298        let cited = self.get_cited_patents(patent_number).await?;
299
300        Ok((citing, cited))
301    }
302
303    /// Get patents that cite the given patent (forward citations)
304    /// Note: Citation data requires separate API endpoints in PatentSearch API v2
305    async fn get_citing_patents(&self, _patent_number: &str) -> Result<Vec<SemanticVector>> {
306        // The new PatentSearch API handles citations differently
307        // Forward citations are available via /api/v1/us_patent_citation/ endpoint
308        // For now, return empty - full citation support requires additional implementation
309        Ok(Vec::new())
310    }
311
312    /// Get patents cited by the given patent (backward citations)
313    /// Note: Citation data requires separate API endpoints in PatentSearch API v2
314    async fn get_cited_patents(&self, _patent_number: &str) -> Result<Vec<SemanticVector>> {
315        // The new PatentSearch API handles citations differently
316        // Backward citations are available via /api/v1/us_patent_citation/ endpoint
317        // For now, return empty - full citation support requires additional implementation
318        Ok(Vec::new())
319    }
320
321    /// Convert USPTO patent records to SemanticVectors
322    fn convert_patents_to_vectors(&self, patents: Vec<UsptoPatent>) -> Result<Vec<SemanticVector>> {
323        let mut vectors = Vec::new();
324
325        for patent in patents {
326            let title = patent.patent_title.unwrap_or_else(|| "Untitled Patent".to_string());
327            let abstract_text = patent.patent_abstract.unwrap_or_default();
328
329            // Create combined text for embedding
330            let text = format!("{} {}", title, abstract_text);
331            let embedding = self.embedder.embed_text(&text);
332
333            // Parse grant date (prefer patent_date, fallback to app_date)
334            let timestamp = patent
335                .patent_date
336                .or(patent.app_date)
337                .as_ref()
338                .and_then(|d| NaiveDate::parse_from_str(d, "%Y-%m-%d").ok())
339                .and_then(|d| d.and_hms_opt(0, 0, 0))
340                .map(|dt| dt.and_utc())
341                .unwrap_or_else(Utc::now);
342
343            // Extract assignee names
344            let assignees = patent
345                .assignees
346                .iter()
347                .map(|a| {
348                    a.assignee_organization
349                        .clone()
350                        .or_else(|| {
351                            let first = a.assignee_individual_name_first.as_deref().unwrap_or("");
352                            let last = a.assignee_individual_name_last.as_deref().unwrap_or("");
353                            if !first.is_empty() || !last.is_empty() {
354                                Some(format!("{} {}", first, last).trim().to_string())
355                            } else {
356                                None
357                            }
358                        })
359                        .unwrap_or_default()
360                })
361                .filter(|s| !s.is_empty())
362                .collect::<Vec<_>>()
363                .join(", ");
364
365            // Extract inventor names
366            let inventors = patent
367                .inventors
368                .iter()
369                .filter_map(|i| {
370                    let first = i.inventor_name_first.as_deref().unwrap_or("");
371                    let last = i.inventor_name_last.as_deref().unwrap_or("");
372                    if !first.is_empty() || !last.is_empty() {
373                        Some(format!("{} {}", first, last).trim().to_string())
374                    } else {
375                        None
376                    }
377                })
378                .collect::<Vec<_>>()
379                .join(", ");
380
381            // Extract CPC codes
382            let cpc_codes = patent
383                .cpcs
384                .iter()
385                .filter_map(|cpc| {
386                    cpc.cpc_group_id
387                        .clone()
388                        .or_else(|| cpc.cpc_subclass_id.clone())
389                        .or_else(|| cpc.cpc_section_id.clone())
390                })
391                .collect::<Vec<_>>()
392                .join(", ");
393
394            // Build metadata
395            let mut metadata = HashMap::new();
396            metadata.insert("patent_number".to_string(), patent.patent_number.clone());
397            metadata.insert("title".to_string(), title);
398            metadata.insert("abstract".to_string(), abstract_text);
399            metadata.insert("assignee".to_string(), assignees);
400            metadata.insert("inventors".to_string(), inventors);
401            metadata.insert("cpc_codes".to_string(), cpc_codes);
402            metadata.insert(
403                "citations_count".to_string(),
404                patent.citedby_patent_count.unwrap_or(0).to_string(),
405            );
406            metadata.insert(
407                "cited_count".to_string(),
408                patent.cited_patent_count.unwrap_or(0).to_string(),
409            );
410            metadata.insert("source".to_string(), "uspto".to_string());
411
412            vectors.push(SemanticVector {
413                id: format!("US{}", patent.patent_number),
414                embedding,
415                domain: Domain::Research, // Could be Domain::Innovation if that variant exists
416                timestamp,
417                metadata,
418            });
419        }
420
421        Ok(vectors)
422    }
423
424    /// GET request with retry logic
425    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
426        let mut retries = 0;
427        loop {
428            match self.client.get(url).send().await {
429                Ok(response) => {
430                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES
431                    {
432                        retries += 1;
433                        sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
434                        continue;
435                    }
436                    if !response.status().is_success() {
437                        return Err(FrameworkError::Network(
438                            reqwest::Error::from(response.error_for_status().unwrap_err()),
439                        ));
440                    }
441                    return Ok(response);
442                }
443                Err(_) if retries < MAX_RETRIES => {
444                    retries += 1;
445                    sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
446                }
447                Err(e) => return Err(FrameworkError::Network(e)),
448            }
449        }
450    }
451
452    /// POST request with retry logic (kept for backwards compatibility)
453    #[allow(dead_code)]
454    async fn post_with_retry(
455        &self,
456        url: &str,
457        json: &serde_json::Value,
458    ) -> Result<reqwest::Response> {
459        let mut retries = 0;
460        loop {
461            match self.client.post(url).json(json).send().await {
462                Ok(response) => {
463                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES
464                    {
465                        retries += 1;
466                        sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
467                        continue;
468                    }
469                    if !response.status().is_success() {
470                        return Err(FrameworkError::Network(
471                            reqwest::Error::from(response.error_for_status().unwrap_err()),
472                        ));
473                    }
474                    return Ok(response);
475                }
476                Err(_) if retries < MAX_RETRIES => {
477                    retries += 1;
478                    sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
479                }
480                Err(e) => return Err(FrameworkError::Network(e)),
481            }
482        }
483    }
484}
485
486impl Default for UsptoPatentClient {
487    fn default() -> Self {
488        Self::new().expect("Failed to create USPTO client")
489    }
490}
491
492// ============================================================================
493// EPO Open Patent Services Client (Placeholder)
494// ============================================================================
495
/// Client for European Patent Office (EPO) Open Patent Services
///
/// Note: This is a placeholder for future EPO integration.
/// The EPO OPS API requires registration and OAuth authentication.
/// See: https://developers.epo.org/
pub struct EpoClient {
    /// HTTP client reserved for future requests (not used by the placeholder methods).
    client: Client,
    /// Base URL of the EPO OPS REST services.
    base_url: String,
    /// Consumer key supplied by the caller, if any.
    consumer_key: Option<String>,
    /// Consumer secret supplied by the caller, if any.
    consumer_secret: Option<String>,
    /// Rate-limit delay configured from `EPO_RATE_LIMIT_MS`.
    rate_limit_delay: Duration,
    /// Text embedder configured like the USPTO client's (512 dimensions).
    embedder: Arc<SimpleEmbedder>,
}
509
510impl EpoClient {
511    /// Create a new EPO client
512    ///
513    /// # Arguments
514    /// * `consumer_key` - EPO API consumer key (from developer registration)
515    /// * `consumer_secret` - EPO API consumer secret
516    ///
517    /// Registration required at: https://developers.epo.org/
518    pub fn new(consumer_key: Option<String>, consumer_secret: Option<String>) -> Result<Self> {
519        let client = Client::builder()
520            .timeout(Duration::from_secs(30))
521            .build()
522            .map_err(FrameworkError::Network)?;
523
524        Ok(Self {
525            client,
526            base_url: "https://ops.epo.org/3.2/rest-services".to_string(),
527            consumer_key,
528            consumer_secret,
529            rate_limit_delay: Duration::from_millis(EPO_RATE_LIMIT_MS),
530            embedder: Arc::new(SimpleEmbedder::new(512)),
531        })
532    }
533
534    /// Search European patents
535    ///
536    /// Note: Implementation requires OAuth authentication flow.
537    /// This is a placeholder for future development.
538    pub async fn search_patents(
539        &self,
540        _query: &str,
541        _max_results: usize,
542    ) -> Result<Vec<SemanticVector>> {
543        Err(FrameworkError::Config(
544            "EPO client not yet implemented. Requires OAuth authentication.".to_string(),
545        ))
546    }
547}
548
549// ============================================================================
550// Tests
551// ============================================================================
552
553#[cfg(test)]
554mod tests {
555    use super::*;
556
557    #[tokio::test]
558    async fn test_uspto_client_creation() {
559        let client = UsptoPatentClient::new();
560        assert!(client.is_ok());
561    }
562
563    #[tokio::test]
564    async fn test_epo_client_creation() {
565        let client = EpoClient::new(None, None);
566        assert!(client.is_ok());
567    }
568
569    #[test]
570    fn test_default_client() {
571        let client = UsptoPatentClient::default();
572        assert_eq!(
573            client.rate_limit_delay,
574            Duration::from_millis(USPTO_RATE_LIMIT_MS)
575        );
576    }
577
578    #[test]
579    fn test_rate_limiting() {
580        let client = UsptoPatentClient::new().unwrap();
581        assert_eq!(
582            client.rate_limit_delay,
583            Duration::from_millis(USPTO_RATE_LIMIT_MS)
584        );
585    }
586
587    #[test]
588    fn test_cpc_classification_mapping() {
589        // Verify we handle different CPC code lengths correctly
590        let test_cases = vec![
591            ("Y02", "cpc_section_id"),
592            ("G06N", "cpc_subclass_id"),
593            ("Y02E10/50", "cpc_group_id"),
594        ];
595
596        for (code, expected_field) in test_cases {
597            let field = if code.len() <= 3 {
598                "cpc_section_id"
599            } else if code.len() <= 4 {
600                "cpc_subclass_id"
601            } else {
602                "cpc_group_id"
603            };
604            assert_eq!(field, expected_field, "Failed for CPC code: {}", code);
605        }
606    }
607
608    #[tokio::test]
609    #[ignore] // Requires network access
610    async fn test_search_patents_integration() {
611        let client = UsptoPatentClient::new().unwrap();
612
613        // Test basic search
614        let result = client.search_patents("quantum computing", 5).await;
615
616        // Should either succeed or fail with network error, not panic
617        match result {
618            Ok(patents) => {
619                assert!(patents.len() <= 5);
620                for patent in patents {
621                    assert!(patent.id.starts_with("US"));
622                    assert_eq!(patent.domain, Domain::Research);
623                    assert!(!patent.metadata.is_empty());
624                }
625            }
626            Err(e) => {
627                // Network errors are acceptable in tests
628                println!("Network test skipped: {}", e);
629            }
630        }
631    }
632
633    #[tokio::test]
634    #[ignore] // Requires network access
635    async fn test_search_by_cpc_integration() {
636        let client = UsptoPatentClient::new().unwrap();
637
638        // Test CPC search for AI/ML patents
639        let result = client.search_by_cpc("G06N", 5).await;
640
641        match result {
642            Ok(patents) => {
643                assert!(patents.len() <= 5);
644                for patent in patents {
645                    let cpc_codes = patent.metadata.get("cpc_codes").map(|s| s.as_str()).unwrap_or("");
646                    // Should contain G06N classification
647                    assert!(
648                        cpc_codes.contains("G06N") || cpc_codes.is_empty(),
649                        "Expected G06N in CPC codes, got: {}",
650                        cpc_codes
651                    );
652                }
653            }
654            Err(e) => {
655                println!("Network test skipped: {}", e);
656            }
657        }
658    }
659
660    #[test]
661    fn test_embedding_dimension() {
662        let client = UsptoPatentClient::new().unwrap();
663        // Verify embedding dimension is set correctly for technical text
664        let embedding = client.embedder.embed_text("test patent description");
665        assert_eq!(embedding.len(), 512);
666    }
667}