1use std::collections::HashSet;
58use std::sync::Arc;
59use tokio::sync::Mutex;
60use tokio::time::{Duration, sleep};
61use tracing::{Span, debug, field, info, instrument, warn};
62
63mod validator;
64pub use validator::{CqlError, CqlErrorKind, FIELD_CODES, ValidatedCql, validate as validate_cql};
65
/// Base URL that [`ClientConfig::default`] puts in `base_url`; every request
/// path built by [`EpoClient`] is appended to the configured base URL.
const DEFAULT_EPO_BASE: &str = "https://ops.epo.org/3.2";
67
/// Errors returned by [`EpoClient`] operations.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum EpoError {
    /// The token request failed: it could not be sent, returned a
    /// non-success status, or its body lacked a usable `access_token`.
    Auth(String),
    /// A per-patent endpoint answered HTTP 404.
    NotFound,
    /// An endpoint answered HTTP 403 or 429; this is the only error the
    /// retry loop retries.
    RateLimit,
    /// Any other unexpected HTTP status (formatted as `"<status>: <body>"`)
    /// or a response body that could not be decoded as JSON.
    Api(String),
    /// A data request could not be sent.
    Network(String),
}

impl std::fmt::Display for EpoError {
    /// Writes the human-readable message for each variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use EpoError::{Api, Auth, Network, NotFound, RateLimit};

        match self {
            // Fixed messages first, then the variants that carry detail text.
            NotFound => f.write_str("Patent not found"),
            RateLimit => f.write_str("EPO rate limit exceeded"),
            Auth(msg) => write!(f, "EPO auth error: {msg}"),
            Api(msg) => write!(f, "EPO API error: {msg}"),
            Network(msg) => write!(f, "Network error: {msg}"),
        }
    }
}

impl std::error::Error for EpoError {}
107
/// One member of a patent family, as returned by [`EpoClient::fetch_family`].
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct FamilyMember {
    /// Identifier of the member; `EpoClient::enrich_family_titles` passes it
    /// to `fetch_biblio` to look the member up again.
    pub patent_id: String,
    /// Country of the publication (presumably the office code; confirm
    /// against `parse_family`, which is not visible here).
    pub country: String,
    /// Kind code of the publication (NOTE(review): exact form set by
    /// `parse_family`; verify there).
    pub kind: String,
    /// Title; may be empty, in which case
    /// `EpoClient::enrich_family_titles` tries to fill it from a biblio fetch.
    pub title: String,
    /// Publication date, when one is available.
    pub publication_date: Option<String>,
}
126
/// A single citation relationship involving a patent.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct Citation {
    /// Identifier of the other document in the relationship.
    pub patent_id: String,
    /// Phase label; `EpoClient::fetch_citing` sets it to `"citing"`. Other
    /// values come from `parse_citations` (not visible here).
    pub phase: String,
    /// Citation category, when present (`fetch_citing` leaves it `None`).
    pub category: Option<String>,
    /// Who made the citation, when present (`fetch_citing` leaves it `None`).
    pub cited_by: Option<String>,
    /// Date attached to the citation; `fetch_citing` uses the citing
    /// document's publication date.
    pub date: Option<String>,
    /// Name attached to the citation; `fetch_citing` uses the citing
    /// document's assignee.
    pub name: Option<String>,
}
153
/// Citation lists for one patent, as produced by `parse_citations`.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct Citations {
    /// Documents cited by the patent (presumably; confirm against
    /// `parse_citations`).
    pub cited: Vec<Citation>,
    /// Documents citing the patent (presumably; confirm against
    /// `parse_citations`).
    pub citing: Vec<Citation>,
}
170
/// Bibliographic data for one patent, as built by [`parse_biblio`].
///
/// `parse_biblio` returns the `Default` value (all fields empty) when the
/// response contains no exchange document.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct PatentBiblio {
    /// Text of `invention-title`, English preferred; empty when absent.
    pub title: String,
    /// Abstract text, English preferred; taken from the chosen document or,
    /// failing that, from the first other document that has one.
    pub abstract_text: String,
    /// First entry of `applicants`, if any.
    pub assignee: Option<String>,
    /// All applicants extracted from the bibliographic data.
    pub applicants: Vec<String>,
    /// Inventors extracted from the bibliographic data.
    pub inventors: Vec<String>,
    /// Date from `application-reference`, when present.
    pub filing_date: Option<String>,
    /// Date from `publication-reference`, when present.
    pub publication_date: Option<String>,
    /// Priority date, when one could be extracted.
    pub priority_date: Option<String>,
    /// Value of the document's `@kind` attribute.
    pub kind_code: Option<String>,
    /// Value of the document's `@family-id` attribute.
    pub family_id: Option<String>,
    /// Output of `extract_classifications` (scheme not visible here).
    pub classification: Vec<String>,
    /// Output of `extract_cpc_classifications`.
    pub cpc_classifications: Vec<String>,
}
224
/// One hit of a search, as built by [`parse_search_results`].
///
/// The fields mirror [`PatentBiblio`] with the addition of `patent_id`.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct SearchResultPatent {
    /// Identifier derived from the publication reference: the `epodoc`
    /// number when available, otherwise `docdb` country + number. Never empty.
    pub patent_id: String,
    /// Text of `invention-title`, English preferred; empty when absent.
    pub title: String,
    /// Abstract text of this document, English preferred; empty when absent.
    pub abstract_text: String,
    /// First entry of `applicants`, if any.
    pub assignee: Option<String>,
    /// All applicants extracted from the bibliographic data.
    pub applicants: Vec<String>,
    /// Inventors extracted from the bibliographic data.
    pub inventors: Vec<String>,
    /// Date from `application-reference`, when present.
    pub filing_date: Option<String>,
    /// Date from `publication-reference`, when present.
    pub publication_date: Option<String>,
    /// Priority date, when one could be extracted.
    pub priority_date: Option<String>,
    /// Value of the document's `@kind` attribute.
    pub kind_code: Option<String>,
    /// Value of the document's `@family-id` attribute.
    pub family_id: Option<String>,
    /// Output of `extract_classifications` (scheme not visible here).
    pub classification: Vec<String>,
    /// Output of `extract_cpc_classifications`.
    pub cpc_classifications: Vec<String>,
}
257
/// Description text of one patent, as returned by
/// [`EpoClient::fetch_description`] (built by `parse_description`, which is
/// not visible here).
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct PatentDescription {
    /// Identifier the description belongs to (`parse_description` receives
    /// the requested id).
    pub patent_id: String,
    /// Language of the text, when known.
    pub language: Option<String>,
    /// The description split into paragraphs.
    pub paragraphs: Vec<DescriptionParagraph>,
    /// The description as a single string (presumably the paragraphs
    /// joined; confirm against `parse_description`).
    pub plain_text: String,
}
284
/// One paragraph of a [`PatentDescription`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct DescriptionParagraph {
    /// Paragraph number or label, when the source provides one.
    pub num: Option<String>,
    /// Paragraph text.
    pub text: String,
}
295
/// Claims of one patent, as returned by [`EpoClient::fetch_claims`] (built
/// by `parse_claims`, which is not visible here).
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct PatentClaims {
    /// Identifier the claims belong to (`parse_claims` receives the
    /// requested id).
    pub patent_id: String,
    /// Language of the text, when known.
    pub language: Option<String>,
    /// The individual claims.
    pub claims: Vec<Claim>,
    /// The claims as a single string (presumably the claims joined; confirm
    /// against `parse_claims`).
    pub plain_text: String,
}
317
/// One claim of a [`PatentClaims`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Claim {
    /// Claim number, when the source provides one.
    pub num: Option<String>,
    /// Claim identifier, when the source provides one.
    pub id: Option<String>,
    /// Claim text.
    pub text: String,
}
333
/// Result page of a search, as built by [`parse_search_results`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SearchResults {
    /// Value of `@total-result-count`; 0 when missing or unparsable.
    pub total_count: u32,
    /// `(begin, end)` read from `ops:range`; each is 0 when missing.
    pub range: (u32, u32),
    /// Parsed hits, with entries lacking an id and repeated ids dropped.
    pub patents: Vec<SearchResultPatent>,
}
347
/// Cached access token together with the instant after which it must no
/// longer be reused.
struct TokenState {
    /// Token string sent as the bearer credential.
    access_token: String,
    /// Reuse deadline: time of issue plus the server-reported lifetime,
    /// shortened by `ClientConfig::token_refresh_buffer`.
    expires_at: std::time::Instant,
}
352
/// Enforces a minimum spacing between consecutive calls that share the pacer.
struct EndpointPacer {
    /// Instant recorded by the most recent `wait_turn`, or `None` before the
    /// first call. The lock is held while a caller sleeps, so waiters are
    /// served one at a time.
    last_call: Mutex<Option<std::time::Instant>>,
    /// Minimum spacing; a zero interval disables pacing.
    interval: Duration,
}
360
/// HTTP client for the EPO service at `ClientConfig::base_url`.
///
/// Handles token acquisition and caching, per-endpoint pacing, retry on
/// rate-limit responses, and a global cap on concurrent operations.
pub struct EpoClient {
    /// Underlying HTTP client.
    http: reqwest::Client,
    /// Sent as the basic-auth user name of the token request.
    consumer_key: String,
    /// Sent as the basic-auth password of the token request.
    consumer_secret: String,
    /// Tunables captured at construction.
    config: ClientConfig,
    /// Cached token; the async lock is held for the whole refresh.
    token: Arc<Mutex<Option<TokenState>>>,
    /// Most recent throttling snapshot parsed from response headers. Uses a
    /// `std::sync::Mutex` and is only locked from synchronous code.
    throttling: Arc<std::sync::Mutex<Option<ThrottlingState>>>,
    /// Pacer for biblio, citations, and family requests.
    pacer_inpadoc: Arc<EndpointPacer>,
    /// Pacer for search requests.
    pacer_search: Arc<EndpointPacer>,
    /// Pacer for description and claims requests.
    pacer_retrieval: Arc<EndpointPacer>,
    /// Pacer applied before a token request.
    pacer_other: Arc<EndpointPacer>,
    /// Limits how many public fetch operations run at once; one permit is
    /// held for the whole operation, including its retries.
    sem: Arc<tokio::sync::Semaphore>,
}
382
/// Tunables for [`EpoClient`]; see the `Default` impl for the stock values.
#[derive(Debug, Clone)]
pub struct ClientConfig {
    /// Prefix of every request URL (default `https://ops.epo.org/3.2`).
    pub base_url: String,
    /// Delays slept before retrying a rate-limited request. Its length
    /// (minimum 1) is the total number of attempts. As the retry loop is
    /// written, attempt `n` (0-based, `n > 0`) sleeps `retry_backoff[n]`, so
    /// the first entry is never slept on.
    pub retry_backoff: Vec<Duration>,
    /// How long before the server-reported expiry a cached token stops being
    /// reused.
    pub token_refresh_buffer: Duration,
    /// Minimum spacing between biblio/citations/family requests.
    pub inpadoc_interval: Duration,
    /// Minimum spacing between search requests.
    pub search_interval: Duration,
    /// Minimum spacing between description/claims requests.
    pub retrieval_interval: Duration,
    /// Minimum spacing applied before token requests.
    pub other_interval: Duration,
    /// Weekly byte usage at which a warning is logged, once, when the
    /// reported usage first reaches it.
    pub weekly_warn_bytes: u64,
    /// Maximum number of concurrent public fetch operations; values below 1
    /// are treated as 1.
    pub max_concurrent: usize,
}
428
/// Overall server load reported at the start of the throttling header.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ThrottlingLoad {
    /// Header label `idle`.
    Idle,
    /// Header label `busy`.
    Busy,
    /// Header label `overloaded`.
    Overloaded,
    /// Header label `service_unavailable`.
    Unavailable,
    /// Any other label, kept as received.
    Other(String),
}

/// Traffic-light colour reported for an endpoint in the throttling header.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ThrottlingColor {
    /// Header label `green`.
    Green,
    /// Header label `yellow`.
    Yellow,
    /// Header label `red`.
    Red,
    /// Header label `black`; treated as exhausted by
    /// [`ThrottlingState::is_exhausted`].
    Black,
}

/// Quota reported for a single endpoint.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct EndpointQuota {
    /// Colour reported for the endpoint.
    pub color: ThrottlingColor,
    /// Number following the colour in the header.
    pub remaining_per_minute: u32,
}

/// Snapshot of the throttling information carried by one response.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ThrottlingState {
    /// Overall load label.
    pub load: ThrottlingLoad,
    /// Per-endpoint quotas keyed by lower-cased endpoint name.
    pub endpoints: std::collections::HashMap<String, EndpointQuota>,
    /// Value of the hourly usage header, when present and numeric.
    pub hour_bytes_used: Option<u64>,
    /// Value of the weekly usage header, when present and numeric.
    pub week_bytes_used: Option<u64>,
}

impl ThrottlingState {
    /// Returns `true` when at least one endpoint is `Black` or has zero
    /// requests remaining.
    pub fn is_exhausted(&self) -> bool {
        for quota in self.endpoints.values() {
            let drained = quota.remaining_per_minute == 0;
            if drained || quota.color == ThrottlingColor::Black {
                return true;
            }
        }
        false
    }

    /// Remaining count of the `inpadoc` endpoint, if it was reported.
    pub fn inpadoc_remaining(&self) -> Option<u32> {
        self.remaining_for("inpadoc")
    }

    /// Remaining count of the `search` endpoint, if it was reported.
    pub fn search_remaining(&self) -> Option<u32> {
        self.remaining_for("search")
    }

    /// Remaining count of the `retrieval` endpoint, if it was reported.
    pub fn retrieval_remaining(&self) -> Option<u32> {
        self.remaining_for("retrieval")
    }

    /// Shared lookup behind the per-endpoint accessors.
    fn remaining_for(&self, endpoint: &str) -> Option<u32> {
        let quota = self.endpoints.get(endpoint)?;
        Some(quota.remaining_per_minute)
    }
}
529
/// Returns how long a token reported to live `expires_in` seconds may be
/// reused when it should be refreshed `buffer` early.
///
/// The subtraction is done on whole `Duration`s, so a sub-second part of
/// `buffer` is honoured rather than dropped. The result saturates at zero
/// when `buffer` is at least as long as the token lifetime.
fn expires_in_with_buffer(expires_in: u64, buffer: Duration) -> Duration {
    Duration::from_secs(expires_in).saturating_sub(buffer)
}
535
536fn emit_threshold_warnings(
539 prev: Option<&ThrottlingState>,
540 new: &ThrottlingState,
541 weekly_warn_bytes: u64,
542) {
543 let prev_load = prev.map(|p| &p.load);
545 if prev_load != Some(&new.load) && new.load != ThrottlingLoad::Idle {
546 warn!(load = ?new.load, "EPO server load level changed");
547 }
548
549 for (name, quota) in &new.endpoints {
551 let prev_color = prev.and_then(|p| p.endpoints.get(name)).map(|q| q.color);
552 let worsened = matches!(
553 (prev_color, quota.color),
554 (
555 Some(ThrottlingColor::Green),
556 ThrottlingColor::Yellow | ThrottlingColor::Red | ThrottlingColor::Black
557 ) | (
558 Some(ThrottlingColor::Yellow),
559 ThrottlingColor::Red | ThrottlingColor::Black
560 ) | (Some(ThrottlingColor::Red), ThrottlingColor::Black)
561 | (
562 None,
563 ThrottlingColor::Yellow | ThrottlingColor::Red | ThrottlingColor::Black
564 )
565 );
566 if worsened {
567 warn!(
568 endpoint = %name,
569 color = ?quota.color,
570 remaining = quota.remaining_per_minute,
571 "EPO endpoint quota worsened"
572 );
573 }
574 }
575
576 let prev_week = prev.and_then(|p| p.week_bytes_used).unwrap_or(0);
578 if let Some(now_week) = new.week_bytes_used
579 && prev_week < weekly_warn_bytes
580 && now_week >= weekly_warn_bytes
581 {
582 warn!(
583 bytes_used = now_week,
584 threshold = weekly_warn_bytes,
585 "EPO weekly quota crossed warn threshold"
586 );
587 }
588}
589
590pub(crate) fn parse_throttling_header(
605 header: &str,
606 hour_used: Option<u64>,
607 week_used: Option<u64>,
608) -> Option<ThrottlingState> {
609 let header = header.trim();
610 let (load_str, rest) = match header.split_once('(') {
611 Some((load, rest)) => (load.trim(), rest.trim_end_matches(')').trim()),
612 None => (header, ""),
613 };
614
615 let load = match load_str.to_ascii_lowercase().as_str() {
616 "idle" => ThrottlingLoad::Idle,
617 "busy" => ThrottlingLoad::Busy,
618 "overloaded" => ThrottlingLoad::Overloaded,
619 "service_unavailable" => ThrottlingLoad::Unavailable,
620 _ => ThrottlingLoad::Other(load_str.to_string()),
621 };
622
623 let mut endpoints = std::collections::HashMap::new();
624 for entry in rest.split(',') {
625 let entry = entry.trim();
626 let Some((name, rest)) = entry.split_once('=') else {
627 continue;
628 };
629 let Some((color_str, remaining_str)) = rest.split_once(':') else {
630 continue;
631 };
632 let color = match color_str.to_ascii_lowercase().as_str() {
633 "green" => ThrottlingColor::Green,
634 "yellow" => ThrottlingColor::Yellow,
635 "red" => ThrottlingColor::Red,
636 "black" => ThrottlingColor::Black,
637 _ => continue,
638 };
639 let Ok(remaining) = remaining_str.trim().parse::<u32>() else {
640 continue;
641 };
642 endpoints.insert(
643 name.trim().to_ascii_lowercase(),
644 EndpointQuota {
645 color,
646 remaining_per_minute: remaining,
647 },
648 );
649 }
650
651 Some(ThrottlingState {
652 load,
653 endpoints,
654 hour_bytes_used: hour_used,
655 week_bytes_used: week_used,
656 })
657}
658
659impl Default for ClientConfig {
660 fn default() -> Self {
661 Self {
662 base_url: DEFAULT_EPO_BASE.to_string(),
663 retry_backoff: vec![
664 Duration::from_millis(250),
665 Duration::from_millis(500),
666 Duration::from_millis(1000),
667 Duration::from_millis(2000),
668 ],
669 token_refresh_buffer: Duration::from_secs(60),
670 inpadoc_interval: Duration::from_millis(1334),
674 search_interval: Duration::from_millis(4000),
675 retrieval_interval: Duration::from_millis(600),
676 other_interval: Duration::from_millis(60),
677 weekly_warn_bytes: (4u64 * 1024 * 1024 * 1024) * 3 / 4,
679 max_concurrent: 8,
683 }
684 }
685}
686
687async fn retry_with_pacer<F, Fut, T>(
692 pacer: &EndpointPacer,
693 backoff: &[Duration],
694 patent_id: &str,
695 endpoint: &str,
696 mut op: F,
697) -> Result<T, EpoError>
698where
699 F: FnMut() -> Fut,
700 Fut: std::future::Future<Output = Result<T, EpoError>>,
701{
702 let attempts = backoff.len().max(1);
703 let span = Span::current();
704 for attempt in 0..attempts {
705 if attempt > 0
706 && let Some(delay) = backoff.get(attempt)
707 {
708 sleep(*delay).await;
709 }
710 pacer.wait_turn().await;
711 match op().await {
712 Ok(v) => {
713 span.record("attempts", attempt as u64 + 1);
714 return Ok(v);
715 }
716 Err(EpoError::RateLimit) if attempt + 1 < attempts => {
717 warn!(
718 endpoint,
719 patent_id,
720 attempt = attempt + 1,
721 "EPO rate limit, will retry"
722 );
723 }
724 Err(e) => {
725 span.record("attempts", attempt as u64 + 1);
726 return Err(e);
727 }
728 }
729 }
730 span.record("attempts", attempts as u64);
731 Err(EpoError::RateLimit)
732}
733
734impl EndpointPacer {
735 fn new(interval: Duration) -> Self {
736 Self {
737 last_call: Mutex::new(None),
738 interval,
739 }
740 }
741
742 async fn wait_turn(&self) {
743 if self.interval.is_zero() {
744 return;
745 }
746 let mut g = self.last_call.lock().await;
747 if let Some(last) = *g {
748 let elapsed = std::time::Instant::now().saturating_duration_since(last);
749 if elapsed < self.interval {
750 sleep(self.interval - elapsed).await;
751 }
752 }
753 *g = Some(std::time::Instant::now());
754 }
755}
756
757impl EpoClient {
758 pub fn new(consumer_key: String, consumer_secret: String, base_url: Option<String>) -> Self {
761 let mut config = ClientConfig::default();
762 if let Some(url) = base_url {
763 config.base_url = url;
764 }
765 Self::with_config(consumer_key, consumer_secret, config)
766 }
767
768 pub fn with_config(
772 consumer_key: String,
773 consumer_secret: String,
774 config: ClientConfig,
775 ) -> Self {
776 let pacer_inpadoc = Arc::new(EndpointPacer::new(config.inpadoc_interval));
777 let pacer_search = Arc::new(EndpointPacer::new(config.search_interval));
778 let pacer_retrieval = Arc::new(EndpointPacer::new(config.retrieval_interval));
779 let pacer_other = Arc::new(EndpointPacer::new(config.other_interval));
780 let sem = Arc::new(tokio::sync::Semaphore::new(config.max_concurrent.max(1)));
782 Self {
783 http: reqwest::Client::new(),
784 consumer_key,
785 consumer_secret,
786 config,
787 token: Arc::new(Mutex::new(None)),
788 throttling: Arc::new(std::sync::Mutex::new(None)),
789 pacer_inpadoc,
790 pacer_search,
791 pacer_retrieval,
792 pacer_other,
793 sem,
794 }
795 }
796
797 pub fn throttling_state(&self) -> Option<ThrottlingState> {
805 self.throttling.lock().ok().and_then(|g| g.clone())
806 }
807
808 fn update_throttling(&self, headers: &reqwest::header::HeaderMap) {
813 let header_value = headers
814 .get("x-throttling-control")
815 .and_then(|v| v.to_str().ok());
816 let hour = headers
817 .get("x-individualquotaperhour-used")
818 .and_then(|v| v.to_str().ok())
819 .and_then(|s| s.parse().ok());
820 let week = headers
821 .get("x-registeredquotaperweek-used")
822 .and_then(|v| v.to_str().ok())
823 .and_then(|s| s.parse().ok());
824
825 let Some(throttling_str) = header_value else {
826 return;
827 };
828 let Some(new_state) = parse_throttling_header(throttling_str, hour, week) else {
829 return;
830 };
831
832 let mut g = match self.throttling.lock() {
833 Ok(g) => g,
834 Err(_) => return, };
836 let prev = g.clone();
837 emit_threshold_warnings(prev.as_ref(), &new_state, self.config.weekly_warn_bytes);
838 *g = Some(new_state);
839 }
840
841 async fn get_token(&self) -> Result<String, EpoError> {
842 let mut guard = self.token.lock().await;
843
844 if let Some(ref state) = *guard
846 && state.expires_at > std::time::Instant::now()
847 {
848 return Ok(state.access_token.clone());
849 }
850
851 self.pacer_other.wait_turn().await;
853
854 let resp = self
856 .http
857 .post(format!("{}/auth/accesstoken", self.config.base_url))
858 .basic_auth(&self.consumer_key, Some(&self.consumer_secret))
859 .form(&[("grant_type", "client_credentials")])
860 .send()
861 .await
862 .map_err(|e| EpoError::Auth(e.to_string()))?;
863
864 self.update_throttling(resp.headers());
868
869 if !resp.status().is_success() {
870 let status = resp.status();
871 let body = resp.text().await.unwrap_or_default();
872 return Err(EpoError::Auth(format!("{status}: {body}")));
873 }
874
875 let json: serde_json::Value = resp
876 .json()
877 .await
878 .map_err(|e| EpoError::Auth(e.to_string()))?;
879
880 let access_token = json["access_token"]
881 .as_str()
882 .ok_or_else(|| EpoError::Auth("No access_token in response".into()))?
883 .to_string();
884
885 let expires_in = json["expires_in"]
886 .as_u64()
887 .or_else(|| json["expires_in"].as_str().and_then(|s| s.parse().ok()))
888 .unwrap_or(1200); let expires_at = std::time::Instant::now()
891 + expires_in_with_buffer(expires_in, self.config.token_refresh_buffer);
892
893 *guard = Some(TokenState {
894 access_token: access_token.clone(),
895 expires_at,
896 });
897
898 Ok(access_token)
899 }
900
901 #[instrument(skip(self), fields(endpoint = "biblio", attempts = field::Empty))]
902 pub async fn fetch_biblio(&self, patent_id: &str) -> Result<PatentBiblio, EpoError> {
903 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
904 retry_with_pacer(
905 &self.pacer_inpadoc,
906 &self.config.retry_backoff,
907 patent_id,
908 "biblio",
909 || self.fetch_biblio_once(patent_id),
910 )
911 .await
912 }
913
914 async fn fetch_biblio_once(&self, patent_id: &str) -> Result<PatentBiblio, EpoError> {
915 Ok(parse_biblio(&self.fetch_biblio_json_once(patent_id).await?))
916 }
917
918 async fn fetch_biblio_json_once(&self, patent_id: &str) -> Result<serde_json::Value, EpoError> {
924 let token = self.get_token().await?;
925
926 let url = format!(
927 "{}/rest-services/published-data/publication/epodoc/{patent_id}/biblio",
928 self.config.base_url
929 );
930
931 let resp = self
932 .http
933 .get(&url)
934 .header("Accept", "application/json")
935 .bearer_auth(&token)
936 .send()
937 .await
938 .map_err(|e| EpoError::Network(e.to_string()))?;
939
940 self.update_throttling(resp.headers());
944
945 match resp.status().as_u16() {
946 200 => {}
947 404 => return Err(EpoError::NotFound),
948 403 | 429 => return Err(EpoError::RateLimit),
949 status => {
950 let body = resp.text().await.unwrap_or_default();
951 return Err(EpoError::Api(format!("{status}: {body}")));
952 }
953 }
954
955 resp.json::<serde_json::Value>()
956 .await
957 .map_err(|e| EpoError::Api(e.to_string()))
958 }
959
960 #[instrument(skip(self), fields(endpoint = "biblio+citations", attempts = field::Empty))]
968 pub async fn fetch_biblio_with_citations(
969 &self,
970 patent_id: &str,
971 ) -> Result<(PatentBiblio, Citations), EpoError> {
972 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
973 let json = retry_with_pacer(
974 &self.pacer_inpadoc,
975 &self.config.retry_backoff,
976 patent_id,
977 "biblio+citations",
978 || self.fetch_biblio_json_once(patent_id),
979 )
980 .await?;
981 let biblio = parse_biblio(&json);
982 let citations = parse_citations(&json);
983 debug!(
984 patent_id,
985 cited = citations.cited.len(),
986 "EPO biblio+citations fetch succeeded"
987 );
988 Ok((biblio, citations))
989 }
990
991 #[instrument(skip(self), fields(endpoint = "citations", attempts = field::Empty))]
992 pub async fn fetch_citations(&self, patent_id: &str) -> Result<Citations, EpoError> {
993 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
994 let json = retry_with_pacer(
995 &self.pacer_inpadoc,
996 &self.config.retry_backoff,
997 patent_id,
998 "citations",
999 || self.fetch_biblio_json_once(patent_id),
1000 )
1001 .await?;
1002 Ok(parse_citations(&json))
1003 }
1004
1005 #[instrument(skip(self, cql), fields(
1006 endpoint = "search",
1007 query = %cql,
1008 range_begin,
1009 range_end,
1010 http_status = field::Empty,
1011 total_count = field::Empty,
1012 returned = field::Empty,
1013 attempts = field::Empty,
1014 ))]
1015 pub async fn fetch_search(
1016 &self,
1017 cql: &str,
1018 range_begin: u32,
1019 range_end: u32,
1020 ) -> Result<SearchResults, EpoError> {
1021 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1022 let val = retry_with_pacer(
1023 &self.pacer_search,
1024 &self.config.retry_backoff,
1025 cql,
1026 "search",
1027 || self.fetch_search_once(cql, range_begin, range_end),
1028 )
1029 .await?;
1030 let span = Span::current();
1031 span.record("total_count", val.total_count);
1032 span.record("returned", val.patents.len());
1033 info!("EPO search fetch succeeded");
1034 Ok(val)
1035 }
1036
1037 async fn fetch_search_once(
1038 &self,
1039 cql: &str,
1040 range_begin: u32,
1041 range_end: u32,
1042 ) -> Result<SearchResults, EpoError> {
1043 let token = self.get_token().await?;
1044
1045 let url = format!(
1046 "{}/rest-services/published-data/search/biblio",
1047 self.config.base_url
1048 );
1049
1050 let resp = self
1051 .http
1052 .get(&url)
1053 .header("Accept", "application/json")
1054 .bearer_auth(&token)
1055 .query(&[("q", cql), ("Range", &format!("{range_begin}-{range_end}"))])
1056 .send()
1057 .await
1058 .map_err(|e| EpoError::Network(e.to_string()))?;
1059
1060 let status = resp.status().as_u16();
1061 Span::current().record("http_status", status);
1062 self.update_throttling(resp.headers());
1063
1064 match status {
1065 200 => {}
1066 404 => {
1067 return Ok(SearchResults {
1068 total_count: 0,
1069 range: (0, 0),
1070 patents: Vec::new(),
1071 });
1072 }
1073 403 | 429 => return Err(EpoError::RateLimit),
1074 other => {
1075 let body = resp.text().await.unwrap_or_default();
1076 return Err(EpoError::Api(format!("{other}: {body}")));
1077 }
1078 }
1079
1080 let json: serde_json::Value = resp
1081 .json()
1082 .await
1083 .map_err(|e| EpoError::Api(e.to_string()))?;
1084
1085 Ok(parse_search_results(&json))
1086 }
1087
1088 #[instrument(skip(self), fields(endpoint = "family", attempts = field::Empty))]
1089 pub async fn fetch_family(&self, patent_id: &str) -> Result<Vec<FamilyMember>, EpoError> {
1090 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1091 retry_with_pacer(
1092 &self.pacer_inpadoc,
1093 &self.config.retry_backoff,
1094 patent_id,
1095 "family",
1096 || self.fetch_family_once(patent_id),
1097 )
1098 .await
1099 }
1100
1101 async fn fetch_family_once(&self, patent_id: &str) -> Result<Vec<FamilyMember>, EpoError> {
1102 let token = self.get_token().await?;
1103
1104 let url = format!(
1105 "{}/rest-services/family/publication/epodoc/{patent_id}",
1106 self.config.base_url
1107 );
1108
1109 let resp = self
1110 .http
1111 .get(&url)
1112 .header("Accept", "application/json")
1113 .bearer_auth(&token)
1114 .send()
1115 .await
1116 .map_err(|e| EpoError::Network(e.to_string()))?;
1117
1118 self.update_throttling(resp.headers());
1119
1120 match resp.status().as_u16() {
1121 200 => {}
1122 404 => return Err(EpoError::NotFound),
1123 403 | 429 => return Err(EpoError::RateLimit),
1124 status => {
1125 let body = resp.text().await.unwrap_or_default();
1126 return Err(EpoError::Api(format!("{status}: {body}")));
1127 }
1128 }
1129
1130 let json: serde_json::Value = resp
1131 .json()
1132 .await
1133 .map_err(|e| EpoError::Api(e.to_string()))?;
1134
1135 let family = parse_family(&json);
1136 debug!(
1137 patent_id,
1138 members = family.len(),
1139 "EPO family fetch succeeded"
1140 );
1141 Ok(family)
1142 }
1143
1144 #[instrument(skip(self), fields(endpoint = "description", attempts = field::Empty))]
1155 pub async fn fetch_description(&self, patent_id: &str) -> Result<PatentDescription, EpoError> {
1156 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1157 retry_with_pacer(
1158 &self.pacer_retrieval,
1159 &self.config.retry_backoff,
1160 patent_id,
1161 "description",
1162 || self.fetch_description_once(patent_id),
1163 )
1164 .await
1165 }
1166
1167 async fn fetch_description_once(&self, patent_id: &str) -> Result<PatentDescription, EpoError> {
1168 let token = self.get_token().await?;
1169 let url = format!(
1170 "{}/rest-services/published-data/publication/epodoc/{patent_id}/description",
1171 self.config.base_url
1172 );
1173 let resp = self
1174 .http
1175 .get(&url)
1176 .header("Accept", "application/json")
1177 .bearer_auth(&token)
1178 .send()
1179 .await
1180 .map_err(|e| EpoError::Network(e.to_string()))?;
1181
1182 self.update_throttling(resp.headers());
1183
1184 match resp.status().as_u16() {
1185 200 => {}
1186 404 => return Err(EpoError::NotFound),
1187 403 | 429 => return Err(EpoError::RateLimit),
1188 status => {
1189 let body = resp.text().await.unwrap_or_default();
1190 return Err(EpoError::Api(format!("{status}: {body}")));
1191 }
1192 }
1193
1194 let json: serde_json::Value = resp
1195 .json()
1196 .await
1197 .map_err(|e| EpoError::Api(e.to_string()))?;
1198 Ok(parse_description(&json, patent_id))
1199 }
1200
1201 #[instrument(skip(self), fields(endpoint = "claims", attempts = field::Empty))]
1214 pub async fn fetch_claims(&self, patent_id: &str) -> Result<PatentClaims, EpoError> {
1215 let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1216 retry_with_pacer(
1217 &self.pacer_retrieval,
1218 &self.config.retry_backoff,
1219 patent_id,
1220 "claims",
1221 || self.fetch_claims_once(patent_id),
1222 )
1223 .await
1224 }
1225
1226 async fn fetch_claims_once(&self, patent_id: &str) -> Result<PatentClaims, EpoError> {
1227 let token = self.get_token().await?;
1228 let url = format!(
1229 "{}/rest-services/published-data/publication/epodoc/{patent_id}/claims",
1230 self.config.base_url
1231 );
1232 let resp = self
1233 .http
1234 .get(&url)
1235 .header("Accept", "application/json")
1236 .bearer_auth(&token)
1237 .send()
1238 .await
1239 .map_err(|e| EpoError::Network(e.to_string()))?;
1240
1241 self.update_throttling(resp.headers());
1242
1243 match resp.status().as_u16() {
1244 200 => {}
1245 404 => return Err(EpoError::NotFound),
1246 403 | 429 => return Err(EpoError::RateLimit),
1247 status => {
1248 let body = resp.text().await.unwrap_or_default();
1249 return Err(EpoError::Api(format!("{status}: {body}")));
1250 }
1251 }
1252
1253 let json: serde_json::Value = resp
1254 .json()
1255 .await
1256 .map_err(|e| EpoError::Api(e.to_string()))?;
1257 Ok(parse_claims(&json, patent_id))
1258 }
1259
1260 #[instrument(skip(self), fields(endpoint = "citing"))]
1271 pub async fn fetch_citing(&self, patent_id: &str, max: u32) -> Result<Vec<Citation>, EpoError> {
1272 let cql = format!("ct={patent_id}");
1273 let results = self.fetch_search(&cql, 1, max.max(1)).await?;
1274 Ok(results
1275 .patents
1276 .into_iter()
1277 .map(|p| Citation {
1278 patent_id: p.patent_id,
1279 phase: "citing".to_string(),
1280 category: None,
1281 cited_by: None,
1282 date: p.publication_date,
1283 name: p.assignee,
1284 })
1285 .collect())
1286 }
1287
1288 pub async fn enrich_family_titles(&self, family: &mut [FamilyMember]) -> usize {
1299 let mut filled = 0usize;
1300 for member in family.iter_mut() {
1301 if !member.title.is_empty() {
1302 continue;
1303 }
1304 match self.fetch_biblio(&member.patent_id).await {
1305 Ok(b) if !b.title.is_empty() => {
1306 member.title = b.title;
1307 filled += 1;
1308 }
1309 Ok(_) => {}
1310 Err(e) => {
1311 debug!(
1312 patent_id = %member.patent_id,
1313 error = %e,
1314 "enrich_family_titles: per-member biblio fetch failed"
1315 );
1316 }
1317 }
1318 }
1319 filled
1320 }
1321}
1322
1323pub fn parse_biblio(json: &serde_json::Value) -> PatentBiblio {
1331 let docs = locate_exchange_docs(json);
1332 let chosen = match pick_preferred_doc(&docs) {
1333 Some(d) => d,
1334 None => return PatentBiblio::default(),
1335 };
1336
1337 let biblio = &chosen["bibliographic-data"];
1338 let applicants = extract_applicants_all(biblio);
1339 let assignee = applicants.first().cloned();
1340
1341 PatentBiblio {
1342 title: extract_text_by_lang(&biblio["invention-title"]),
1343 abstract_text: extract_abstract_with_fallback(chosen, &docs),
1344 assignee,
1345 applicants,
1346 inventors: extract_inventors(biblio),
1347 filing_date: extract_date(&biblio["application-reference"]["document-id"]),
1348 publication_date: extract_date(&biblio["publication-reference"]["document-id"]),
1349 priority_date: extract_priority_date(biblio),
1350 kind_code: chosen["@kind"].as_str().map(str::to_string),
1351 family_id: chosen["@family-id"].as_str().map(str::to_string),
1352 classification: extract_classifications(biblio),
1353 cpc_classifications: extract_cpc_classifications(biblio),
1354 }
1355}
1356
1357fn locate_exchange_docs(json: &serde_json::Value) -> Vec<&serde_json::Value> {
1361 let mut candidate = &json["ops:world-patent-data"]["exchange-documents"]["exchange-document"];
1362 if candidate.is_null() {
1363 candidate = &json["ops:world-patent-data"]["ops:biblio-search"]["ops:search-result"]["exchange-documents"]
1364 ["exchange-document"];
1365 }
1366 if let Some(arr) = candidate.as_array() {
1367 arr.iter().collect()
1368 } else if candidate.is_null() {
1369 Vec::new()
1370 } else {
1371 vec![candidate]
1372 }
1373}
1374
1375fn pick_preferred_doc<'a>(docs: &[&'a serde_json::Value]) -> Option<&'a serde_json::Value> {
1377 let by_b = docs
1378 .iter()
1379 .find(|d| d["@kind"].as_str().is_some_and(|k| k.starts_with('B')));
1380 if let Some(d) = by_b {
1381 return Some(*d);
1382 }
1383 let by_a = docs
1384 .iter()
1385 .find(|d| d["@kind"].as_str().is_some_and(|k| k.starts_with('A')));
1386 if let Some(d) = by_a {
1387 return Some(*d);
1388 }
1389 docs.first().copied()
1390}
1391
1392fn extract_abstract_with_fallback(
1393 chosen: &serde_json::Value,
1394 all: &[&serde_json::Value],
1395) -> String {
1396 let primary = extract_text_by_lang(&chosen["abstract"]);
1397 if !primary.is_empty() {
1398 return primary;
1399 }
1400 for d in all {
1401 let txt = extract_text_by_lang(&d["abstract"]);
1402 if !txt.is_empty() {
1403 return txt;
1404 }
1405 }
1406 String::new()
1407}
1408
1409#[instrument(skip(json), fields(
1421 doc_count = field::Empty,
1422 parsed = field::Empty,
1423 duplicates = field::Empty,
1424 malformed = field::Empty,
1425))]
1426pub fn parse_search_results(json: &serde_json::Value) -> SearchResults {
1427 let biblio_search = &json["ops:world-patent-data"]["ops:biblio-search"];
1428 let search_result = &biblio_search["ops:search-result"];
1429
1430 if search_result.is_null() {
1431 return SearchResults {
1432 total_count: 0,
1433 range: (0, 0),
1434 patents: Vec::new(),
1435 };
1436 }
1437
1438 let total_count = biblio_search["@total-result-count"]
1439 .as_str()
1440 .and_then(|s| s.parse().ok())
1441 .or_else(|| biblio_search["@total-result-count"].as_u64())
1442 .unwrap_or(0) as u32;
1443
1444 let range_obj = if biblio_search["ops:range"].is_object() {
1447 &biblio_search["ops:range"]
1448 } else {
1449 &search_result["ops:range"]
1450 };
1451 let range_begin = range_obj["@begin"]
1452 .as_str()
1453 .and_then(|s| s.parse().ok())
1454 .unwrap_or(0);
1455 let range_end = range_obj["@end"]
1456 .as_str()
1457 .and_then(|s| s.parse().ok())
1458 .unwrap_or(0);
1459
1460 let doc_refs = collect_search_docs(search_result);
1461
1462 let mut patents = Vec::new();
1463 let mut seen = HashSet::new();
1464 let mut duplicates: u64 = 0;
1465 let mut malformed: u64 = 0;
1466 let doc_count = doc_refs.len() as u64;
1467
1468 for doc in &doc_refs {
1469 let biblio = &doc["bibliographic-data"];
1470
1471 let pub_ref = &biblio["publication-reference"]["document-id"];
1472 let patent_id = extract_patent_id_from_doc_ids(pub_ref);
1473
1474 if patent_id.is_empty() {
1475 malformed += 1;
1476 debug!(
1477 pub_ref = %pub_ref,
1478 "EPO search doc missing patent_id"
1479 );
1480 continue;
1481 }
1482 if seen.contains(&patent_id) {
1483 duplicates += 1;
1484 continue;
1485 }
1486 seen.insert(patent_id.clone());
1487
1488 let applicants = extract_applicants_all(biblio);
1489 let assignee = applicants.first().cloned();
1490 patents.push(SearchResultPatent {
1491 patent_id,
1492 title: extract_text_by_lang(&biblio["invention-title"]),
1493 abstract_text: extract_text_by_lang(&doc["abstract"]),
1494 assignee,
1495 applicants,
1496 inventors: extract_inventors(biblio),
1497 filing_date: extract_date(&biblio["application-reference"]["document-id"]),
1498 publication_date: extract_date(&biblio["publication-reference"]["document-id"]),
1499 priority_date: extract_priority_date(biblio),
1500 kind_code: doc["@kind"].as_str().map(str::to_string),
1501 family_id: doc["@family-id"].as_str().map(str::to_string),
1502 classification: extract_classifications(biblio),
1503 cpc_classifications: extract_cpc_classifications(biblio),
1504 });
1505 }
1506
1507 let span = Span::current();
1508 span.record("doc_count", doc_count);
1509 span.record("parsed", patents.len() as u64);
1510 span.record("duplicates", duplicates);
1511 span.record("malformed", malformed);
1512 if duplicates > 0 || malformed > 0 {
1513 debug!(
1514 parsed = patents.len(),
1515 duplicates, malformed, "EPO search parse dropped entries"
1516 );
1517 }
1518
1519 SearchResults {
1520 total_count,
1521 range: (range_begin, range_end),
1522 patents,
1523 }
1524}
1525
1526fn collect_search_docs(search_result: &serde_json::Value) -> Vec<&serde_json::Value> {
1534 let docs_field = &search_result["exchange-documents"];
1535
1536 if let Some(arr) = docs_field.as_array() {
1537 return arr
1540 .iter()
1541 .map(|item| {
1542 let nested = &item["exchange-document"];
1543 if nested.is_null() { item } else { nested }
1544 })
1545 .filter(|d| !d.is_null() && d.is_object())
1546 .collect();
1547 }
1548
1549 let nested = &docs_field["exchange-document"];
1551 if let Some(arr) = nested.as_array() {
1552 arr.iter().collect()
1553 } else if !nested.is_null() {
1554 vec![nested]
1555 } else {
1556 Vec::new()
1557 }
1558}
1559
1560fn extract_patent_id_from_doc_ids(doc_ids: &serde_json::Value) -> String {
1562 if doc_ids.is_null() {
1563 return String::new();
1564 }
1565
1566 let items = if doc_ids.is_array() {
1567 doc_ids.as_array().unwrap().as_slice()
1568 } else {
1569 std::slice::from_ref(doc_ids)
1570 };
1571
1572 for item in items {
1574 let doc_type = item["@document-id-type"].as_str().unwrap_or("");
1575 if doc_type == "epodoc"
1576 && let Some(num) = item["doc-number"]["$"].as_str()
1577 && !num.is_empty()
1578 {
1579 return num.to_string();
1580 }
1581 }
1582
1583 for item in items {
1585 let doc_type = item["@document-id-type"].as_str().unwrap_or("");
1586 if doc_type == "docdb" {
1587 let country = item["country"]["$"].as_str().unwrap_or("");
1588 let num = item["doc-number"]["$"].as_str().unwrap_or("");
1589 if !num.is_empty() {
1590 return format!("{country}{num}");
1591 }
1592 }
1593 }
1594
1595 String::new()
1596}
1597
1598pub(crate) fn extract_text_by_lang(val: &serde_json::Value) -> String {
1602 if val.is_null() {
1603 return String::new();
1604 }
1605
1606 let items = if val.is_array() {
1607 val.as_array().unwrap().as_slice()
1608 } else {
1609 std::slice::from_ref(val)
1610 };
1611
1612 for item in items {
1613 if item["@lang"].as_str() == Some("en")
1614 && let Some(text) = item_text(item)
1615 {
1616 return text.to_string();
1617 }
1618 }
1619
1620 for item in items {
1621 if let Some(text) = item_text(item) {
1622 return text.to_string();
1623 }
1624 }
1625
1626 val.as_str().unwrap_or("").to_string()
1627}
1628
1629fn item_text(item: &serde_json::Value) -> Option<&str> {
1632 item["$"].as_str().or_else(|| item["p"]["$"].as_str())
1633}
1634
1635pub(crate) fn extract_date(doc_ids: &serde_json::Value) -> Option<String> {
1637 if doc_ids.is_null() {
1638 return None;
1639 }
1640
1641 let items = if doc_ids.is_array() {
1642 doc_ids.as_array().unwrap().as_slice()
1643 } else {
1644 std::slice::from_ref(doc_ids)
1645 };
1646
1647 for item in items {
1648 if let Some(date) = item["date"]["$"].as_str() {
1649 if date.len() == 8 {
1651 return Some(format!("{}-{}-{}", &date[..4], &date[4..6], &date[6..8]));
1652 }
1653 return Some(date.to_string());
1654 }
1655 }
1656
1657 None
1658}
1659
1660pub(crate) fn extract_classifications(biblio: &serde_json::Value) -> Vec<String> {
1669 let mut result = Vec::new();
1670 let mut seen = HashSet::new();
1671
1672 let ipc = &biblio["classification-ipc"]["text"];
1673 for item in iter_array_or_one(ipc) {
1674 if let Some(code) = item["$"].as_str() {
1675 let code = code.trim().to_string();
1676 if !code.is_empty() && seen.insert(code.clone()) {
1677 result.push(code);
1678 }
1679 }
1680 }
1681
1682 let ipcr = &biblio["classifications-ipcr"]["classification-ipcr"];
1683 for item in iter_array_or_one(ipcr) {
1684 if let Some(text) = item["text"]["$"].as_str() {
1685 let code = normalize_ipcr_text(text);
1686 if !code.is_empty() && seen.insert(code.clone()) {
1687 result.push(code);
1688 }
1689 }
1690 }
1691
1692 result
1693}
1694
1695pub(crate) fn extract_cpc_classifications(biblio: &serde_json::Value) -> Vec<String> {
1701 let mut result = Vec::new();
1702 let mut seen = HashSet::new();
1703
1704 let items = &biblio["patent-classifications"]["patent-classification"];
1705 for item in iter_array_or_one(items) {
1706 let scheme = item["classification-scheme"]["@scheme"]
1707 .as_str()
1708 .unwrap_or("");
1709 if !matches!(scheme, "CPCI" | "CPCA" | "CPC") {
1711 continue;
1712 }
1713 let section = item["section"]["$"].as_str().unwrap_or("");
1714 let class = item["class"]["$"].as_str().unwrap_or("");
1715 let subclass = item["subclass"]["$"].as_str().unwrap_or("");
1716 let main = item["main-group"]["$"].as_str().unwrap_or("");
1717 let sub = item["subgroup"]["$"].as_str().unwrap_or("");
1718 if section.is_empty() || class.is_empty() || subclass.is_empty() {
1719 continue;
1720 }
1721 let code = format!("{section}{class}{subclass}{main}/{sub}");
1722 if seen.insert(code.clone()) {
1723 result.push(code);
1724 }
1725 }
1726
1727 result
1728}
1729
1730pub(crate) fn extract_inventors(biblio: &serde_json::Value) -> Vec<String> {
1734 let items = &biblio["parties"]["inventors"]["inventor"];
1735 extract_party_names(items)
1736}
1737
1738pub(crate) fn extract_applicants_all(biblio: &serde_json::Value) -> Vec<String> {
1741 let items = &biblio["parties"]["applicants"]["applicant"];
1742 extract_party_names(items)
1743}
1744
1745fn extract_party_names(items: &serde_json::Value) -> Vec<String> {
1746 let mut result = Vec::new();
1747 let mut seen = HashSet::new();
1748
1749 let entries: Vec<&serde_json::Value> = iter_array_or_one(items);
1750 let has_epodoc = entries
1751 .iter()
1752 .any(|e| e["@data-format"].as_str() == Some("epodoc"));
1753
1754 for entry in &entries {
1755 let format = entry["@data-format"].as_str().unwrap_or("");
1756 if has_epodoc && format != "epodoc" {
1759 continue;
1760 }
1761 let name = entry["applicant-name"]["name"]["$"]
1762 .as_str()
1763 .or_else(|| entry["inventor-name"]["name"]["$"].as_str())
1764 .or_else(|| entry["applicant-name"]["$"].as_str())
1765 .or_else(|| entry["inventor-name"]["$"].as_str())
1766 .unwrap_or("")
1767 .trim();
1768 if !name.is_empty() && seen.insert(name.to_string()) {
1769 result.push(name.to_string());
1770 }
1771 }
1772
1773 result
1774}
1775
1776pub(crate) fn extract_priority_date(biblio: &serde_json::Value) -> Option<String> {
1779 let claims = &biblio["priority-claims"]["priority-claim"];
1780 let mut best: Option<String> = None;
1781
1782 for claim in iter_array_or_one(claims) {
1783 let doc_ids = &claim["document-id"];
1784 for did in iter_array_or_one(doc_ids) {
1785 if did["@document-id-type"].as_str() != Some("epodoc") {
1786 continue;
1787 }
1788 let Some(date) = did["date"]["$"].as_str() else {
1789 continue;
1790 };
1791 if date.len() != 8 {
1792 continue;
1793 }
1794 let formatted = format!("{}-{}-{}", &date[..4], &date[4..6], &date[6..8]);
1795 best = Some(match best {
1796 Some(b) if b <= formatted => b,
1797 _ => formatted,
1798 });
1799 }
1800 }
1801
1802 best
1803}
1804
/// Compacts an IPCR classification text into a code without whitespace.
///
/// Whitespace-separated tokens are concatenated until the first token that is
/// a single ASCII letter; that token and everything after it are discarded
/// (e.g. `"B28B 1/ 29 A I"` becomes `"B28B1/29"`).
fn normalize_ipcr_text(text: &str) -> String {
    text.split_whitespace()
        .take_while(|token| {
            // A one-byte token is necessarily a single ASCII character.
            !matches!(token.as_bytes(), [letter] if letter.is_ascii_alphabetic())
        })
        .collect()
}
1819
1820fn iter_array_or_one(val: &serde_json::Value) -> Vec<&serde_json::Value> {
1821 if let Some(arr) = val.as_array() {
1822 arr.iter().collect()
1823 } else if val.is_null() {
1824 Vec::new()
1825 } else {
1826 vec![val]
1827 }
1828}
1829
1830pub fn parse_citations(json: &serde_json::Value) -> Citations {
1841 let mut cited = Vec::new();
1842 let mut seen = HashSet::new();
1843
1844 for doc in locate_exchange_docs(json) {
1845 let refs = &doc["bibliographic-data"]["references-cited"]["citation"];
1846 for cit in iter_array_or_one(refs) {
1847 let patcit = &cit["patcit"];
1848 if patcit.is_null() {
1849 continue;
1850 }
1851
1852 let phase = cit["@cited-phase"].as_str().unwrap_or("").to_string();
1853 let category = cit["category"]["$"]
1854 .as_str()
1855 .map(str::to_string)
1856 .filter(|s| !s.is_empty());
1857 let cited_by = cit["@cited-by"]
1858 .as_str()
1859 .map(str::to_string)
1860 .filter(|s| !s.is_empty());
1861
1862 let doc_ids = iter_array_or_one(&patcit["document-id"]);
1863 let date = extract_date(&patcit["document-id"]);
1864 let name = doc_ids
1865 .iter()
1866 .find_map(|d| d["name"]["$"].as_str())
1867 .map(str::to_string)
1868 .filter(|s| !s.is_empty());
1869
1870 for did in &doc_ids {
1871 let doc_type = did["@document-id-type"].as_str().unwrap_or("");
1872 if doc_type != "epodoc" && doc_type != "docdb" {
1873 continue;
1874 }
1875 let country = did["country"]["$"].as_str().unwrap_or("");
1876 let doc_number = did["doc-number"]["$"].as_str().unwrap_or("");
1877 if doc_number.is_empty() {
1878 continue;
1879 }
1880 let patent_id = if doc_type == "epodoc" {
1881 doc_number.to_string()
1882 } else {
1883 format!("{country}{doc_number}")
1884 };
1885 if !seen.insert(patent_id.clone()) {
1886 continue;
1887 }
1888 cited.push(Citation {
1889 patent_id,
1890 phase: phase.clone(),
1891 category: category.clone(),
1892 cited_by: cited_by.clone(),
1893 date: date.clone(),
1894 name: name.clone(),
1895 });
1896 break;
1897 }
1898 }
1899 }
1900
1901 Citations {
1902 cited,
1903 citing: Vec::new(),
1904 }
1905}
1906
1907pub fn parse_description(json: &serde_json::Value, patent_id: &str) -> PatentDescription {
1920 let desc = &json["ops:world-patent-data"]["ftxt:fulltext-documents"]["ftxt:fulltext-document"]
1921 ["description"];
1922
1923 if desc.is_null() {
1924 return PatentDescription {
1925 patent_id: patent_id.to_string(),
1926 ..Default::default()
1927 };
1928 }
1929
1930 let language = desc["@lang"].as_str().map(str::to_string);
1931
1932 let mut paragraphs = Vec::new();
1933 for item in iter_array_or_one(&desc["p"]) {
1934 let num = item["@num"].as_str().map(str::to_string);
1935 let text = item_text(item).unwrap_or("").trim().to_string();
1936 if !text.is_empty() {
1937 paragraphs.push(DescriptionParagraph { num, text });
1938 }
1939 }
1940
1941 let plain_text = paragraphs
1942 .iter()
1943 .map(|p| p.text.as_str())
1944 .collect::<Vec<_>>()
1945 .join("\n\n");
1946
1947 PatentDescription {
1948 patent_id: patent_id.to_string(),
1949 language,
1950 paragraphs,
1951 plain_text,
1952 }
1953}
1954
1955pub fn parse_claims(json: &serde_json::Value, patent_id: &str) -> PatentClaims {
1970 let claims_field = &json["ops:world-patent-data"]["ftxt:fulltext-documents"]["ftxt:fulltext-document"]
1971 ["claims"];
1972
1973 if claims_field.is_null() {
1974 return PatentClaims {
1975 patent_id: patent_id.to_string(),
1976 ..Default::default()
1977 };
1978 }
1979
1980 let language = claims_field["@lang"].as_str().map(str::to_string);
1981
1982 let mut claims = Vec::new();
1983 for item in iter_array_or_one(&claims_field["claim"]) {
1984 let num = item["@num"].as_str().map(str::to_string);
1985 let id = item["@id"].as_str().map(str::to_string);
1986 let claim_text = &item["claim-text"];
1987
1988 if num.is_none()
1996 && id.is_none()
1997 && let Some(arr) = claim_text.as_array()
1998 && arr.len() > 1
1999 && arr.iter().all(is_leaf_claim_text)
2000 {
2001 for elem in arr {
2002 let text = flatten_claim_text(elem);
2003 if !text.is_empty() {
2004 claims.push(Claim {
2005 num: None,
2006 id: None,
2007 text,
2008 });
2009 }
2010 }
2011 } else {
2012 let text = flatten_claim_text(claim_text);
2013 if !text.is_empty() {
2014 claims.push(Claim { num, id, text });
2015 }
2016 }
2017 }
2018
2019 let plain_text = claims
2020 .iter()
2021 .map(|c| c.text.as_str())
2022 .collect::<Vec<_>>()
2023 .join("\n\n");
2024
2025 PatentClaims {
2026 patent_id: patent_id.to_string(),
2027 language,
2028 claims,
2029 plain_text,
2030 }
2031}
2032
2033fn flatten_claim_text(val: &serde_json::Value) -> String {
2048 let mut out = String::new();
2049 collect_text_fragments(val, &mut out);
2050 out.trim().to_string()
2051}
2052
2053fn is_leaf_claim_text(val: &serde_json::Value) -> bool {
2059 match val {
2060 serde_json::Value::String(_) => true,
2061 serde_json::Value::Object(map) => map.keys().all(|k| k == "$" || k.starts_with('@')),
2062 _ => false,
2063 }
2064}
2065
2066fn collect_text_fragments(val: &serde_json::Value, out: &mut String) {
2070 match val {
2071 serde_json::Value::String(s) => out.push_str(s),
2072 serde_json::Value::Array(arr) => {
2073 for item in arr {
2074 collect_text_fragments(item, out);
2075 }
2076 }
2077 serde_json::Value::Object(map) => {
2078 if let Some(text) = map.get("$") {
2079 collect_text_fragments(text, out);
2080 }
2081 for (k, v) in map {
2082 if k == "$" || k.starts_with('@') {
2083 continue;
2084 }
2085 collect_text_fragments(v, out);
2086 }
2087 }
2088 _ => {}
2089 }
2090}
2091
2092pub fn parse_family(json: &serde_json::Value) -> Vec<FamilyMember> {
2100 let members =
2101 iter_array_or_one(&json["ops:world-patent-data"]["ops:patent-family"]["ops:family-member"]);
2102
2103 let mut result: Vec<FamilyMember> = Vec::new();
2104
2105 for member in members {
2106 let pub_ref = &member["publication-reference"]["document-id"];
2107 let doc_ids = iter_array_or_one(pub_ref);
2108 if doc_ids.is_empty() {
2109 continue;
2110 }
2111
2112 let mut chosen: Option<&serde_json::Value> = None;
2114 for doc_id in &doc_ids {
2115 let doc_type = doc_id["@document-id-type"].as_str().unwrap_or("");
2116 if doc_type == "epodoc" || doc_type == "docdb" {
2117 chosen = Some(doc_id);
2118 break;
2119 }
2120 }
2121 let Some(doc_id) = chosen else { continue };
2122
2123 let doc_type = doc_id["@document-id-type"].as_str().unwrap_or("");
2124 let country = doc_id["country"]["$"].as_str().unwrap_or("").to_string();
2125 let doc_number = doc_id["doc-number"]["$"].as_str().unwrap_or("");
2126 let kind = doc_id["kind"]["$"].as_str().unwrap_or("").to_string();
2127
2128 if doc_number.is_empty() {
2129 continue;
2130 }
2131
2132 let patent_id = if doc_type == "epodoc" {
2133 doc_number.to_string()
2134 } else {
2135 format!("{country}{doc_number}")
2136 };
2137
2138 result.push(FamilyMember {
2139 patent_id,
2140 country,
2141 kind,
2142 title: extract_text_by_lang(&member["invention-title"]),
2143 publication_date: extract_date(pub_ref),
2144 });
2145 }
2146
2147 result
2148}
2149
2150#[cfg(test)]
2151mod tests {
2152 use super::*;
2153
2154 #[test]
2155 fn test_parse_family_empty() {
2156 let json = serde_json::json!({});
2157 let result = parse_family(&json);
2158 assert!(result.is_empty());
2159 }
2160
2161 #[test]
2162 fn test_parse_family_single_member() {
2163 let json = serde_json::json!({
2164 "ops:world-patent-data": {
2165 "ops:patent-family": {
2166 "ops:family-member": {
2167 "publication-reference": {
2168 "document-id": [
2169 {
2170 "@document-id-type": "epodoc",
2171 "country": {"$": "EP"},
2172 "doc-number": {"$": "EP1234567"},
2173 "kind": {"$": "A1"}
2174 }
2175 ]
2176 },
2177 "invention-title": {"$": "Test invention", "@lang": "en"}
2178 }
2179 }
2180 }
2181 });
2182 let result = parse_family(&json);
2183 assert_eq!(result.len(), 1);
2184 assert_eq!(result[0].patent_id, "EP1234567");
2185 assert_eq!(result[0].country, "EP");
2186 assert_eq!(result[0].kind, "A1");
2187 assert_eq!(result[0].title, "Test invention");
2188 }
2189
2190 #[test]
2191 fn test_parse_family_multiple_members() {
2192 let json = serde_json::json!({
2193 "ops:world-patent-data": {
2194 "ops:patent-family": {
2195 "ops:family-member": [
2196 {
2197 "publication-reference": {
2198 "document-id": [
2199 {
2200 "@document-id-type": "epodoc",
2201 "country": {"$": "EP"},
2202 "doc-number": {"$": "EP111"},
2203 "kind": {"$": "A1"}
2204 }
2205 ]
2206 },
2207 "invention-title": {"$": "Invention A", "@lang": "en"}
2208 },
2209 {
2210 "publication-reference": {
2211 "document-id": [
2212 {
2213 "@document-id-type": "epodoc",
2214 "country": {"$": "US"},
2215 "doc-number": {"$": "US222"},
2216 "kind": {"$": "B1"}
2217 }
2218 ]
2219 },
2220 "invention-title": {"$": "Invention B", "@lang": "en"}
2221 }
2222 ]
2223 }
2224 }
2225 });
2226 let result = parse_family(&json);
2227 assert_eq!(result.len(), 2);
2228 assert_eq!(result[0].patent_id, "EP111");
2229 assert_eq!(result[1].patent_id, "US222");
2230 }
2231
2232 #[test]
2233 fn test_parse_family_keeps_all_publications_per_patent() {
2234 let json = serde_json::json!({
2237 "ops:world-patent-data": {
2238 "ops:patent-family": {
2239 "ops:family-member": [
2240 {
2241 "publication-reference": {
2242 "document-id": [
2243 {
2244 "@document-id-type": "epodoc",
2245 "country": {"$": "EP"},
2246 "date": {"$": "20000517"},
2247 "doc-number": {"$": "EP111"},
2248 "kind": {"$": "A1"}
2249 }
2250 ]
2251 },
2252 "invention-title": {"$": "Same patent", "@lang": "en"}
2253 },
2254 {
2255 "publication-reference": {
2256 "document-id": [
2257 {
2258 "@document-id-type": "epodoc",
2259 "country": {"$": "EP"},
2260 "date": {"$": "20030212"},
2261 "doc-number": {"$": "EP111"},
2262 "kind": {"$": "B1"}
2263 }
2264 ]
2265 },
2266 "invention-title": {"$": "Same patent v2", "@lang": "en"}
2267 }
2268 ]
2269 }
2270 }
2271 });
2272 let result = parse_family(&json);
2273 assert_eq!(result.len(), 2);
2274 assert_eq!(result[0].kind, "A1");
2275 assert_eq!(result[0].publication_date.as_deref(), Some("2000-05-17"));
2276 assert_eq!(result[1].kind, "B1");
2277 assert_eq!(result[1].publication_date.as_deref(), Some("2003-02-12"));
2278 }
2279
2280 #[test]
2281 fn test_parse_family_keeps_all_publications_when_no_dates() {
2282 let json = serde_json::json!({
2283 "ops:world-patent-data": {
2284 "ops:patent-family": {
2285 "ops:family-member": [
2286 {
2287 "publication-reference": {
2288 "document-id": [{
2289 "@document-id-type": "epodoc",
2290 "country": {"$": "EP"},
2291 "doc-number": {"$": "EP111"},
2292 "kind": {"$": "A1"}
2293 }]
2294 }
2295 },
2296 {
2297 "publication-reference": {
2298 "document-id": [{
2299 "@document-id-type": "epodoc",
2300 "country": {"$": "EP"},
2301 "doc-number": {"$": "EP111"},
2302 "kind": {"$": "A2"}
2303 }]
2304 }
2305 }
2306 ]
2307 }
2308 }
2309 });
2310 let result = parse_family(&json);
2311 assert_eq!(result.len(), 2);
2312 assert_eq!(result[0].kind, "A1");
2313 assert_eq!(result[1].kind, "A2");
2314 }
2315
2316 #[test]
2317 fn test_parse_search_results_empty() {
2318 let json = serde_json::json!({});
2319 let result = parse_search_results(&json);
2320 assert_eq!(result.total_count, 0);
2321 assert!(result.patents.is_empty());
2322 }
2323
2324 #[test]
2325 fn test_parse_search_results_single() {
2326 let json = serde_json::json!({
2327 "ops:world-patent-data": {
2328 "ops:biblio-search": {
2329 "@total-result-count": "1",
2330 "ops:search-result": {
2331 "ops:range": {"@begin": "1", "@end": "1"},
2332 "exchange-documents": {
2333 "exchange-document": {
2334 "bibliographic-data": {
2335 "invention-title": [{"$": "Test title", "@lang": "en"}],
2336 "parties": {
2337 "applicants": {
2338 "applicant": [{"applicant-name": {"name": {"$": "Acme Corp"}}}]
2339 }
2340 },
2341 "publication-reference": {
2342 "document-id": [{
2343 "@document-id-type": "epodoc",
2344 "doc-number": {"$": "EP1234567"}
2345 }]
2346 },
2347 "application-reference": {
2348 "document-id": [{"date": {"$": "20200115"}}]
2349 },
2350 "classifications-ipcr": {
2351 "classification-ipcr": [{"text": {"$": "H01M 10/48"}}]
2352 }
2353 },
2354 "abstract": [{"$": "Test abstract text", "@lang": "en"}]
2355 }
2356 }
2357 }
2358 }
2359 }
2360 });
2361 let result = parse_search_results(&json);
2362 assert_eq!(result.total_count, 1);
2363 assert_eq!(result.range, (1, 1));
2364 assert_eq!(result.patents.len(), 1);
2365 assert_eq!(result.patents[0].patent_id, "EP1234567");
2366 assert_eq!(result.patents[0].title, "Test title");
2367 assert_eq!(result.patents[0].abstract_text, "Test abstract text");
2368 assert_eq!(result.patents[0].assignee.as_deref(), Some("Acme Corp"));
2369 assert_eq!(result.patents[0].filing_date.as_deref(), Some("2020-01-15"));
2370 }
2371
2372 #[test]
2373 fn test_parse_search_results_multiple() {
2374 let json = serde_json::json!({
2375 "ops:world-patent-data": {
2376 "ops:biblio-search": {
2377 "@total-result-count": "42",
2378 "ops:search-result": {
2379 "ops:range": {"@begin": "1", "@end": "2"},
2380 "exchange-documents": {
2381 "exchange-document": [
2382 {
2383 "bibliographic-data": {
2384 "invention-title": [{"$": "Patent A", "@lang": "en"}],
2385 "publication-reference": {
2386 "document-id": [{
2387 "@document-id-type": "epodoc",
2388 "doc-number": {"$": "EP111"}
2389 }]
2390 }
2391 },
2392 "abstract": [{"$": "Abstract A", "@lang": "en"}]
2393 },
2394 {
2395 "bibliographic-data": {
2396 "invention-title": [{"$": "Patent B", "@lang": "en"}],
2397 "publication-reference": {
2398 "document-id": [{
2399 "@document-id-type": "epodoc",
2400 "doc-number": {"$": "US222"}
2401 }]
2402 }
2403 },
2404 "abstract": [{"$": "Abstract B", "@lang": "en"}]
2405 }
2406 ]
2407 }
2408 }
2409 }
2410 }
2411 });
2412 let result = parse_search_results(&json);
2413 assert_eq!(result.total_count, 42);
2414 assert_eq!(result.range, (1, 2));
2415 assert_eq!(result.patents.len(), 2);
2416 assert_eq!(result.patents[0].patent_id, "EP111");
2417 assert_eq!(result.patents[1].patent_id, "US222");
2418 }
2419
2420 #[test]
2421 fn test_parse_search_results_deduplicates() {
2422 let json = serde_json::json!({
2423 "ops:world-patent-data": {
2424 "ops:biblio-search": {
2425 "@total-result-count": "2",
2426 "ops:search-result": {
2427 "ops:range": {"@begin": "1", "@end": "2"},
2428 "exchange-documents": {
2429 "exchange-document": [
2430 {
2431 "bibliographic-data": {
2432 "invention-title": [{"$": "Same", "@lang": "en"}],
2433 "publication-reference": {
2434 "document-id": [{"@document-id-type": "epodoc", "doc-number": {"$": "EP111"}}]
2435 }
2436 }
2437 },
2438 {
2439 "bibliographic-data": {
2440 "invention-title": [{"$": "Same v2", "@lang": "en"}],
2441 "publication-reference": {
2442 "document-id": [{"@document-id-type": "epodoc", "doc-number": {"$": "EP111"}}]
2443 }
2444 }
2445 }
2446 ]
2447 }
2448 }
2449 }
2450 }
2451 });
2452 let result = parse_search_results(&json);
2453 assert_eq!(
2454 result.patents.len(),
2455 1,
2456 "Duplicate patent IDs should be deduplicated"
2457 );
2458 }
2459
2460 #[test]
2461 fn test_parse_biblio_empty() {
2462 let json = serde_json::json!({});
2463 let result = parse_biblio(&json);
2464 assert!(result.title.is_empty());
2465 assert!(result.abstract_text.is_empty());
2466 assert!(result.applicants.is_empty());
2467 assert!(result.inventors.is_empty());
2468 assert!(result.kind_code.is_none());
2469 assert!(result.family_id.is_none());
2470 assert!(result.cpc_classifications.is_empty());
2471 }
2472
2473 #[test]
2479 fn test_parse_biblio_real_shape_a1_b1() {
2480 let abstract_obj = serde_json::json!({
2481 "@lang": "en",
2482 "p": {"$": "An apparatus for manufacturing green bricks."}
2483 });
2484 let json = serde_json::json!({
2485 "ops:world-patent-data": {
2486 "exchange-documents": {
2487 "exchange-document": [
2488 {
2489 "@country": "EP",
2490 "@doc-number": "1000000",
2491 "@family-id": "19768124",
2492 "@kind": "A1",
2493 "abstract": abstract_obj,
2494 "bibliographic-data": {
2495 "invention-title": [
2496 {"@lang": "de", "$": "DE title"},
2497 {"@lang": "en", "$": "Apparatus for manufacturing green bricks"}
2498 ],
2499 "publication-reference": {
2500 "document-id": [{
2501 "@document-id-type": "epodoc",
2502 "date": {"$": "20000517"},
2503 "doc-number": {"$": "EP1000000"}
2504 }]
2505 },
2506 "application-reference": {
2507 "document-id": [{
2508 "@document-id-type": "epodoc",
2509 "date": {"$": "19991108"},
2510 "doc-number": {"$": "EP19990203729"}
2511 }]
2512 },
2513 "references-cited": {
2514 "citation": [{
2515 "@cited-phase": "national-search-report",
2516 "patcit": {
2517 "document-id": [{
2518 "@document-id-type": "epodoc",
2519 "doc-number": {"$": "EP0680812"}
2520 }]
2521 }
2522 }]
2523 }
2524 }
2525 },
2526 {
2527 "@country": "EP",
2528 "@doc-number": "1000000",
2529 "@family-id": "19768124",
2530 "@kind": "B1",
2531 "abstract": abstract_obj,
2532 "bibliographic-data": {
2533 "invention-title": [
2534 {"@lang": "en", "$": "Apparatus for manufacturing green bricks"}
2535 ],
2536 "publication-reference": {
2537 "document-id": [{
2538 "@document-id-type": "epodoc",
2539 "date": {"$": "20030212"},
2540 "doc-number": {"$": "EP1000000"}
2541 }]
2542 },
2543 "application-reference": {
2544 "document-id": [{
2545 "@document-id-type": "epodoc",
2546 "date": {"$": "19991108"},
2547 "doc-number": {"$": "EP19990203729"}
2548 }]
2549 },
2550 "classification-ipc": {
2551 "text": [
2552 {"$": "B28B1/29"},
2553 {"$": "B28B5/02"}
2554 ]
2555 },
2556 "classifications-ipcr": {
2557 "classification-ipcr": [
2558 {"@sequence": "1", "text": {"$": "B28B 1/ 29 A I"}},
2559 {"@sequence": "2", "text": {"$": "H02P 6/ 08 A I"}}
2560 ]
2561 },
2562 "patent-classifications": {
2563 "patent-classification": [{
2564 "@sequence": "1",
2565 "classification-scheme": {"@office": "EP", "@scheme": "CPCI"},
2566 "section": {"$": "B"},
2567 "class": {"$": "28"},
2568 "subclass": {"$": "B"},
2569 "main-group": {"$": "1"},
2570 "subgroup": {"$": "29"}
2571 }]
2572 },
2573 "priority-claims": {
2574 "priority-claim": {
2575 "document-id": [{
2576 "@document-id-type": "epodoc",
2577 "date": {"$": "19981112"},
2578 "doc-number": {"$": "NL19981010536"}
2579 }]
2580 }
2581 },
2582 "parties": {
2583 "applicants": {
2584 "applicant": [
2585 {
2586 "@data-format": "epodoc",
2587 "@sequence": "1",
2588 "applicant-name": {"name": {"$": "BOER BEHEER NIJMEGEN BV DE [NL]"}}
2589 },
2590 {
2591 "@data-format": "original",
2592 "@sequence": "1",
2593 "applicant-name": {"name": {"$": "BEHEERMAATSCHAPPIJ DE BOER NIJMEGEN B.V."}}
2594 }
2595 ]
2596 },
2597 "inventors": {
2598 "inventor": [{
2599 "@data-format": "epodoc",
2600 "inventor-name": {"name": {"$": "KOSMAN WILHELMUS JACOBUS MARIA [NL]"}}
2601 }]
2602 }
2603 }
2604 }
2605 }
2606 ]
2607 }
2608 }
2609 });
2610
2611 let r = parse_biblio(&json);
2612
2613 assert_eq!(r.title, "Apparatus for manufacturing green bricks");
2614 assert_eq!(
2615 r.abstract_text,
2616 "An apparatus for manufacturing green bricks."
2617 );
2618 assert_eq!(
2619 r.kind_code.as_deref(),
2620 Some("B1"),
2621 "should prefer B1 over A1"
2622 );
2623 assert_eq!(r.family_id.as_deref(), Some("19768124"));
2624 assert_eq!(
2625 r.publication_date.as_deref(),
2626 Some("2003-02-12"),
2627 "publication_date should come from chosen B1 doc, not A1"
2628 );
2629 assert_eq!(r.filing_date.as_deref(), Some("1999-11-08"));
2630 assert_eq!(r.priority_date.as_deref(), Some("1998-11-12"));
2631 assert_eq!(
2632 r.assignee.as_deref(),
2633 Some("BOER BEHEER NIJMEGEN BV DE [NL]")
2634 );
2635 assert_eq!(r.applicants, vec!["BOER BEHEER NIJMEGEN BV DE [NL]"]);
2636 assert_eq!(r.inventors, vec!["KOSMAN WILHELMUS JACOBUS MARIA [NL]"]);
2637 assert_eq!(
2638 r.classification,
2639 vec!["B28B1/29", "B28B5/02", "H02P6/08"],
2640 "should produce full IPC codes (compact form), de-duped across IPC + IPCR blocks"
2641 );
2642 assert_eq!(r.cpc_classifications, vec!["B28B1/29"]);
2643 }
2644
2645 #[test]
2649 fn test_parse_citations_merges_across_a1_b1() {
2650 let json = serde_json::json!({
2651 "ops:world-patent-data": {
2652 "exchange-documents": {
2653 "exchange-document": [
2654 {
2655 "@kind": "A1",
2656 "bibliographic-data": {
2657 "references-cited": {
2658 "citation": {
2659 "@cited-phase": "search",
2660 "patcit": {
2661 "document-id": [{
2662 "@document-id-type": "epodoc",
2663 "doc-number": {"$": "EP0680812"}
2664 }]
2665 }
2666 }
2667 }
2668 }
2669 },
2670 {
2671 "@kind": "B1",
2672 "bibliographic-data": {}
2673 }
2674 ]
2675 }
2676 }
2677 });
2678 let r = parse_citations(&json);
2679 assert_eq!(r.cited.len(), 1);
2680 assert_eq!(r.cited[0].patent_id, "EP0680812");
2681 }
2682
2683 #[test]
2684 fn test_normalize_ipcr_drops_flags() {
2685 assert_eq!(
2687 normalize_ipcr_text("B28B 1/ 29 A I"),
2688 "B28B1/29"
2689 );
2690 assert_eq!(
2692 normalize_ipcr_text("H02P 6/ 08 A I"),
2693 "H02P6/08"
2694 );
2695 assert_eq!(
2697 normalize_ipcr_text("G05B 19/ 4093 A I"),
2698 "G05B19/4093"
2699 );
2700 assert_eq!(
2702 normalize_ipcr_text("G05B 19/ 40938 A I"),
2703 "G05B19/40938"
2704 );
2705 assert_eq!(
2707 normalize_ipcr_text("G06T 7/ 246 A I"),
2708 "G06T7/246"
2709 );
2710 assert_eq!(
2712 normalize_ipcr_text("B28B 7/ 00 A I"),
2713 "B28B7/00"
2714 );
2715 assert_eq!(
2717 normalize_ipcr_text("A61B 3/ 113 A"),
2718 "A61B3/113"
2719 );
2720 assert_eq!(normalize_ipcr_text("B28B 1/29"), "B28B1/29");
2722 assert_eq!(normalize_ipcr_text(""), "");
2724 assert_eq!(normalize_ipcr_text(" "), "");
2725 }
2726
2727 #[test]
2728 fn test_extract_text_by_lang_handles_p_wrapper() {
2729 let val = serde_json::json!({"@lang": "en", "p": {"$": "Abstract body."}});
2731 assert_eq!(extract_text_by_lang(&val), "Abstract body.");
2732
2733 let val = serde_json::json!({"@lang": "en", "$": "Title body."});
2735 assert_eq!(extract_text_by_lang(&val), "Title body.");
2736 }
2737
2738 #[test]
2739 fn test_parse_citations_empty() {
2740 let json = serde_json::json!({});
2741 let result = parse_citations(&json);
2742 assert!(result.cited.is_empty());
2743 assert!(result.citing.is_empty());
2744 }
2745
2746 #[test]
2747 fn test_parse_citations_single() {
2748 let json = serde_json::json!({
2749 "ops:world-patent-data": {
2750 "exchange-documents": {
2751 "exchange-document": {
2752 "bibliographic-data": {
2753 "references-cited": {
2754 "citation": {
2755 "@cited-phase": "search",
2756 "patcit": {
2757 "document-id": [
2758 {
2759 "@document-id-type": "epodoc",
2760 "country": {"$": "US"},
2761 "doc-number": {"$": "US7654321"},
2762 "kind": {"$": "A1"}
2763 }
2764 ]
2765 }
2766 }
2767 }
2768 }
2769 }
2770 }
2771 }
2772 });
2773 let result = parse_citations(&json);
2774 assert_eq!(result.cited.len(), 1);
2775 assert_eq!(result.cited[0].patent_id, "US7654321");
2776 assert_eq!(result.cited[0].phase, "search");
2777 }
2778
2779 #[test]
2780 fn test_parse_citations_multiple() {
2781 let json = serde_json::json!({
2782 "ops:world-patent-data": {
2783 "exchange-documents": {
2784 "exchange-document": {
2785 "bibliographic-data": {
2786 "references-cited": {
2787 "citation": [
2788 {
2789 "@cited-phase": "search",
2790 "patcit": {
2791 "document-id": [
2792 {
2793 "@document-id-type": "epodoc",
2794 "country": {"$": "US"},
2795 "doc-number": {"$": "US111"},
2796 "kind": {"$": "A1"}
2797 }
2798 ]
2799 }
2800 },
2801 {
2802 "@cited-phase": "examination",
2803 "patcit": {
2804 "document-id": [
2805 {
2806 "@document-id-type": "epodoc",
2807 "country": {"$": "EP"},
2808 "doc-number": {"$": "EP222"},
2809 "kind": {"$": "B1"}
2810 }
2811 ]
2812 }
2813 }
2814 ]
2815 }
2816 }
2817 }
2818 }
2819 }
2820 });
2821 let result = parse_citations(&json);
2822 assert_eq!(result.cited.len(), 2);
2823 assert_eq!(result.cited[0].patent_id, "US111");
2824 assert_eq!(result.cited[1].patent_id, "EP222");
2825 }
2826
2827 #[test]
2828 fn test_parse_citations_deduplicates() {
2829 let json = serde_json::json!({
2830 "ops:world-patent-data": {
2831 "exchange-documents": {
2832 "exchange-document": {
2833 "bibliographic-data": {
2834 "references-cited": {
2835 "citation": [
2836 {
2837 "@cited-phase": "search",
2838 "patcit": {
2839 "document-id": [{
2840 "@document-id-type": "epodoc",
2841 "country": {"$": "US"},
2842 "doc-number": {"$": "US111"},
2843 "kind": {"$": "A1"}
2844 }]
2845 }
2846 },
2847 {
2848 "@cited-phase": "examination",
2849 "patcit": {
2850 "document-id": [{
2851 "@document-id-type": "epodoc",
2852 "country": {"$": "US"},
2853 "doc-number": {"$": "US111"},
2854 "kind": {"$": "A2"}
2855 }]
2856 }
2857 }
2858 ]
2859 }
2860 }
2861 }
2862 }
2863 }
2864 });
2865 let result = parse_citations(&json);
2866 assert_eq!(
2867 result.cited.len(),
2868 1,
2869 "Duplicate patent IDs should be deduplicated"
2870 );
2871 }
2872
2873 #[test]
2874 fn test_parse_throttling_idle() {
2875 let s = parse_throttling_header(
2876 "idle (images=green:200, inpadoc=green:60, retrieval=green:200, search=green:30)",
2877 None,
2878 None,
2879 )
2880 .unwrap();
2881 assert_eq!(s.load, ThrottlingLoad::Idle);
2882 assert_eq!(s.endpoints["search"].remaining_per_minute, 30);
2883 assert_eq!(s.endpoints["search"].color, ThrottlingColor::Green);
2884 assert!(!s.is_exhausted());
2885 }
2886
2887 #[test]
2888 fn test_parse_throttling_busy_real_shape() {
2889 let header = "busy (images=green:100, inpadoc=green:45, other=green:1000, retrieval=green:100, search=green:15)";
2891 let s = parse_throttling_header(header, Some(20360), Some(1427448)).unwrap();
2892 assert_eq!(s.load, ThrottlingLoad::Busy);
2893 assert_eq!(s.inpadoc_remaining(), Some(45));
2894 assert_eq!(s.search_remaining(), Some(15));
2895 assert_eq!(s.retrieval_remaining(), Some(100));
2896 assert_eq!(s.hour_bytes_used, Some(20360));
2897 assert_eq!(s.week_bytes_used, Some(1427448));
2898 }
2899
/// An `overloaded` header carrying yellow, red and black endpoints: each
/// colour is parsed, and the black endpoint makes the status exhausted.
#[test]
fn test_parse_throttling_color_progression() {
    let header = "overloaded (search=yellow:5, retrieval=red:1, images=black:0)";
    let status = parse_throttling_header(header, None, None).unwrap();
    assert_eq!(status.load, ThrottlingLoad::Overloaded);

    let endpoints = &status.endpoints;
    assert_eq!(endpoints["search"].color, ThrottlingColor::Yellow);
    assert_eq!(endpoints["retrieval"].color, ThrottlingColor::Red);
    assert_eq!(endpoints["images"].color, ThrottlingColor::Black);
    assert!(status.is_exhausted(), "any black endpoint => exhausted");
}
2914
/// A green endpoint with zero remaining requests still counts as exhausted.
#[test]
fn test_parse_throttling_zero_remaining_treated_as_exhausted() {
    let header = "idle (search=green:0)";
    let status = parse_throttling_header(header, None, None).unwrap();
    assert!(status.is_exhausted());
}
2921
/// An unrecognised load keyword is kept in `ThrottlingLoad::Other`, and the
/// endpoint list that follows it is still parsed.
#[test]
fn test_parse_throttling_unknown_load_preserves_endpoints() {
    let header = "weird_state (inpadoc=green:45)";
    let status = parse_throttling_header(header, None, None).unwrap();

    let expected_load = ThrottlingLoad::Other("weird_state".to_string());
    assert_eq!(
        status.load, expected_load,
        "unknown load string should be preserved verbatim"
    );
    assert_eq!(status.inpadoc_remaining(), Some(45));
}
2932
/// Entries that cannot be parsed (no `=`, or a non-numeric quota) are
/// dropped; the single well-formed entry is the only one left.
#[test]
fn test_parse_throttling_malformed_entries_skipped() {
    let header = "idle (inpadoc=green:45, malformed-entry, search=:notanumber)";
    let status = parse_throttling_header(header, None, None).unwrap();
    assert_eq!(status.endpoints.len(), 1);
    assert_eq!(status.inpadoc_remaining(), Some(45));
}
2945
/// An empty JSON object yields a description that carries only the
/// caller-supplied patent id: no language, no paragraphs, no text.
#[test]
fn test_parse_description_empty_returns_patent_id() {
    let empty = serde_json::json!({});
    let parsed = parse_description(&empty, "EP1");
    assert_eq!(parsed.patent_id, "EP1");
    assert!(parsed.language.is_none());
    assert!(parsed.paragraphs.is_empty());
    assert!(parsed.plain_text.is_empty());
}
2954
/// A description whose `p` is a single object (not an array) produces one
/// paragraph with its number, and surrounding whitespace trimmed from the text.
#[test]
fn test_parse_description_single_paragraph() {
    let description = serde_json::json!({
        "@lang": "en",
        "p": {"@num": "0001", "$": " The invention. "}
    });
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ftxt:fulltext-documents": {
                "ftxt:fulltext-document": {
                    "description": description
                }
            }
        }
    });

    let parsed = parse_description(&json, "EP1");
    assert_eq!(parsed.language.as_deref(), Some("en"));
    assert_eq!(parsed.paragraphs.len(), 1);

    let only = &parsed.paragraphs[0];
    assert_eq!(only.num.as_deref(), Some("0001"));
    assert_eq!(only.text, "The invention.");
    assert_eq!(parsed.plain_text, "The invention.");
}
2977
/// A `p` array with four entries, one of them empty: the empty paragraph is
/// dropped, an entry without `@num` keeps `num == None`, and `plain_text`
/// joins the survivors with a blank line.
#[test]
fn test_parse_description_multi_paragraph_joins_blank_skipped() {
    let paragraphs = serde_json::json!([
        {"@num": "0001", "$": "BACKGROUND OF THE INVENTION"},
        {"@num": "0002", "$": ""},
        {"@num": "0003", "$": "Para three."},
        {"$": "Untagged."}
    ]);
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ftxt:fulltext-documents": {
                "ftxt:fulltext-document": {
                    "description": {"@lang": "en", "p": paragraphs}
                }
            }
        }
    });

    let parsed = parse_description(&json, "EP1000000");
    assert_eq!(parsed.paragraphs.len(), 3, "empty paragraph should be skipped");

    let kept = &parsed.paragraphs;
    assert_eq!(kept[0].text, "BACKGROUND OF THE INVENTION");
    assert_eq!(kept[1].num.as_deref(), Some("0003"));
    assert_eq!(kept[2].num, None);
    assert_eq!(
        parsed.plain_text,
        "BACKGROUND OF THE INVENTION\n\nPara three.\n\nUntagged."
    );
}
3007
/// An empty JSON object yields a claims result that carries only the
/// caller-supplied patent id: no language, no claims, no text.
#[test]
fn test_parse_claims_empty_returns_patent_id() {
    let empty = serde_json::json!({});
    let parsed = parse_claims(&empty, "EP1");
    assert_eq!(parsed.patent_id, "EP1");
    assert!(parsed.language.is_none());
    assert!(parsed.claims.is_empty());
    assert!(parsed.plain_text.is_empty());
}
3016
/// A `claim` given as a single object (not an array) produces one claim
/// with its `@id`, `@num` and text preserved.
#[test]
fn test_parse_claims_single_claim() {
    let claim = serde_json::json!({
        "@id": "claim001",
        "@num": "0001",
        "claim-text": {"$": "1. A widget comprising a sprocket."}
    });
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ftxt:fulltext-documents": {
                "ftxt:fulltext-document": {
                    "claims": {"@lang": "en", "claim": claim}
                }
            }
        }
    });

    let parsed = parse_claims(&json, "EP1");
    assert_eq!(parsed.language.as_deref(), Some("en"));
    assert_eq!(parsed.claims.len(), 1);

    let only = &parsed.claims[0];
    assert_eq!(only.id.as_deref(), Some("claim001"));
    assert_eq!(only.num.as_deref(), Some("0001"));
    assert_eq!(only.text, "1. A widget comprising a sprocket.");
}
3042
/// A `claim-text` array that interleaves bare strings with inline markup
/// objects (`sub`, `i`, `sup`) is flattened into one continuous claim text.
#[test]
fn test_parse_claims_flattens_inline_formatting() {
    let claim_text = serde_json::json!([
        "1. A widget with mass m",
        {"sub": {"$": "0"}},
        " comprising a lattice of ",
        {"i": {"$": "polycrystalline"}},
        " silicon doped at 10",
        {"sup": {"$": "15"}},
        " atoms/cm",
        {"sup": {"$": "3"}},
        "."
    ]);
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ftxt:fulltext-documents": {
                "ftxt:fulltext-document": {
                    "claims": {
                        "@lang": "en",
                        "claim": {"@num": "0001", "claim-text": claim_text}
                    }
                }
            }
        }
    });

    let parsed = parse_claims(&json, "EP1");
    assert_eq!(parsed.claims.len(), 1);
    assert_eq!(
        parsed.claims[0].text,
        "1. A widget with mass m0 comprising a lattice of polycrystalline silicon doped at 1015 atoms/cm3."
    );
}
3082
/// A single `claim` wrapper whose `claim-text` array holds three `$` leaves
/// is split into three separate claims, one per leaf.
#[test]
fn test_parse_claims_packed_claim_set_in_single_wrapper() {
    let packed = serde_json::json!({
        "claim-text": [
            {"$": "1. A widget."},
            {"$": "2. The widget of claim 1."},
            {"$": "3. The widget of claim 2."}
        ]
    });
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ftxt:fulltext-documents": {
                "ftxt:fulltext-document": {
                    "claims": {"@lang": "EN", "claim": packed}
                }
            }
        }
    });

    let parsed = parse_claims(&json, "EP1");
    assert_eq!(parsed.claims.len(), 3, "packed claim set must split per leaf");

    // The length check above guarantees the zip visits all three claims.
    let expected_texts = [
        "1. A widget.",
        "2. The widget of claim 1.",
        "3. The widget of claim 2.",
    ];
    for (claim, expected) in parsed.claims.iter().zip(expected_texts) {
        assert_eq!(claim.text, expected);
    }
}
3115
/// A `claim` array mixing an array-valued `claim-text`, a plain-string
/// `claim-text`, and an empty one: the empty claim is skipped and the
/// remaining claims are separated by a blank line in `plain_text`.
#[test]
fn test_parse_claims_multi_with_array_text_and_formatting() {
    let claims = serde_json::json!([
        {
            "@num": "0001",
            "claim-text": [
                "1. A method comprising:",
                {"$": "applying force to a "},
                {"$": "widget"},
                "."
            ]
        },
        {
            "@num": "0002",
            "claim-text": "2. The method of claim 1, wherein the force is gradient."
        },
        {
            "@num": "0003",
            "claim-text": ""
        }
    ]);
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ftxt:fulltext-documents": {
                "ftxt:fulltext-document": {
                    "claims": {"@lang": "en", "claim": claims}
                }
            }
        }
    });

    let parsed = parse_claims(&json, "EP1");
    assert_eq!(parsed.claims.len(), 2, "empty claim should be skipped");

    let first_text = &parsed.claims[0].text;
    assert!(first_text.contains("widget"));
    assert!(first_text.contains("method"));
    assert_eq!(parsed.claims[1].num.as_deref(), Some("0002"));
    assert!(parsed.plain_text.contains("\n\n"));
}
3156
/// A single `citation` object with both an epodoc and a docdb `document-id`:
/// the parsed citation carries the epodoc number as its id, plus the phase,
/// category, cited-by, a dash-separated date and the name.
#[test]
fn test_parse_citations_extracts_metadata() {
    let epodoc_id = serde_json::json!({
        "@document-id-type": "epodoc",
        "date": {"$": "19951108"},
        "doc-number": {"$": "EP0680812"},
        "name": {"$": "BOER BEHEER NIJMEGEN BV DE"}
    });
    let docdb_id = serde_json::json!({
        "@document-id-type": "docdb",
        "country": {"$": "EP"},
        "doc-number": {"$": "0680812"},
        "kind": {"$": "A1"}
    });
    let citation = serde_json::json!({
        "@cited-by": "examiner",
        "@cited-phase": "national-search-report",
        "category": {"$": "X"},
        "patcit": {"document-id": [epodoc_id, docdb_id]}
    });
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "exchange-documents": {
                "exchange-document": {
                    "bibliographic-data": {
                        "references-cited": {"citation": citation}
                    }
                }
            }
        }
    });

    let parsed = parse_citations(&json);
    assert_eq!(parsed.cited.len(), 1);

    let cited = &parsed.cited[0];
    assert_eq!(cited.patent_id, "EP0680812");
    assert_eq!(cited.phase, "national-search-report");
    assert_eq!(cited.category.as_deref(), Some("X"));
    assert_eq!(cited.cited_by.as_deref(), Some("examiner"));
    assert_eq!(cited.date.as_deref(), Some("1995-11-08"));
    assert_eq!(cited.name.as_deref(), Some("BOER BEHEER NIJMEGEN BV DE"));
}
3204
/// A citation list holding one `nplcit` (non-patent literature) entry and
/// one `patcit` entry: only the patent citation is returned.
#[test]
fn test_parse_citations_skips_non_patent() {
    let non_patent = serde_json::json!({
        "nplcit": {
            "text": {"$": "Non-patent literature reference"}
        }
    });
    let patent = serde_json::json!({
        "@cited-phase": "search",
        "patcit": {
            "document-id": [{
                "@document-id-type": "epodoc",
                "country": {"$": "US"},
                "doc-number": {"$": "US999"},
                "kind": {"$": "A1"}
            }]
        }
    });
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "exchange-documents": {
                "exchange-document": {
                    "bibliographic-data": {
                        "references-cited": {
                            "citation": [non_patent, patent]
                        }
                    }
                }
            }
        }
    });

    let parsed = parse_citations(&json);
    assert_eq!(parsed.cited.len(), 1);
    assert_eq!(parsed.cited[0].patent_id, "US999");
}
3241
3242 fn search_doc(country: &str, num: &str, kind: &str, title: &str) -> serde_json::Value {
3245 serde_json::json!({
3246 "bibliographic-data": {
3247 "publication-reference": {
3248 "document-id": [{
3249 "@document-id-type": "epodoc",
3250 "country": {"$": country},
3251 "doc-number": {"$": num},
3252 "kind": {"$": kind}
3253 }]
3254 },
3255 "invention-title": {"$": title, "@lang": "en"}
3256 }
3257 })
3258 }
3259
/// Parses a two-hit search response in which `exchange-documents` is an
/// array of `{ "exchange-document": … }` wrappers and `ops:range` sits
/// directly under `ops:biblio-search`. The first hit carries full
/// bibliographic data; the second carries only a title and a publication
/// reference.
#[test]
fn test_parse_search_results_real_shape() {
    // Fully populated hit: multilingual title/abstract, dates, parties,
    // IPC and CPC classifications, and a priority claim.
    let full_hit = serde_json::json!({
        "exchange-document": {
            "@country": "CN",
            "@kind": "A",
            "@family-id": "89371517",
            "abstract": [
                {"@lang": "en", "p": {"$": "First abstract."}},
                {"@lang": "ol", "p": {"$": "其他语言"}}
            ],
            "bibliographic-data": {
                "invention-title": [
                    {"@lang": "ol", "$": "其他"},
                    {"@lang": "en", "$": "Hit one"}
                ],
                "publication-reference": {
                    "document-id": [{
                        "@document-id-type": "epodoc",
                        "date": {"$": "20240105"},
                        "doc-number": {"$": "CN111"}
                    }]
                },
                "application-reference": {
                    "document-id": [{
                        "@document-id-type": "epodoc",
                        "date": {"$": "20220526"},
                        "doc-number": {"$": "CN20228037675"}
                    }]
                },
                "classifications-ipcr": {
                    "classification-ipcr": {
                        "@sequence": "1",
                        "text": {"$": "G01C 21/ 34 A I"}
                    }
                },
                "patent-classifications": {
                    "patent-classification": [{
                        "@sequence": "1",
                        "classification-scheme": {"@scheme": "CPCI"},
                        "section": {"$": "G"},
                        "class": {"$": "01"},
                        "subclass": {"$": "C"},
                        "main-group": {"$": "21"},
                        "subgroup": {"$": "34"}
                    }]
                },
                "priority-claims": {
                    "priority-claim": {
                        "document-id": {
                            "@document-id-type": "epodoc",
                            "date": {"$": "20210702"},
                            "doc-number": {"$": "US202163218215P"}
                        }
                    }
                },
                "parties": {
                    "applicants": {
                        "applicant": [
                            {
                                "@data-format": "epodoc",
                                "@sequence": "1",
                                "applicant-name": {"name": {"$": "APPLE INC"}}
                            },
                            {
                                "@data-format": "original",
                                "@sequence": "1",
                                "applicant-name": {"name": {"$": "苹果公司"}}
                            }
                        ]
                    },
                    "inventors": {
                        "inventor": [{
                            "@data-format": "epodoc",
                            "@sequence": "1",
                            "inventor-name": {"name": {"$": "KIM YUN-JAE"}}
                        }]
                    }
                }
            }
        }
    });

    // Sparse hit: title and publication number only.
    let sparse_hit = serde_json::json!({
        "exchange-document": {
            "@country": "US",
            "@kind": "A1",
            "bibliographic-data": {
                "invention-title": {"@lang": "en", "$": "Hit two"},
                "publication-reference": {
                    "document-id": [{
                        "@document-id-type": "epodoc",
                        "doc-number": {"$": "US222"}
                    }]
                }
            }
        }
    });

    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ops:biblio-search": {
                "@total-result-count": "194",
                "ops:range": {"@begin": "1", "@end": "10"},
                "ops:search-result": {
                    "exchange-documents": [full_hit, sparse_hit]
                }
            }
        }
    });

    let results = parse_search_results(&json);
    assert_eq!(results.total_count, 194);
    assert_eq!(
        results.range,
        (1, 10),
        "range should come from biblio-search.ops:range"
    );
    assert_eq!(
        results.patents.len(),
        2,
        "should unwrap each `exchange-document` from the array"
    );

    let first = &results.patents[0];
    assert_eq!(first.patent_id, "CN111");
    assert_eq!(first.title, "Hit one");
    assert_eq!(first.abstract_text, "First abstract.");
    assert_eq!(first.kind_code.as_deref(), Some("A"));
    assert_eq!(first.family_id.as_deref(), Some("89371517"));
    assert_eq!(first.publication_date.as_deref(), Some("2024-01-05"));
    assert_eq!(first.filing_date.as_deref(), Some("2022-05-26"));
    assert_eq!(first.priority_date.as_deref(), Some("2021-07-02"));
    assert_eq!(first.assignee.as_deref(), Some("APPLE INC"));
    assert_eq!(first.applicants, vec!["APPLE INC"]);
    assert_eq!(first.inventors, vec!["KIM YUN-JAE"]);
    assert_eq!(first.classification, vec!["G01C21/34"]);
    assert_eq!(first.cpc_classifications, vec!["G01C21/34"]);

    let second = &results.patents[1];
    assert_eq!(second.patent_id, "US222");
    assert_eq!(second.title, "Hit two");
    assert!(second.applicants.is_empty());
    assert!(second.inventors.is_empty());
    assert!(second.priority_date.is_none());
}
3409
/// Two hits under a single `exchange-documents` object, one of which has no
/// publication reference: only the hit with an id is returned.
#[test]
fn test_parse_search_results_skips_missing_id() {
    let with_id = search_doc("EP", "EP42", "A1", "Good");
    // Title only, with no `publication-reference`.
    let without_id = serde_json::json!({
        "bibliographic-data": {
            "invention-title": {"$": "Orphan", "@lang": "en"}
        }
    });
    let json = serde_json::json!({
        "ops:world-patent-data": {
            "ops:biblio-search": {
                "@total-result-count": "2",
                "ops:search-result": {
                    "ops:range": {"@begin": "1", "@end": "2"},
                    "exchange-documents": {
                        "exchange-document": [with_id, without_id]
                    }
                }
            }
        }
    });

    let parsed = parse_search_results(&json);
    assert_eq!(parsed.patents.len(), 1);
    assert_eq!(parsed.patents[0].patent_id, "EP42");
}
3438}