Skip to main content

epo/
lib.rs

1//! Async client for the [EPO Open Patent Services](https://ops.epo.org) v3.2 REST API.
2//!
3//! Handles OAuth2 token caching, bibliographic / family / citation / search
4//! fetching, deeply nested JSON response parsing, and exponential backoff on
5//! 403/429. Also exposes a lightweight CQL syntax validator so callers can
6//! pre-flight queries before sending them.
7//!
8//! # Quick start
9//!
10//! ```no_run
11//! use epo::{EpoClient, EpoError};
12//!
13//! # async fn run() -> Result<(), EpoError> {
14//! let client = EpoClient::new(
15//!     std::env::var("EPO_CONSUMER_KEY").unwrap(),
16//!     std::env::var("EPO_CONSUMER_SECRET").unwrap(),
17//!     None, // defaults to https://ops.epo.org/3.2
18//! );
19//!
20//! let biblio = client.fetch_biblio("EP1000000").await?;
21//! println!("{}: {}", biblio.title, biblio.assignee.as_deref().unwrap_or(""));
22//! # Ok(())
23//! # }
24//! ```
25//!
26//! # CQL pre-flight
27//!
28//! ```
29//! use epo::validate_cql;
30//!
31//! // Date ranges must use `within`, not `>=`/`<=` — EPO rejects the latter.
32//! let q = r#"pa=Apple AND pd within "20240101,20240107""#;
33//! let v = validate_cql(q).expect("syntactically valid");
34//! assert_eq!(v.fields[0], ("pa".into(), "Apple".into()));
35//! ```
36//!
37//! # Endpoints
38//!
39//! - [`EpoClient::fetch_biblio`] — bibliographic data (title, abstract, applicants, classifications, dates).
40//! - [`EpoClient::fetch_description`] — full-text description body, paragraph-level. Separate quota; may 404 even when biblio works.
41//! - [`EpoClient::fetch_claims`] — full claim set, claim-level. Same separate quota as description.
42//! - [`EpoClient::fetch_family`] — INPADOC family members.
43//! - [`EpoClient::fetch_citations`] — backward citations from the biblio endpoint.
44//! - [`EpoClient::fetch_citing`] — forward citations via CQL search.
45//! - [`EpoClient::fetch_search`] — CQL search with pagination.
46//!
//! Every endpoint method above returns a typed [`EpoError`] variant on failure,
//! with retry built in for `403`/`429`. The token endpoint response is cached
//! and refreshed automatically when it nears expiry.
50//!
51//! # Credentials
52//!
//! You need an EPO OPS consumer key + secret from
//! <https://developers.epo.org>. The free tier limits weekly traffic by bytes;
//! the client paces each endpoint group internally using the per-endpoint
//! intervals in [`ClientConfig`].
56
57use std::collections::HashSet;
58use std::sync::Arc;
59use tokio::sync::Mutex;
60use tokio::time::{Duration, sleep};
61use tracing::{Span, debug, field, info, instrument, warn};
62
63mod validator;
64pub use validator::{CqlError, CqlErrorKind, FIELD_CODES, ValidatedCql, validate as validate_cql};
65
/// Root URL of the OPS v3.2 service, used whenever the caller does not supply
/// its own `base_url`.
const DEFAULT_EPO_BASE: &str = "https://ops.epo.org/3.2";
67
/// Failure type shared by every [`EpoClient`] method.
///
/// The variants separate three concerns: failures worth retrying later
/// (`RateLimit`), failures that will not go away on their own (`NotFound`,
/// `Auth`), and the split between transport problems (`Network`) and
/// unexpected server responses (`Api`).
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum EpoError {
    /// The OAuth2 token endpoint refused the credentials, or its response
    /// carried no usable `access_token`. The string holds the status and
    /// body to aid debugging.
    Auth(String),
    /// A data endpoint answered HTTP 404: EPO's index does not contain the
    /// requested patent.
    NotFound,
    /// HTTP 403 or 429, i.e. the rate limit was hit. Retries happen
    /// automatically according to [`ClientConfig::retry_backoff`]; seeing
    /// this variant means every scheduled attempt was used up.
    RateLimit,
    /// Any remaining non-success HTTP status (a 4xx other than 403/404/429,
    /// or a 5xx). The string holds the status code and response body.
    Api(String),
    /// The request never completed at the transport level (DNS, refused
    /// connection, TLS, timeout). The string is the message of the
    /// underlying [`reqwest`] error.
    Network(String),
}
93
94impl std::fmt::Display for EpoError {
95    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96        match self {
97            EpoError::Auth(msg) => write!(f, "EPO auth error: {msg}"),
98            EpoError::NotFound => write!(f, "Patent not found"),
99            EpoError::RateLimit => write!(f, "EPO rate limit exceeded"),
100            EpoError::Api(msg) => write!(f, "EPO API error: {msg}"),
101            EpoError::Network(msg) => write!(f, "Network error: {msg}"),
102        }
103    }
104}
105
106impl std::error::Error for EpoError {}
107
108/// One member of an INPADOC patent family. Returned by [`EpoClient::fetch_family`].
109#[derive(Debug, Clone, Default, serde::Serialize)]
110pub struct FamilyMember {
111    /// Publication number in epodoc format (e.g. `EP1000000`, `US12131170`).
112    pub patent_id: String,
113    /// ISO country code of the publication office (`EP`, `US`, `JP`, …).
114    pub country: String,
115    /// Publication kind code (`A1`, `B1`, `T1`, `C2`, …). Country-specific
116    /// scheme — `B` is granted in EP/US, `C` in NL, `T` in AT/DE for translations.
117    pub kind: String,
118    /// EPO's family endpoint does not return invention titles. Always empty
119    /// from [`parse_family`]; use [`EpoClient::enrich_family_titles`] to
120    /// fan out per-member biblio fetches if you need them.
121    pub title: String,
122    /// Publication date (`YYYY-MM-DD`) of this publication, when EPO
123    /// supplied one.
124    pub publication_date: Option<String>,
125}
126
127/// One backward or forward citation. Returned in [`Citations::cited`] from
128/// [`EpoClient::fetch_citations`] (with `phase = "search" | "examination" | …`)
129/// and in the result of [`EpoClient::fetch_citing`] (with `phase = "citing"`).
130#[derive(Debug, Clone, serde::Serialize)]
131#[non_exhaustive]
132pub struct Citation {
133    /// Cited patent's publication number in epodoc format.
134    pub patent_id: String,
135    /// Citation phase from `@cited-phase`: `national-search-report`,
136    /// `examination`, `opposition`, `undefined`, … or the literal `"citing"`
137    /// for forward citations from [`EpoClient::fetch_citing`].
138    pub phase: String,
139    /// EPO citation category from `category.$`: `X` (novelty / inventive step),
140    /// `Y` (combination), `A` (technology background), `E` (earlier patent),
141    /// `D` (cited in application), `P`/`L`/`O`/`T` (rare). X and Y are the
142    /// strongest prior-art signals. `None` for forward citations.
143    pub category: Option<String>,
144    /// `@cited-by`: `examiner`, `applicant`, `third-party`, `unknown`. `None`
145    /// for forward citations.
146    pub cited_by: Option<String>,
147    /// Publication date of the cited patent (`YYYY-MM-DD`), when supplied.
148    pub date: Option<String>,
149    /// Applicant/assignee of the cited patent. Useful for detecting
150    /// self-citations (same name as the parent's applicants).
151    pub name: Option<String>,
152}
153
154/// Citation directions for one patent. Returned by [`EpoClient::fetch_citations`].
155///
156/// EPO's biblio endpoint exposes only backward citations; `citing` is always
157/// empty here. Use [`EpoClient::fetch_citing`] to populate forward refs via
158/// a separate search call.
159#[derive(Debug, Clone, serde::Serialize)]
160#[non_exhaustive]
161pub struct Citations {
162    /// Backward citations — patents that THIS patent's application cites
163    /// (submitted by applicant or examiner).
164    pub cited: Vec<Citation>,
165    /// Forward citations — patents that cite THIS patent. Empty from
166    /// [`EpoClient::fetch_citations`]; populated by
167    /// [`EpoClient::fetch_citing`].
168    pub citing: Vec<Citation>,
169}
170
/// Bibliographic record for a single patent, produced by
/// [`EpoClient::fetch_biblio`].
///
/// Anything EPO did not supply shows up as `None` or as an empty
/// string/list. When EPO returns both the application and the granted
/// publication for the same patent, the parser prefers the granted one
/// (kind code starting with `B`).
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct PatentBiblio {
    /// Invention title in English, or in the first available language when
    /// there is no English variant.
    pub title: String,
    /// Complete English abstract taken from the `<abstract><p>` shape.
    /// Empty if EPO shipped no abstract, which happens for some
    /// applications.
    pub abstract_text: String,
    /// The first epodoc-format applicant, i.e. the same value as
    /// `applicants.first().cloned()`. Retained for backward compatibility;
    /// new code should read [`applicants`](Self::applicants) to get the
    /// whole list.
    pub assignee: Option<String>,
    /// Every distinct applicant in epodoc format, falling back to the
    /// original-format list. Multiple entries are common (joint applicants,
    /// subsidiaries, translation variants) and each keeps its country-code
    /// suffix (`[NL]`, `[US]`, …) when known.
    pub applicants: Vec<String>,
    /// Inventors in epodoc format, using the same fallback rule as
    /// [`applicants`](Self::applicants): original-format names are dropped
    /// once an epodoc entry exists.
    pub inventors: Vec<String>,
    /// Date the application was filed (`YYYY-MM-DD`).
    pub filing_date: Option<String>,
    /// Publication date (`YYYY-MM-DD`) of the selected exchange-document.
    /// For an A1+B1 pair this is the B1 (granted) date.
    pub publication_date: Option<String>,
    /// Earliest priority claim date (`YYYY-MM-DD`). With PCT applications
    /// this is usually the original national filing.
    pub priority_date: Option<String>,
    /// Kind code of the publication: `A1`/`A2` (application), `B1`/`B2`
    /// (granted), `T1`/`T2`/`T3` (translations); country-specific.
    pub kind_code: Option<String>,
    /// EPO `@family-id`. A given family-id always maps to the same member
    /// list, so callers of [`EpoClient::fetch_family`] can use it to avoid
    /// an extra round-trip.
    pub family_id: Option<String>,
    /// Full, untruncated IPC codes (`B28B1/29`), with duplicates across the
    /// `classification-ipc` and `classifications-ipcr` blocks removed.
    pub classification: Vec<String>,
    /// CPC codes read from the structured `patent-classifications` block.
    /// De-duplicated across reporting offices, since US/EP/IL/KR/CN may each
    /// assign the same code to the same patent.
    pub cpc_classifications: Vec<String>,
}
224
225/// One result row from [`EpoClient::fetch_search`]. Mirrors [`PatentBiblio`]
226/// but with the patent's own publication number as the primary key.
227#[derive(Debug, Clone, serde::Serialize)]
228#[non_exhaustive]
229pub struct SearchResultPatent {
230    /// Publication number in epodoc format.
231    pub patent_id: String,
232    /// English-language title, with fallback as in [`PatentBiblio::title`].
233    pub title: String,
234    /// English-language abstract body. Empty when EPO didn't include one.
235    pub abstract_text: String,
236    /// First epodoc applicant. Backward-compat shortcut for `applicants[0]`.
237    pub assignee: Option<String>,
238    /// All distinct epodoc-format applicants. See [`PatentBiblio::applicants`].
239    pub applicants: Vec<String>,
240    /// Inventors in epodoc format.
241    pub inventors: Vec<String>,
242    /// Application filing date (`YYYY-MM-DD`).
243    pub filing_date: Option<String>,
244    /// Publication date (`YYYY-MM-DD`).
245    pub publication_date: Option<String>,
246    /// Earliest priority claim date (`YYYY-MM-DD`).
247    pub priority_date: Option<String>,
248    /// Publication kind code (`A1`, `B1`, `T1`, …).
249    pub kind_code: Option<String>,
250    /// EPO `@family-id`.
251    pub family_id: Option<String>,
252    /// Full IPC codes (`B28B1/29`).
253    pub classification: Vec<String>,
254    /// CPC codes from `patent-classifications`.
255    pub cpc_classifications: Vec<String>,
256}
257
258/// Full-text description for a published patent. Returned by
259/// [`EpoClient::fetch_description`].
260///
261/// Description availability is patchy: many older publications and
262/// non-major-jurisdiction filings are biblio-only in EPO's index, even
263/// though `fetch_biblio` succeeds. A successful biblio fetch is **not** a
264/// guarantee that the description endpoint will return 200.
265#[derive(Debug, Clone, Default)]
266#[non_exhaustive]
267pub struct PatentDescription {
268    /// The publication number that was queried.
269    pub patent_id: String,
270    /// Description language as reported by EPO (`@lang` attribute on
271    /// `<description>`). Usually the original filing language. `None` when
272    /// EPO didn't tag the response.
273    pub language: Option<String>,
274    /// Paragraph-level breakdown, preserving EPO's `@num` ordering tags
275    /// (e.g. `"0001"`, `"0002"`, …) when present. Empty paragraphs are
276    /// skipped.
277    pub paragraphs: Vec<DescriptionParagraph>,
278    /// Convenience field: all paragraph texts joined with `"\n\n"`. Use
279    /// this when you want the description as a single blob for embedding
280    /// or LLM context; use [`paragraphs`](Self::paragraphs) when you need
281    /// per-paragraph addressing.
282    pub plain_text: String,
283}
284
/// A single paragraph belonging to a [`PatentDescription`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct DescriptionParagraph {
    /// The `@num` attribute EPO put on the `<p>` element, usually a
    /// zero-padded 4-digit sequence such as `"0001"`. `None` if untagged.
    pub num: Option<String>,
    /// Trimmed body text of the paragraph.
    pub text: String,
}
295
296/// Full claim set for a published patent. Returned by
297/// [`EpoClient::fetch_claims`].
298///
299/// Claims are what define infringement (descriptions are background); a
300/// patent without claims is rare but possible (`fetch_claims` returns
301/// [`EpoError::NotFound`] in that case).
302#[derive(Debug, Clone, Default)]
303#[non_exhaustive]
304pub struct PatentClaims {
305    /// The publication number that was queried.
306    pub patent_id: String,
307    /// Claim set language as reported by EPO (`@lang` attribute).
308    pub language: Option<String>,
309    /// Individual claims in publication order. EPO's `@id`/`@num`
310    /// attributes are preserved when present.
311    pub claims: Vec<Claim>,
312    /// Convenience field: all claim bodies joined with `"\n\n"`. Use
313    /// [`claims`](Self::claims) when you need per-claim addressing
314    /// (claim charts, dependency graphs).
315    pub plain_text: String,
316}
317
/// A single claim inside a [`PatentClaims`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Claim {
    /// The `@num` attribute from EPO (`"0001"`, `"0002"`, …). `None` if
    /// untagged.
    pub num: Option<String>,
    /// The `@id` attribute from EPO (such as `"claim001"`). `None` if
    /// missing.
    pub id: Option<String>,
    /// Body of the claim as plain text. Nested formatting elements that EPO
    /// sometimes emits (italics, subscripts, math) are flattened. For
    /// formatted output, parse the raw JSON yourself with [`parse_claims`].
    pub text: String,
}
333
334/// Page of results from [`EpoClient::fetch_search`].
335#[derive(Debug, Clone)]
336#[non_exhaustive]
337pub struct SearchResults {
338    /// Total number of patents matching the query across the whole index
339    /// (NOT just the returned page). EPO caps very large queries.
340    pub total_count: u32,
341    /// `(begin, end)` of the returned slice (1-indexed, inclusive). For a
342    /// `Range: 1-10` request returning 10 hits, this is `(1, 10)`.
343    pub range: (u32, u32),
344    /// The actual hits, deduplicated by `patent_id` within the page.
345    pub patents: Vec<SearchResultPatent>,
346}
347
/// Cached OAuth2 bearer token together with the instant it stops being used.
struct TokenState {
    /// The `access_token` string sent with data requests.
    access_token: String,
    /// Point in time after which the token should be refreshed.
    expires_at: std::time::Instant,
}
352
353/// Per-endpoint min-interval pacer. Serializes callers via an async mutex
354/// holding the last-call timestamp; each waiter sleeps until
355/// `last_call + interval` before its turn.
356struct EndpointPacer {
357    last_call: Mutex<Option<std::time::Instant>>,
358    interval: Duration,
359}
360
361pub struct EpoClient {
362    http: reqwest::Client,
363    consumer_key: String,
364    consumer_secret: String,
365    config: ClientConfig,
366    token: Arc<Mutex<Option<TokenState>>>,
367    /// Most recent throttling state from EPO response headers. `None` until
368    /// the first successful HTTP call.
369    throttling: Arc<std::sync::Mutex<Option<ThrottlingState>>>,
370    /// Per-endpoint pacers. `inpadoc` covers biblio/family/citations,
371    /// `search` covers search/citing, `retrieval` covers description/claims,
372    /// `other` covers the auth + token refresh path.
373    pacer_inpadoc: Arc<EndpointPacer>,
374    pacer_search: Arc<EndpointPacer>,
375    pacer_retrieval: Arc<EndpointPacer>,
376    pacer_other: Arc<EndpointPacer>,
377    /// Bounds in-flight requests across every endpoint. Built from
378    /// [`ClientConfig::max_concurrent`]; the per-endpoint pacers run beneath
379    /// it so a small permit count + tight pacers compose without conflict.
380    sem: Arc<tokio::sync::Semaphore>,
381}
382
/// Knobs that control an [`EpoClient`]. [`ClientConfig::default`] yields
/// values aligned with EPO's documented per-endpoint ceilings; pass a
/// customised instance to [`EpoClient::with_config`] to change any of them.
#[derive(Debug, Clone)]
pub struct ClientConfig {
    /// Root URL of the EPO OPS API. Default: `https://ops.epo.org/3.2`.
    pub base_url: String,
    /// Delay schedule for retrying `403`/`429` responses. The number of
    /// attempts equals `retry_backoff.len()`. Element 0 is never slept on,
    /// because the per-endpoint pacer already spaces out the first attempt.
    /// Each later element is the extra wait before the matching retry:
    /// `retry_backoff[1]` before attempt 1, `retry_backoff[2]` before
    /// attempt 2, …
    /// Default: `[250 ms, 500 ms, 1000 ms, 2000 ms]`, i.e. 4 attempts with
    /// 500/1000/2000 ms of extra spacing between retries (≈3.5 s in total).
    pub retry_backoff: Vec<Duration>,
    /// Safety margin taken off the OAuth token's `expires_in` so the token
    /// is refreshed early. Default: 60 s.
    pub token_refresh_buffer: Duration,
    /// Minimum gap between calls to *inpadoc* endpoints (biblio, family,
    /// citations). The documented free-tier ceiling is 45 req/min, giving a
    /// default of ≈ 1.334 s/call. Paid plans can go lower.
    pub inpadoc_interval: Duration,
    /// Minimum gap between calls to *search* endpoints (search, citing).
    /// The documented free-tier ceiling is 15 req/min, giving a default of
    /// 4 s/call. This is the tightest of the four budgets.
    pub search_interval: Duration,
    /// Minimum gap between calls to *retrieval* endpoints (description,
    /// claims). The documented free-tier ceiling is 100 req/min, giving a
    /// default of 600 ms/call.
    pub retrieval_interval: Duration,
    /// Minimum gap for the *other* category (auth/status). 1000 req/min
    /// gives a default of 60 ms/call; rarely matters because the token is
    /// cached.
    pub other_interval: Duration,
    /// Soft threshold on weekly bytes used
    /// (`x-registeredquotaperweek-used`). Crossing it logs a `warn!`.
    /// Default: 75% of the 4 GB free-tier ceiling.
    pub weekly_warn_bytes: u64,
    /// Upper bound on in-flight requests over all endpoints. An internal
    /// semaphore permit is taken at the start of every public `fetch_*`
    /// method, so callers need no concurrency guard of their own. It
    /// combines with the per-endpoint pacers: `max_concurrent = 8` allows
    /// up to 8 overlapping requests, each of which still waits for its
    /// endpoint pacer before firing. `0` is clamped to `1` at construction,
    /// since a zero-permit semaphore would deadlock the first call.
    /// Default: 8.
    pub max_concurrent: usize,
}
428
/// Server-wide load level carried in `x-throttling-control`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ThrottlingLoad {
    /// `idle`: normal operation.
    Idle,
    /// `busy`: loaded but still functional.
    Busy,
    /// `overloaded`: callers should slow down voluntarily.
    Overloaded,
    /// `service_unavailable`: callers should back off completely.
    Unavailable,
    /// A load string this client does not recognise, kept verbatim so it
    /// can be logged or surfaced instead of being silently flattened. A
    /// hypothetical future `super_overloaded` state would end up here.
    Other(String),
}
446
/// Quota colour of one endpoint group, as reported in `x-throttling-control`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ThrottlingColor {
    /// Ample headroom left.
    Green,
    /// Time to slow down.
    Yellow,
    /// Near the limit.
    Red,
    /// Quota used up; requests will fail.
    Black,
}
460
461/// Per-endpoint slice of [`ThrottlingState`].
462#[derive(Debug, Clone)]
463#[non_exhaustive]
464pub struct EndpointQuota {
465    /// `green`/`yellow`/`red`/`black` from EPO.
466    pub color: ThrottlingColor,
467    /// Requests-per-minute remaining at this color level. EPO's number;
468    /// not adjusted for any in-flight requests on our side.
469    pub remaining_per_minute: u32,
470}
471
472/// Snapshot of the EPO throttling + quota state, parsed from the response
473/// headers of the most recent successful HTTP call. Returned by
474/// [`EpoClient::throttling_state`].
475///
476/// Updated after every successful response (token + data calls). Never
477/// stale by more than one round-trip; never updated on errors that
478/// returned no headers.
479#[derive(Debug, Clone)]
480#[non_exhaustive]
481pub struct ThrottlingState {
482    /// Overall load level (`idle`/`busy`/`overloaded`/`service_unavailable`).
483    pub load: ThrottlingLoad,
484    /// Per-endpoint allowances. Keys are EPO's canonical names: `images`,
485    /// `inpadoc`, `other`, `retrieval`, `search`. Endpoints absent from
486    /// the response header are absent from this map.
487    pub endpoints: std::collections::HashMap<String, EndpointQuota>,
488    /// `x-individualquotaperhour-used` — bytes pulled this hour.
489    pub hour_bytes_used: Option<u64>,
490    /// `x-registeredquotaperweek-used` — bytes pulled this week. The
491    /// free-tier weekly ceiling is 4 GB.
492    pub week_bytes_used: Option<u64>,
493}
494
495impl ThrottlingState {
496    /// `true` if any endpoint quota slot is unusable: either color is
497    /// `black` (EPO has marked it dead), or `remaining_per_minute == 0`
498    /// regardless of color (e.g. `green:0` — sometimes EPO surfaces this
499    /// transiently). Callers doing batch pre-flight should refuse new
500    /// work in either case; the next request would 403 anyway.
501    pub fn is_exhausted(&self) -> bool {
502        self.endpoints
503            .values()
504            .any(|q| q.color == ThrottlingColor::Black || q.remaining_per_minute == 0)
505    }
506
507    /// Convenience accessor: remaining slots for the `inpadoc` endpoint
508    /// group (biblio / family / citations).
509    pub fn inpadoc_remaining(&self) -> Option<u32> {
510        self.endpoints
511            .get("inpadoc")
512            .map(|q| q.remaining_per_minute)
513    }
514
515    /// Convenience accessor: remaining slots for the `search` endpoint
516    /// group (search / citing).
517    pub fn search_remaining(&self) -> Option<u32> {
518        self.endpoints.get("search").map(|q| q.remaining_per_minute)
519    }
520
521    /// Convenience accessor: remaining slots for the `retrieval` endpoint
522    /// group (description / claims).
523    pub fn retrieval_remaining(&self) -> Option<u32> {
524        self.endpoints
525            .get("retrieval")
526            .map(|q| q.remaining_per_minute)
527    }
528}
529
/// Computes how long a freshly issued token may be used before an early
/// refresh is due.
///
/// `expires_in` is the lifetime in seconds that EPO reports; `buffer` is
/// the safety margin to take off. The subtraction runs at full `Duration`
/// precision, so a sub-second buffer (for example 500 ms in tests) is
/// honoured rather than truncated to whole seconds. It saturates at zero,
/// so a tiny EPO TTL combined with an oversized buffer cannot underflow.
fn expires_in_with_buffer(expires_in: u64, buffer: Duration) -> Duration {
    Duration::from_secs(expires_in).saturating_sub(buffer)
}
535
536/// Compare new throttling state against previous, emit `warn!` on the
537/// transitions worth waking up an operator for.
538fn emit_threshold_warnings(
539    prev: Option<&ThrottlingState>,
540    new: &ThrottlingState,
541    weekly_warn_bytes: u64,
542) {
543    // Load level transition (idle → busy → overloaded → unavailable).
544    let prev_load = prev.map(|p| &p.load);
545    if prev_load != Some(&new.load) && new.load != ThrottlingLoad::Idle {
546        warn!(load = ?new.load, "EPO server load level changed");
547    }
548
549    // Per-endpoint color worsening.
550    for (name, quota) in &new.endpoints {
551        let prev_color = prev.and_then(|p| p.endpoints.get(name)).map(|q| q.color);
552        let worsened = matches!(
553            (prev_color, quota.color),
554            (
555                Some(ThrottlingColor::Green),
556                ThrottlingColor::Yellow | ThrottlingColor::Red | ThrottlingColor::Black
557            ) | (
558                Some(ThrottlingColor::Yellow),
559                ThrottlingColor::Red | ThrottlingColor::Black
560            ) | (Some(ThrottlingColor::Red), ThrottlingColor::Black)
561                | (
562                    None,
563                    ThrottlingColor::Yellow | ThrottlingColor::Red | ThrottlingColor::Black
564                )
565        );
566        if worsened {
567            warn!(
568                endpoint = %name,
569                color = ?quota.color,
570                remaining = quota.remaining_per_minute,
571                "EPO endpoint quota worsened"
572            );
573        }
574    }
575
576    // Weekly bytes crossing the warn threshold (rising edge only).
577    let prev_week = prev.and_then(|p| p.week_bytes_used).unwrap_or(0);
578    if let Some(now_week) = new.week_bytes_used
579        && prev_week < weekly_warn_bytes
580        && now_week >= weekly_warn_bytes
581    {
582        warn!(
583            bytes_used = now_week,
584            threshold = weekly_warn_bytes,
585            "EPO weekly quota crossed warn threshold"
586        );
587    }
588}
589
590/// Parse EPO's `x-throttling-control` header into a [`ThrottlingState`].
591///
592/// Format: `<load> (<endpoint>=<color>:<remaining>, ...)`. Examples:
593///
594/// ```text
595/// idle (images=green:200, inpadoc=green:60, retrieval=green:200, search=green:30)
596/// busy (images=green:100, inpadoc=yellow:30, search=red:5)
597/// overloaded (search=black:0)
598/// ```
599///
600/// `hour_used` and `week_used` are filled by the caller from the
601/// `x-individualquotaperhour-used` / `x-registeredquotaperweek-used`
602/// headers. Returns `None` when the header is malformed beyond recognition;
603/// callers treat that as "no update".
604pub(crate) fn parse_throttling_header(
605    header: &str,
606    hour_used: Option<u64>,
607    week_used: Option<u64>,
608) -> Option<ThrottlingState> {
609    let header = header.trim();
610    let (load_str, rest) = match header.split_once('(') {
611        Some((load, rest)) => (load.trim(), rest.trim_end_matches(')').trim()),
612        None => (header, ""),
613    };
614
615    let load = match load_str.to_ascii_lowercase().as_str() {
616        "idle" => ThrottlingLoad::Idle,
617        "busy" => ThrottlingLoad::Busy,
618        "overloaded" => ThrottlingLoad::Overloaded,
619        "service_unavailable" => ThrottlingLoad::Unavailable,
620        _ => ThrottlingLoad::Other(load_str.to_string()),
621    };
622
623    let mut endpoints = std::collections::HashMap::new();
624    for entry in rest.split(',') {
625        let entry = entry.trim();
626        let Some((name, rest)) = entry.split_once('=') else {
627            continue;
628        };
629        let Some((color_str, remaining_str)) = rest.split_once(':') else {
630            continue;
631        };
632        let color = match color_str.to_ascii_lowercase().as_str() {
633            "green" => ThrottlingColor::Green,
634            "yellow" => ThrottlingColor::Yellow,
635            "red" => ThrottlingColor::Red,
636            "black" => ThrottlingColor::Black,
637            _ => continue,
638        };
639        let Ok(remaining) = remaining_str.trim().parse::<u32>() else {
640            continue;
641        };
642        endpoints.insert(
643            name.trim().to_ascii_lowercase(),
644            EndpointQuota {
645                color,
646                remaining_per_minute: remaining,
647            },
648        );
649    }
650
651    Some(ThrottlingState {
652        load,
653        endpoints,
654        hour_bytes_used: hour_used,
655        week_bytes_used: week_used,
656    })
657}
658
659impl Default for ClientConfig {
660    fn default() -> Self {
661        Self {
662            base_url: DEFAULT_EPO_BASE.to_string(),
663            retry_backoff: vec![
664                Duration::from_millis(250),
665                Duration::from_millis(500),
666                Duration::from_millis(1000),
667                Duration::from_millis(2000),
668            ],
669            token_refresh_buffer: Duration::from_secs(60),
670            // Defaults derived from EPO OPS free-tier per-minute ceilings:
671            // inpadoc=45 → 1334ms, search=15 → 4000ms,
672            // retrieval=100 → 600ms, other=1000 → 60ms.
673            inpadoc_interval: Duration::from_millis(1334),
674            search_interval: Duration::from_millis(4000),
675            retrieval_interval: Duration::from_millis(600),
676            other_interval: Duration::from_millis(60),
677            // 75% of 4 GB free-tier weekly ceiling.
678            weekly_warn_bytes: (4u64 * 1024 * 1024 * 1024) * 3 / 4,
679            // 8 in-flight requests is the sweet spot for the free-tier
680            // pacers: enough overlap that I/O dominates, not so many that
681            // the pacers serialise into a single-file queue.
682            max_concurrent: 8,
683        }
684    }
685}
686
687/// Shared retry-with-pacer wrapper. Each attempt waits for the per-endpoint
688/// pacer before firing; failed attempts that returned [`EpoError::RateLimit`]
689/// add an extra sleep from `backoff` (skipping `backoff[0]` since the pacer
690/// already enforces baseline pacing). Stops after `backoff.len()` attempts.
691async fn retry_with_pacer<F, Fut, T>(
692    pacer: &EndpointPacer,
693    backoff: &[Duration],
694    patent_id: &str,
695    endpoint: &str,
696    mut op: F,
697) -> Result<T, EpoError>
698where
699    F: FnMut() -> Fut,
700    Fut: std::future::Future<Output = Result<T, EpoError>>,
701{
702    let attempts = backoff.len().max(1);
703    let span = Span::current();
704    for attempt in 0..attempts {
705        if attempt > 0
706            && let Some(delay) = backoff.get(attempt)
707        {
708            sleep(*delay).await;
709        }
710        pacer.wait_turn().await;
711        match op().await {
712            Ok(v) => {
713                span.record("attempts", attempt as u64 + 1);
714                return Ok(v);
715            }
716            Err(EpoError::RateLimit) if attempt + 1 < attempts => {
717                warn!(
718                    endpoint,
719                    patent_id,
720                    attempt = attempt + 1,
721                    "EPO rate limit, will retry"
722                );
723            }
724            Err(e) => {
725                span.record("attempts", attempt as u64 + 1);
726                return Err(e);
727            }
728        }
729    }
730    span.record("attempts", attempts as u64);
731    Err(EpoError::RateLimit)
732}
733
734impl EndpointPacer {
735    fn new(interval: Duration) -> Self {
736        Self {
737            last_call: Mutex::new(None),
738            interval,
739        }
740    }
741
742    async fn wait_turn(&self) {
743        if self.interval.is_zero() {
744            return;
745        }
746        let mut g = self.last_call.lock().await;
747        if let Some(last) = *g {
748            let elapsed = std::time::Instant::now().saturating_duration_since(last);
749            if elapsed < self.interval {
750                sleep(self.interval - elapsed).await;
751            }
752        }
753        *g = Some(std::time::Instant::now());
754    }
755}
756
757impl EpoClient {
758    /// Construct a client with default tunables. `base_url` defaults to
759    /// `https://ops.epo.org/3.2` when `None`.
760    pub fn new(consumer_key: String, consumer_secret: String, base_url: Option<String>) -> Self {
761        let mut config = ClientConfig::default();
762        if let Some(url) = base_url {
763            config.base_url = url;
764        }
765        Self::with_config(consumer_key, consumer_secret, config)
766    }
767
768    /// Construct a client with explicit [`ClientConfig`]. Use this to tune
769    /// retry/backoff for your own quotas, point at a mock server, or shorten
770    /// the token-refresh buffer for tests.
771    pub fn with_config(
772        consumer_key: String,
773        consumer_secret: String,
774        config: ClientConfig,
775    ) -> Self {
776        let pacer_inpadoc = Arc::new(EndpointPacer::new(config.inpadoc_interval));
777        let pacer_search = Arc::new(EndpointPacer::new(config.search_interval));
778        let pacer_retrieval = Arc::new(EndpointPacer::new(config.retrieval_interval));
779        let pacer_other = Arc::new(EndpointPacer::new(config.other_interval));
780        // Zero-permit semaphore would deadlock the first `acquire`; clamp.
781        let sem = Arc::new(tokio::sync::Semaphore::new(config.max_concurrent.max(1)));
782        Self {
783            http: reqwest::Client::new(),
784            consumer_key,
785            consumer_secret,
786            config,
787            token: Arc::new(Mutex::new(None)),
788            throttling: Arc::new(std::sync::Mutex::new(None)),
789            pacer_inpadoc,
790            pacer_search,
791            pacer_retrieval,
792            pacer_other,
793            sem,
794        }
795    }
796
797    /// Snapshot of EPO's throttling + quota state from the most recent
798    /// successful response. `None` until the first call completes.
799    ///
800    /// Updated after every successful HTTP round-trip; never updated on
801    /// errors that returned no headers. Use this for batch pre-flight
802    /// (refuse new work when [`ThrottlingState::is_exhausted`]) and for
803    /// metrics dashboards.
804    pub fn throttling_state(&self) -> Option<ThrottlingState> {
805        self.throttling.lock().ok().and_then(|g| g.clone())
806    }
807
808    /// Extract throttling/quota headers from a response and update the
809    /// snapshot. Logs `warn!` on transitions that callers should care
810    /// about: load going non-idle, any endpoint going non-green, weekly
811    /// bytes crossing [`ClientConfig::weekly_warn_bytes`].
812    fn update_throttling(&self, headers: &reqwest::header::HeaderMap) {
813        let header_value = headers
814            .get("x-throttling-control")
815            .and_then(|v| v.to_str().ok());
816        let hour = headers
817            .get("x-individualquotaperhour-used")
818            .and_then(|v| v.to_str().ok())
819            .and_then(|s| s.parse().ok());
820        let week = headers
821            .get("x-registeredquotaperweek-used")
822            .and_then(|v| v.to_str().ok())
823            .and_then(|s| s.parse().ok());
824
825        let Some(throttling_str) = header_value else {
826            return;
827        };
828        let Some(new_state) = parse_throttling_header(throttling_str, hour, week) else {
829            return;
830        };
831
832        let mut g = match self.throttling.lock() {
833            Ok(g) => g,
834            Err(_) => return, // poisoned lock, skip silently
835        };
836        let prev = g.clone();
837        emit_threshold_warnings(prev.as_ref(), &new_state, self.config.weekly_warn_bytes);
838        *g = Some(new_state);
839    }
840
841    async fn get_token(&self) -> Result<String, EpoError> {
842        let mut guard = self.token.lock().await;
843
844        // Return cached token if still valid
845        if let Some(ref state) = *guard
846            && state.expires_at > std::time::Instant::now()
847        {
848            return Ok(state.access_token.clone());
849        }
850
851        // Token endpoint counts as `other` quota — pace independently.
852        self.pacer_other.wait_turn().await;
853
854        // Fetch new token
855        let resp = self
856            .http
857            .post(format!("{}/auth/accesstoken", self.config.base_url))
858            .basic_auth(&self.consumer_key, Some(&self.consumer_secret))
859            .form(&[("grant_type", "client_credentials")])
860            .send()
861            .await
862            .map_err(|e| EpoError::Auth(e.to_string()))?;
863
864        // EPO sets throttling/quota headers on auth-error responses too —
865        // auth counts against the `other` quota — so capture before any
866        // early-return to keep the snapshot fresh on 4xx.
867        self.update_throttling(resp.headers());
868
869        if !resp.status().is_success() {
870            let status = resp.status();
871            let body = resp.text().await.unwrap_or_default();
872            return Err(EpoError::Auth(format!("{status}: {body}")));
873        }
874
875        let json: serde_json::Value = resp
876            .json()
877            .await
878            .map_err(|e| EpoError::Auth(e.to_string()))?;
879
880        let access_token = json["access_token"]
881            .as_str()
882            .ok_or_else(|| EpoError::Auth("No access_token in response".into()))?
883            .to_string();
884
885        let expires_in = json["expires_in"]
886            .as_u64()
887            .or_else(|| json["expires_in"].as_str().and_then(|s| s.parse().ok()))
888            .unwrap_or(1200); // default 20 min
889
890        let expires_at = std::time::Instant::now()
891            + expires_in_with_buffer(expires_in, self.config.token_refresh_buffer);
892
893        *guard = Some(TokenState {
894            access_token: access_token.clone(),
895            expires_at,
896        });
897
898        Ok(access_token)
899    }
900
901    #[instrument(skip(self), fields(endpoint = "biblio", attempts = field::Empty))]
902    pub async fn fetch_biblio(&self, patent_id: &str) -> Result<PatentBiblio, EpoError> {
903        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
904        retry_with_pacer(
905            &self.pacer_inpadoc,
906            &self.config.retry_backoff,
907            patent_id,
908            "biblio",
909            || self.fetch_biblio_once(patent_id),
910        )
911        .await
912    }
913
914    async fn fetch_biblio_once(&self, patent_id: &str) -> Result<PatentBiblio, EpoError> {
915        Ok(parse_biblio(&self.fetch_biblio_json_once(patent_id).await?))
916    }
917
918    /// One-shot HTTP call to the biblio endpoint, returning the raw JSON.
919    /// Shared by [`fetch_biblio`](Self::fetch_biblio),
920    /// [`fetch_citations`](Self::fetch_citations), and
921    /// [`fetch_biblio_with_citations`](Self::fetch_biblio_with_citations) so
922    /// callers that need both biblio + citations only pay one round-trip.
923    async fn fetch_biblio_json_once(&self, patent_id: &str) -> Result<serde_json::Value, EpoError> {
924        let token = self.get_token().await?;
925
926        let url = format!(
927            "{}/rest-services/published-data/publication/epodoc/{patent_id}/biblio",
928            self.config.base_url
929        );
930
931        let resp = self
932            .http
933            .get(&url)
934            .header("Accept", "application/json")
935            .bearer_auth(&token)
936            .send()
937            .await
938            .map_err(|e| EpoError::Network(e.to_string()))?;
939
940        // Update before the status match so 403/429 responses (which carry
941        // fresh throttling headers — exactly when callers most need them)
942        // refresh the snapshot before we early-return.
943        self.update_throttling(resp.headers());
944
945        match resp.status().as_u16() {
946            200 => {}
947            404 => return Err(EpoError::NotFound),
948            403 | 429 => return Err(EpoError::RateLimit),
949            status => {
950                let body = resp.text().await.unwrap_or_default();
951                return Err(EpoError::Api(format!("{status}: {body}")));
952            }
953        }
954
955        resp.json::<serde_json::Value>()
956            .await
957            .map_err(|e| EpoError::Api(e.to_string()))
958    }
959
960    /// Fetch biblio + citations in a single HTTP round-trip.
961    ///
962    /// EPO's biblio endpoint already includes the `references-cited` block,
963    /// so calling [`fetch_biblio`](Self::fetch_biblio) followed by
964    /// [`fetch_citations`](Self::fetch_citations) doubles the request count
965    /// (and the quota cost) for no benefit. Use this whenever you need
966    /// both. Same retry/backoff schedule as [`fetch_biblio`](Self::fetch_biblio).
967    #[instrument(skip(self), fields(endpoint = "biblio+citations", attempts = field::Empty))]
968    pub async fn fetch_biblio_with_citations(
969        &self,
970        patent_id: &str,
971    ) -> Result<(PatentBiblio, Citations), EpoError> {
972        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
973        let json = retry_with_pacer(
974            &self.pacer_inpadoc,
975            &self.config.retry_backoff,
976            patent_id,
977            "biblio+citations",
978            || self.fetch_biblio_json_once(patent_id),
979        )
980        .await?;
981        let biblio = parse_biblio(&json);
982        let citations = parse_citations(&json);
983        debug!(
984            patent_id,
985            cited = citations.cited.len(),
986            "EPO biblio+citations fetch succeeded"
987        );
988        Ok((biblio, citations))
989    }
990
991    #[instrument(skip(self), fields(endpoint = "citations", attempts = field::Empty))]
992    pub async fn fetch_citations(&self, patent_id: &str) -> Result<Citations, EpoError> {
993        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
994        let json = retry_with_pacer(
995            &self.pacer_inpadoc,
996            &self.config.retry_backoff,
997            patent_id,
998            "citations",
999            || self.fetch_biblio_json_once(patent_id),
1000        )
1001        .await?;
1002        Ok(parse_citations(&json))
1003    }
1004
1005    #[instrument(skip(self, cql), fields(
1006        endpoint = "search",
1007        query = %cql,
1008        range_begin,
1009        range_end,
1010        http_status = field::Empty,
1011        total_count = field::Empty,
1012        returned = field::Empty,
1013        attempts = field::Empty,
1014    ))]
1015    pub async fn fetch_search(
1016        &self,
1017        cql: &str,
1018        range_begin: u32,
1019        range_end: u32,
1020    ) -> Result<SearchResults, EpoError> {
1021        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1022        let val = retry_with_pacer(
1023            &self.pacer_search,
1024            &self.config.retry_backoff,
1025            cql,
1026            "search",
1027            || self.fetch_search_once(cql, range_begin, range_end),
1028        )
1029        .await?;
1030        let span = Span::current();
1031        span.record("total_count", val.total_count);
1032        span.record("returned", val.patents.len());
1033        info!("EPO search fetch succeeded");
1034        Ok(val)
1035    }
1036
1037    async fn fetch_search_once(
1038        &self,
1039        cql: &str,
1040        range_begin: u32,
1041        range_end: u32,
1042    ) -> Result<SearchResults, EpoError> {
1043        let token = self.get_token().await?;
1044
1045        let url = format!(
1046            "{}/rest-services/published-data/search/biblio",
1047            self.config.base_url
1048        );
1049
1050        let resp = self
1051            .http
1052            .get(&url)
1053            .header("Accept", "application/json")
1054            .bearer_auth(&token)
1055            .query(&[("q", cql), ("Range", &format!("{range_begin}-{range_end}"))])
1056            .send()
1057            .await
1058            .map_err(|e| EpoError::Network(e.to_string()))?;
1059
1060        let status = resp.status().as_u16();
1061        Span::current().record("http_status", status);
1062        self.update_throttling(resp.headers());
1063
1064        match status {
1065            200 => {}
1066            404 => {
1067                return Ok(SearchResults {
1068                    total_count: 0,
1069                    range: (0, 0),
1070                    patents: Vec::new(),
1071                });
1072            }
1073            403 | 429 => return Err(EpoError::RateLimit),
1074            other => {
1075                let body = resp.text().await.unwrap_or_default();
1076                return Err(EpoError::Api(format!("{other}: {body}")));
1077            }
1078        }
1079
1080        let json: serde_json::Value = resp
1081            .json()
1082            .await
1083            .map_err(|e| EpoError::Api(e.to_string()))?;
1084
1085        Ok(parse_search_results(&json))
1086    }
1087
1088    #[instrument(skip(self), fields(endpoint = "family", attempts = field::Empty))]
1089    pub async fn fetch_family(&self, patent_id: &str) -> Result<Vec<FamilyMember>, EpoError> {
1090        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1091        retry_with_pacer(
1092            &self.pacer_inpadoc,
1093            &self.config.retry_backoff,
1094            patent_id,
1095            "family",
1096            || self.fetch_family_once(patent_id),
1097        )
1098        .await
1099    }
1100
1101    async fn fetch_family_once(&self, patent_id: &str) -> Result<Vec<FamilyMember>, EpoError> {
1102        let token = self.get_token().await?;
1103
1104        let url = format!(
1105            "{}/rest-services/family/publication/epodoc/{patent_id}",
1106            self.config.base_url
1107        );
1108
1109        let resp = self
1110            .http
1111            .get(&url)
1112            .header("Accept", "application/json")
1113            .bearer_auth(&token)
1114            .send()
1115            .await
1116            .map_err(|e| EpoError::Network(e.to_string()))?;
1117
1118        self.update_throttling(resp.headers());
1119
1120        match resp.status().as_u16() {
1121            200 => {}
1122            404 => return Err(EpoError::NotFound),
1123            403 | 429 => return Err(EpoError::RateLimit),
1124            status => {
1125                let body = resp.text().await.unwrap_or_default();
1126                return Err(EpoError::Api(format!("{status}: {body}")));
1127            }
1128        }
1129
1130        let json: serde_json::Value = resp
1131            .json()
1132            .await
1133            .map_err(|e| EpoError::Api(e.to_string()))?;
1134
1135        let family = parse_family(&json);
1136        debug!(
1137            patent_id,
1138            members = family.len(),
1139            "EPO family fetch succeeded"
1140        );
1141        Ok(family)
1142    }
1143
1144    /// Fetch the full-text description body for a published patent.
1145    ///
1146    /// Description availability depends on jurisdiction and language —
1147    /// many older publications are biblio-only in EPO's index. A patent
1148    /// that succeeds on [`fetch_biblio`](Self::fetch_biblio) may still
1149    /// return [`EpoError::NotFound`] here.
1150    ///
1151    /// Counts against EPO's "retrieval" quota, which is separate from
1152    /// biblio. Honours [`ClientConfig::retrieval_interval`] pacing and
1153    /// retry-on-403/429 schedule.
1154    #[instrument(skip(self), fields(endpoint = "description", attempts = field::Empty))]
1155    pub async fn fetch_description(&self, patent_id: &str) -> Result<PatentDescription, EpoError> {
1156        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1157        retry_with_pacer(
1158            &self.pacer_retrieval,
1159            &self.config.retry_backoff,
1160            patent_id,
1161            "description",
1162            || self.fetch_description_once(patent_id),
1163        )
1164        .await
1165    }
1166
1167    async fn fetch_description_once(&self, patent_id: &str) -> Result<PatentDescription, EpoError> {
1168        let token = self.get_token().await?;
1169        let url = format!(
1170            "{}/rest-services/published-data/publication/epodoc/{patent_id}/description",
1171            self.config.base_url
1172        );
1173        let resp = self
1174            .http
1175            .get(&url)
1176            .header("Accept", "application/json")
1177            .bearer_auth(&token)
1178            .send()
1179            .await
1180            .map_err(|e| EpoError::Network(e.to_string()))?;
1181
1182        self.update_throttling(resp.headers());
1183
1184        match resp.status().as_u16() {
1185            200 => {}
1186            404 => return Err(EpoError::NotFound),
1187            403 | 429 => return Err(EpoError::RateLimit),
1188            status => {
1189                let body = resp.text().await.unwrap_or_default();
1190                return Err(EpoError::Api(format!("{status}: {body}")));
1191            }
1192        }
1193
1194        let json: serde_json::Value = resp
1195            .json()
1196            .await
1197            .map_err(|e| EpoError::Api(e.to_string()))?;
1198        Ok(parse_description(&json, patent_id))
1199    }
1200
1201    /// Fetch the claim set for a published patent.
1202    ///
1203    /// Counts against EPO's "retrieval" quota (separate from biblio).
1204    /// Claim availability mirrors descriptions — major-jurisdiction recent
1205    /// publications usually have them; older or smaller-jurisdiction
1206    /// filings may 404 here even when [`fetch_biblio`](Self::fetch_biblio)
1207    /// works.
1208    ///
1209    /// EPO sometimes wraps inline formatting (italics, subscripts, math
1210    /// formulas) inside claim text. The default parse flattens them; if
1211    /// you need to preserve formatting, use [`parse_claims`] on the raw
1212    /// JSON yourself.
1213    #[instrument(skip(self), fields(endpoint = "claims", attempts = field::Empty))]
1214    pub async fn fetch_claims(&self, patent_id: &str) -> Result<PatentClaims, EpoError> {
1215        let _permit = self.sem.acquire().await.expect("EPO semaphore poisoned");
1216        retry_with_pacer(
1217            &self.pacer_retrieval,
1218            &self.config.retry_backoff,
1219            patent_id,
1220            "claims",
1221            || self.fetch_claims_once(patent_id),
1222        )
1223        .await
1224    }
1225
1226    async fn fetch_claims_once(&self, patent_id: &str) -> Result<PatentClaims, EpoError> {
1227        let token = self.get_token().await?;
1228        let url = format!(
1229            "{}/rest-services/published-data/publication/epodoc/{patent_id}/claims",
1230            self.config.base_url
1231        );
1232        let resp = self
1233            .http
1234            .get(&url)
1235            .header("Accept", "application/json")
1236            .bearer_auth(&token)
1237            .send()
1238            .await
1239            .map_err(|e| EpoError::Network(e.to_string()))?;
1240
1241        self.update_throttling(resp.headers());
1242
1243        match resp.status().as_u16() {
1244            200 => {}
1245            404 => return Err(EpoError::NotFound),
1246            403 | 429 => return Err(EpoError::RateLimit),
1247            status => {
1248                let body = resp.text().await.unwrap_or_default();
1249                return Err(EpoError::Api(format!("{status}: {body}")));
1250            }
1251        }
1252
1253        let json: serde_json::Value = resp
1254            .json()
1255            .await
1256            .map_err(|e| EpoError::Api(e.to_string()))?;
1257        Ok(parse_claims(&json, patent_id))
1258    }
1259
1260    /// Fetch forward citations: the patents that cite `patent_id`.
1261    ///
1262    /// EPO's biblio endpoint exposes only backward citations (the references
1263    /// the applicant or examiner submitted). Forward citations require a
1264    /// separate search. This helper runs a CQL `ct=<patent_id>` against the
1265    /// search endpoint and maps each hit to a [`Citation`] with `phase` set
1266    /// to `"citing"` so callers can distinguish them from backward refs.
1267    ///
1268    /// `max` caps the number of returned citations (EPO's range is 1-based,
1269    /// `max` becomes the upper bound). Realistic ceiling is 100.
1270    #[instrument(skip(self), fields(endpoint = "citing"))]
1271    pub async fn fetch_citing(&self, patent_id: &str, max: u32) -> Result<Vec<Citation>, EpoError> {
1272        let cql = format!("ct={patent_id}");
1273        let results = self.fetch_search(&cql, 1, max.max(1)).await?;
1274        Ok(results
1275            .patents
1276            .into_iter()
1277            .map(|p| Citation {
1278                patent_id: p.patent_id,
1279                phase: "citing".to_string(),
1280                category: None,
1281                cited_by: None,
1282                date: p.publication_date,
1283                name: p.assignee,
1284            })
1285            .collect())
1286    }
1287
1288    /// Fan out per-member [`fetch_biblio`](Self::fetch_biblio) calls to fill
1289    /// in the `title` field that EPO's family endpoint omits.
1290    ///
1291    /// Costs N additional biblio requests for N family members, paced by
1292    /// the client's [`ClientConfig::inpadoc_interval`]. For a family of 6,
1293    /// expect ~8 s under default config (1.334 s × 6 paced calls).
1294    ///
1295    /// Members with a non-empty title are skipped. On per-member fetch
1296    /// errors the title stays empty (the function logs at debug level and
1297    /// continues). Returns the number of titles successfully filled.
1298    pub async fn enrich_family_titles(&self, family: &mut [FamilyMember]) -> usize {
1299        let mut filled = 0usize;
1300        for member in family.iter_mut() {
1301            if !member.title.is_empty() {
1302                continue;
1303            }
1304            match self.fetch_biblio(&member.patent_id).await {
1305                Ok(b) if !b.title.is_empty() => {
1306                    member.title = b.title;
1307                    filled += 1;
1308                }
1309                Ok(_) => {}
1310                Err(e) => {
1311                    debug!(
1312                        patent_id = %member.patent_id,
1313                        error = %e,
1314                        "enrich_family_titles: per-member biblio fetch failed"
1315                    );
1316                }
1317            }
1318        }
1319        filled
1320    }
1321}
1322
1323/// Extract bibliographic data from EPO JSON response.
1324///
1325/// EPO returns multiple `exchange-document` entries per publication — one per
1326/// kind code (A1 = application, B1 = granted, …). We prefer B-kind (granted is
1327/// canonical) over A-kind, then any remaining doc. Fields like the abstract
1328/// fall back across documents when the chosen one is missing them, since EPO
1329/// sometimes omits a field on B1 that A1 had.
1330pub fn parse_biblio(json: &serde_json::Value) -> PatentBiblio {
1331    let docs = locate_exchange_docs(json);
1332    let chosen = match pick_preferred_doc(&docs) {
1333        Some(d) => d,
1334        None => return PatentBiblio::default(),
1335    };
1336
1337    let biblio = &chosen["bibliographic-data"];
1338    let applicants = extract_applicants_all(biblio);
1339    let assignee = applicants.first().cloned();
1340
1341    PatentBiblio {
1342        title: extract_text_by_lang(&biblio["invention-title"]),
1343        abstract_text: extract_abstract_with_fallback(chosen, &docs),
1344        assignee,
1345        applicants,
1346        inventors: extract_inventors(biblio),
1347        filing_date: extract_date(&biblio["application-reference"]["document-id"]),
1348        publication_date: extract_date(&biblio["publication-reference"]["document-id"]),
1349        priority_date: extract_priority_date(biblio),
1350        kind_code: chosen["@kind"].as_str().map(str::to_string),
1351        family_id: chosen["@family-id"].as_str().map(str::to_string),
1352        classification: extract_classifications(biblio),
1353        cpc_classifications: extract_cpc_classifications(biblio),
1354    }
1355}
1356
1357/// Walk both the biblio (`ops:world-patent-data.exchange-documents.exchange-document`)
1358/// and search (`ops:world-patent-data.ops:biblio-search.ops:search-result.exchange-documents.exchange-document`)
1359/// shapes; the field can be a single object or array.
1360fn locate_exchange_docs(json: &serde_json::Value) -> Vec<&serde_json::Value> {
1361    let mut candidate = &json["ops:world-patent-data"]["exchange-documents"]["exchange-document"];
1362    if candidate.is_null() {
1363        candidate = &json["ops:world-patent-data"]["ops:biblio-search"]["ops:search-result"]["exchange-documents"]
1364            ["exchange-document"];
1365    }
1366    if let Some(arr) = candidate.as_array() {
1367        arr.iter().collect()
1368    } else if candidate.is_null() {
1369        Vec::new()
1370    } else {
1371        vec![candidate]
1372    }
1373}
1374
1375/// Prefer B-kind (granted) over A-kind (application), else first available.
1376fn pick_preferred_doc<'a>(docs: &[&'a serde_json::Value]) -> Option<&'a serde_json::Value> {
1377    let by_b = docs
1378        .iter()
1379        .find(|d| d["@kind"].as_str().is_some_and(|k| k.starts_with('B')));
1380    if let Some(d) = by_b {
1381        return Some(*d);
1382    }
1383    let by_a = docs
1384        .iter()
1385        .find(|d| d["@kind"].as_str().is_some_and(|k| k.starts_with('A')));
1386    if let Some(d) = by_a {
1387        return Some(*d);
1388    }
1389    docs.first().copied()
1390}
1391
1392fn extract_abstract_with_fallback(
1393    chosen: &serde_json::Value,
1394    all: &[&serde_json::Value],
1395) -> String {
1396    let primary = extract_text_by_lang(&chosen["abstract"]);
1397    if !primary.is_empty() {
1398        return primary;
1399    }
1400    for d in all {
1401        let txt = extract_text_by_lang(&d["abstract"]);
1402        if !txt.is_empty() {
1403            return txt;
1404        }
1405    }
1406    String::new()
1407}
1408
1409/// Parse search results from EPO OPS biblio-search response.
1410///
1411/// EPO returns two distinct shapes here:
1412///   * `ops:range` lives on `ops:biblio-search` (NOT on `ops:search-result`),
1413///     and `@total-result-count` lives there too.
1414///   * `exchange-documents` in a search response is an **array of
1415///     `{ "exchange-document": {…} }` wrappers**, one per result — different
1416///     from the biblio response, where `exchange-documents` is a single
1417///     object containing `exchange-document: [array]`. We accept both
1418///     shapes so the same parser works for biblio-shaped fixtures and
1419///     real search responses.
1420#[instrument(skip(json), fields(
1421    doc_count = field::Empty,
1422    parsed = field::Empty,
1423    duplicates = field::Empty,
1424    malformed = field::Empty,
1425))]
1426pub fn parse_search_results(json: &serde_json::Value) -> SearchResults {
1427    let biblio_search = &json["ops:world-patent-data"]["ops:biblio-search"];
1428    let search_result = &biblio_search["ops:search-result"];
1429
1430    if search_result.is_null() {
1431        return SearchResults {
1432            total_count: 0,
1433            range: (0, 0),
1434            patents: Vec::new(),
1435        };
1436    }
1437
1438    let total_count = biblio_search["@total-result-count"]
1439        .as_str()
1440        .and_then(|s| s.parse().ok())
1441        .or_else(|| biblio_search["@total-result-count"].as_u64())
1442        .unwrap_or(0) as u32;
1443
1444    // `ops:range` is on biblio-search, not search-result. Tolerate the
1445    // alternative location so older fixtures still parse.
1446    let range_obj = if biblio_search["ops:range"].is_object() {
1447        &biblio_search["ops:range"]
1448    } else {
1449        &search_result["ops:range"]
1450    };
1451    let range_begin = range_obj["@begin"]
1452        .as_str()
1453        .and_then(|s| s.parse().ok())
1454        .unwrap_or(0);
1455    let range_end = range_obj["@end"]
1456        .as_str()
1457        .and_then(|s| s.parse().ok())
1458        .unwrap_or(0);
1459
1460    let doc_refs = collect_search_docs(search_result);
1461
1462    let mut patents = Vec::new();
1463    let mut seen = HashSet::new();
1464    let mut duplicates: u64 = 0;
1465    let mut malformed: u64 = 0;
1466    let doc_count = doc_refs.len() as u64;
1467
1468    for doc in &doc_refs {
1469        let biblio = &doc["bibliographic-data"];
1470
1471        let pub_ref = &biblio["publication-reference"]["document-id"];
1472        let patent_id = extract_patent_id_from_doc_ids(pub_ref);
1473
1474        if patent_id.is_empty() {
1475            malformed += 1;
1476            debug!(
1477                pub_ref = %pub_ref,
1478                "EPO search doc missing patent_id"
1479            );
1480            continue;
1481        }
1482        if seen.contains(&patent_id) {
1483            duplicates += 1;
1484            continue;
1485        }
1486        seen.insert(patent_id.clone());
1487
1488        let applicants = extract_applicants_all(biblio);
1489        let assignee = applicants.first().cloned();
1490        patents.push(SearchResultPatent {
1491            patent_id,
1492            title: extract_text_by_lang(&biblio["invention-title"]),
1493            abstract_text: extract_text_by_lang(&doc["abstract"]),
1494            assignee,
1495            applicants,
1496            inventors: extract_inventors(biblio),
1497            filing_date: extract_date(&biblio["application-reference"]["document-id"]),
1498            publication_date: extract_date(&biblio["publication-reference"]["document-id"]),
1499            priority_date: extract_priority_date(biblio),
1500            kind_code: doc["@kind"].as_str().map(str::to_string),
1501            family_id: doc["@family-id"].as_str().map(str::to_string),
1502            classification: extract_classifications(biblio),
1503            cpc_classifications: extract_cpc_classifications(biblio),
1504        });
1505    }
1506
1507    let span = Span::current();
1508    span.record("doc_count", doc_count);
1509    span.record("parsed", patents.len() as u64);
1510    span.record("duplicates", duplicates);
1511    span.record("malformed", malformed);
1512    if duplicates > 0 || malformed > 0 {
1513        debug!(
1514            parsed = patents.len(),
1515            duplicates, malformed, "EPO search parse dropped entries"
1516        );
1517    }
1518
1519    SearchResults {
1520        total_count,
1521        range: (range_begin, range_end),
1522        patents,
1523    }
1524}
1525
1526/// Walk the `exchange-documents` field of a search response.
1527///
1528/// Real EPO search responses use `exchange-documents: [{exchange-document: {…}}, …]`
1529/// — an array where each element wraps a single doc. Some test fixtures and
1530/// older EPO responses use the biblio-style `exchange-documents: {exchange-document: […]}`.
1531/// This helper handles both shapes (and the single-doc-as-object variant)
1532/// so the parser doesn't silently drop results.
1533fn collect_search_docs(search_result: &serde_json::Value) -> Vec<&serde_json::Value> {
1534    let docs_field = &search_result["exchange-documents"];
1535
1536    if let Some(arr) = docs_field.as_array() {
1537        // Real search shape: array of `{exchange-document: {…}}` wrappers.
1538        // Tolerate elements that are docs directly (no wrapper) for fixture flex.
1539        return arr
1540            .iter()
1541            .map(|item| {
1542                let nested = &item["exchange-document"];
1543                if nested.is_null() { item } else { nested }
1544            })
1545            .filter(|d| !d.is_null() && d.is_object())
1546            .collect();
1547    }
1548
1549    // Biblio-style shape: `exchange-documents: {exchange-document: [array | object]}`.
1550    let nested = &docs_field["exchange-document"];
1551    if let Some(arr) = nested.as_array() {
1552        arr.iter().collect()
1553    } else if !nested.is_null() {
1554        vec![nested]
1555    } else {
1556        Vec::new()
1557    }
1558}
1559
1560/// Extract patent ID from document-id array, preferring epodoc format.
1561fn extract_patent_id_from_doc_ids(doc_ids: &serde_json::Value) -> String {
1562    if doc_ids.is_null() {
1563        return String::new();
1564    }
1565
1566    let items = if doc_ids.is_array() {
1567        doc_ids.as_array().unwrap().as_slice()
1568    } else {
1569        std::slice::from_ref(doc_ids)
1570    };
1571
1572    // Prefer epodoc
1573    for item in items {
1574        let doc_type = item["@document-id-type"].as_str().unwrap_or("");
1575        if doc_type == "epodoc"
1576            && let Some(num) = item["doc-number"]["$"].as_str()
1577            && !num.is_empty()
1578        {
1579            return num.to_string();
1580        }
1581    }
1582
1583    // Fall back to docdb (country + doc-number)
1584    for item in items {
1585        let doc_type = item["@document-id-type"].as_str().unwrap_or("");
1586        if doc_type == "docdb" {
1587            let country = item["country"]["$"].as_str().unwrap_or("");
1588            let num = item["doc-number"]["$"].as_str().unwrap_or("");
1589            if !num.is_empty() {
1590                return format!("{country}{num}");
1591            }
1592        }
1593    }
1594
1595    String::new()
1596}
1597
1598/// Extract text preferring English from EPO's lang-tagged objects.
1599/// Can be a single object with `$` and `@lang`, or an array of them.
1600/// Falls through `.p.$` for abstract-shaped payloads.
1601pub(crate) fn extract_text_by_lang(val: &serde_json::Value) -> String {
1602    if val.is_null() {
1603        return String::new();
1604    }
1605
1606    let items = if val.is_array() {
1607        val.as_array().unwrap().as_slice()
1608    } else {
1609        std::slice::from_ref(val)
1610    };
1611
1612    for item in items {
1613        if item["@lang"].as_str() == Some("en")
1614            && let Some(text) = item_text(item)
1615        {
1616            return text.to_string();
1617        }
1618    }
1619
1620    for item in items {
1621        if let Some(text) = item_text(item) {
1622            return text.to_string();
1623        }
1624    }
1625
1626    val.as_str().unwrap_or("").to_string()
1627}
1628
1629/// EPO wraps long-form text either directly in `$` (titles) or nested under
1630/// `p.$` (abstracts). Try direct first, then the paragraph wrapper.
1631fn item_text(item: &serde_json::Value) -> Option<&str> {
1632    item["$"].as_str().or_else(|| item["p"]["$"].as_str())
1633}
1634
1635/// Extract date from document-id array (look for epodoc format).
1636pub(crate) fn extract_date(doc_ids: &serde_json::Value) -> Option<String> {
1637    if doc_ids.is_null() {
1638        return None;
1639    }
1640
1641    let items = if doc_ids.is_array() {
1642        doc_ids.as_array().unwrap().as_slice()
1643    } else {
1644        std::slice::from_ref(doc_ids)
1645    };
1646
1647    for item in items {
1648        if let Some(date) = item["date"]["$"].as_str() {
1649            // EPO dates are YYYYMMDD, convert to YYYY-MM-DD
1650            if date.len() == 8 {
1651                return Some(format!("{}-{}-{}", &date[..4], &date[4..6], &date[6..8]));
1652            }
1653            return Some(date.to_string());
1654        }
1655    }
1656
1657    None
1658}
1659
1660/// Extract full IPC classification codes (e.g. `B28B1/29`).
1661///
1662/// Reads two EPO blocks:
1663///   * `classification-ipc.text[].$` — already in compact form like `B28B1/29`.
1664///   * `classifications-ipcr.classification-ipcr[].text.$` — padded form like
1665///     `"B28B   1/    29            A I"`, normalised to `B28B1/29`.
1666///
1667/// De-duped, no truncation.
1668pub(crate) fn extract_classifications(biblio: &serde_json::Value) -> Vec<String> {
1669    let mut result = Vec::new();
1670    let mut seen = HashSet::new();
1671
1672    let ipc = &biblio["classification-ipc"]["text"];
1673    for item in iter_array_or_one(ipc) {
1674        if let Some(code) = item["$"].as_str() {
1675            let code = code.trim().to_string();
1676            if !code.is_empty() && seen.insert(code.clone()) {
1677                result.push(code);
1678            }
1679        }
1680    }
1681
1682    let ipcr = &biblio["classifications-ipcr"]["classification-ipcr"];
1683    for item in iter_array_or_one(ipcr) {
1684        if let Some(text) = item["text"]["$"].as_str() {
1685            let code = normalize_ipcr_text(text);
1686            if !code.is_empty() && seen.insert(code.clone()) {
1687                result.push(code);
1688            }
1689        }
1690    }
1691
1692    result
1693}
1694
1695/// Extract CPC codes from the structured `patent-classifications` block.
1696///
1697/// Each entry has `section`/`class`/`subclass`/`main-group`/`subgroup`. We
1698/// concatenate to compact form (`B28B1/29`) so callers can compare IPC and CPC
1699/// using the same string shape.
1700pub(crate) fn extract_cpc_classifications(biblio: &serde_json::Value) -> Vec<String> {
1701    let mut result = Vec::new();
1702    let mut seen = HashSet::new();
1703
1704    let items = &biblio["patent-classifications"]["patent-classification"];
1705    for item in iter_array_or_one(items) {
1706        let scheme = item["classification-scheme"]["@scheme"]
1707            .as_str()
1708            .unwrap_or("");
1709        // CPCI = Inventive, CPCA = Additional. Skip non-CPC schemes (e.g. NEW, ECLA).
1710        if !matches!(scheme, "CPCI" | "CPCA" | "CPC") {
1711            continue;
1712        }
1713        let section = item["section"]["$"].as_str().unwrap_or("");
1714        let class = item["class"]["$"].as_str().unwrap_or("");
1715        let subclass = item["subclass"]["$"].as_str().unwrap_or("");
1716        let main = item["main-group"]["$"].as_str().unwrap_or("");
1717        let sub = item["subgroup"]["$"].as_str().unwrap_or("");
1718        if section.is_empty() || class.is_empty() || subclass.is_empty() {
1719            continue;
1720        }
1721        let code = format!("{section}{class}{subclass}{main}/{sub}");
1722        if seen.insert(code.clone()) {
1723            result.push(code);
1724        }
1725    }
1726
1727    result
1728}
1729
1730/// Extract inventors, preferring epodoc-format entries (which carry the
1731/// canonical name + country code suffix). Falls back to original-format if
1732/// no epodoc inventor is present.
1733pub(crate) fn extract_inventors(biblio: &serde_json::Value) -> Vec<String> {
1734    let items = &biblio["parties"]["inventors"]["inventor"];
1735    extract_party_names(items)
1736}
1737
1738/// Extract all distinct applicants (assignees), preferring epodoc-format
1739/// entries; falls back to original-format when no epodoc entry exists.
1740pub(crate) fn extract_applicants_all(biblio: &serde_json::Value) -> Vec<String> {
1741    let items = &biblio["parties"]["applicants"]["applicant"];
1742    extract_party_names(items)
1743}
1744
1745fn extract_party_names(items: &serde_json::Value) -> Vec<String> {
1746    let mut result = Vec::new();
1747    let mut seen = HashSet::new();
1748
1749    let entries: Vec<&serde_json::Value> = iter_array_or_one(items);
1750    let has_epodoc = entries
1751        .iter()
1752        .any(|e| e["@data-format"].as_str() == Some("epodoc"));
1753
1754    for entry in &entries {
1755        let format = entry["@data-format"].as_str().unwrap_or("");
1756        // If any epodoc entries exist, take only those — original duplicates them
1757        // with formatting differences (mixed case, punctuation).
1758        if has_epodoc && format != "epodoc" {
1759            continue;
1760        }
1761        let name = entry["applicant-name"]["name"]["$"]
1762            .as_str()
1763            .or_else(|| entry["inventor-name"]["name"]["$"].as_str())
1764            .or_else(|| entry["applicant-name"]["$"].as_str())
1765            .or_else(|| entry["inventor-name"]["$"].as_str())
1766            .unwrap_or("")
1767            .trim();
1768        if !name.is_empty() && seen.insert(name.to_string()) {
1769            result.push(name.to_string());
1770        }
1771    }
1772
1773    result
1774}
1775
1776/// Earliest priority claim date (YYYY-MM-DD), if any. EPO returns priority
1777/// claims with epodoc `date.$` in YYYYMMDD form.
1778pub(crate) fn extract_priority_date(biblio: &serde_json::Value) -> Option<String> {
1779    let claims = &biblio["priority-claims"]["priority-claim"];
1780    let mut best: Option<String> = None;
1781
1782    for claim in iter_array_or_one(claims) {
1783        let doc_ids = &claim["document-id"];
1784        for did in iter_array_or_one(doc_ids) {
1785            if did["@document-id-type"].as_str() != Some("epodoc") {
1786                continue;
1787            }
1788            let Some(date) = did["date"]["$"].as_str() else {
1789                continue;
1790            };
1791            if date.len() != 8 {
1792                continue;
1793            }
1794            let formatted = format!("{}-{}-{}", &date[..4], &date[4..6], &date[6..8]);
1795            best = Some(match best {
1796                Some(b) if b <= formatted => b,
1797                _ => formatted,
1798            });
1799        }
1800    }
1801
1802    best
1803}
1804
/// Compact a space-padded IPCR string such as
/// `"B28B   1/    29            A I"` into `B28B1/29`.
///
/// Whitespace-separated tokens are concatenated until the first token that
/// is a single ASCII letter; that token and everything after it (the
/// trailing flags, e.g. A = advanced, I = inventive) are discarded.
fn normalize_ipcr_text(text: &str) -> String {
    text.split_whitespace()
        .take_while(|token| {
            let is_flag = token.len() == 1 && token.chars().all(|c| c.is_ascii_alphabetic());
            !is_flag
        })
        .collect()
}
1819
1820fn iter_array_or_one(val: &serde_json::Value) -> Vec<&serde_json::Value> {
1821    if let Some(arr) = val.as_array() {
1822        arr.iter().collect()
1823    } else if val.is_null() {
1824        Vec::new()
1825    } else {
1826        vec![val]
1827    }
1828}
1829
1830/// Parse citations from EPO biblio response.
1831///
1832/// EPO's biblio endpoint can return multiple `exchange-document` entries (A1,
1833/// B1, …); citations sometimes live only on the application doc (A1) while
1834/// the granted doc (B1) carries no `references-cited` block. We walk every
1835/// document and merge with dedup so the caller doesn't depend on which kind
1836/// the chosen biblio happens to be.
1837///
1838/// Forward citations (who cites this patent) are not available from biblio —
1839/// that needs a separate search call. `citing` stays empty.
1840pub fn parse_citations(json: &serde_json::Value) -> Citations {
1841    let mut cited = Vec::new();
1842    let mut seen = HashSet::new();
1843
1844    for doc in locate_exchange_docs(json) {
1845        let refs = &doc["bibliographic-data"]["references-cited"]["citation"];
1846        for cit in iter_array_or_one(refs) {
1847            let patcit = &cit["patcit"];
1848            if patcit.is_null() {
1849                continue;
1850            }
1851
1852            let phase = cit["@cited-phase"].as_str().unwrap_or("").to_string();
1853            let category = cit["category"]["$"]
1854                .as_str()
1855                .map(str::to_string)
1856                .filter(|s| !s.is_empty());
1857            let cited_by = cit["@cited-by"]
1858                .as_str()
1859                .map(str::to_string)
1860                .filter(|s| !s.is_empty());
1861
1862            let doc_ids = iter_array_or_one(&patcit["document-id"]);
1863            let date = extract_date(&patcit["document-id"]);
1864            let name = doc_ids
1865                .iter()
1866                .find_map(|d| d["name"]["$"].as_str())
1867                .map(str::to_string)
1868                .filter(|s| !s.is_empty());
1869
1870            for did in &doc_ids {
1871                let doc_type = did["@document-id-type"].as_str().unwrap_or("");
1872                if doc_type != "epodoc" && doc_type != "docdb" {
1873                    continue;
1874                }
1875                let country = did["country"]["$"].as_str().unwrap_or("");
1876                let doc_number = did["doc-number"]["$"].as_str().unwrap_or("");
1877                if doc_number.is_empty() {
1878                    continue;
1879                }
1880                let patent_id = if doc_type == "epodoc" {
1881                    doc_number.to_string()
1882                } else {
1883                    format!("{country}{doc_number}")
1884                };
1885                if !seen.insert(patent_id.clone()) {
1886                    continue;
1887                }
1888                cited.push(Citation {
1889                    patent_id,
1890                    phase: phase.clone(),
1891                    category: category.clone(),
1892                    cited_by: cited_by.clone(),
1893                    date: date.clone(),
1894                    name: name.clone(),
1895                });
1896                break;
1897            }
1898        }
1899    }
1900
1901    Citations {
1902        cited,
1903        citing: Vec::new(),
1904    }
1905}
1906
1907/// Parse a full-text description response. EPO's shape:
1908///
1909/// ```text
1910/// ops:world-patent-data.ftxt:fulltext-documents.ftxt:fulltext-document.description
1911///   .@lang             // language tag
1912///   .p                 // single object or array of paragraph objects
1913///     .@num            // "0001", "0002", … (optional)
1914///     .$               // paragraph text
1915/// ```
1916///
1917/// Empty paragraphs are skipped. The convenience `plain_text` field joins
1918/// all paragraph bodies with `"\n\n"` for embedding / LLM contexts.
1919pub fn parse_description(json: &serde_json::Value, patent_id: &str) -> PatentDescription {
1920    let desc = &json["ops:world-patent-data"]["ftxt:fulltext-documents"]["ftxt:fulltext-document"]
1921        ["description"];
1922
1923    if desc.is_null() {
1924        return PatentDescription {
1925            patent_id: patent_id.to_string(),
1926            ..Default::default()
1927        };
1928    }
1929
1930    let language = desc["@lang"].as_str().map(str::to_string);
1931
1932    let mut paragraphs = Vec::new();
1933    for item in iter_array_or_one(&desc["p"]) {
1934        let num = item["@num"].as_str().map(str::to_string);
1935        let text = item_text(item).unwrap_or("").trim().to_string();
1936        if !text.is_empty() {
1937            paragraphs.push(DescriptionParagraph { num, text });
1938        }
1939    }
1940
1941    let plain_text = paragraphs
1942        .iter()
1943        .map(|p| p.text.as_str())
1944        .collect::<Vec<_>>()
1945        .join("\n\n");
1946
1947    PatentDescription {
1948        patent_id: patent_id.to_string(),
1949        language,
1950        paragraphs,
1951        plain_text,
1952    }
1953}
1954
1955/// Parse a claims response. EPO's shape:
1956///
1957/// ```text
1958/// ops:world-patent-data.ftxt:fulltext-documents.ftxt:fulltext-document.claims
1959///   .@lang             // language tag
1960///   .claim             // single object or array
1961///     .@id             // "claim001" (optional)
1962///     .@num            // "0001" (optional)
1963///     .claim-text      // string, object with `$`, or array
1964/// ```
1965///
1966/// `claim-text` can carry inline formatting (italics, subscripts, math
1967/// formulas) as nested elements; this parser flattens them to plain text.
1968/// If formatting matters for your downstream, walk the raw JSON yourself.
1969pub fn parse_claims(json: &serde_json::Value, patent_id: &str) -> PatentClaims {
1970    let claims_field = &json["ops:world-patent-data"]["ftxt:fulltext-documents"]["ftxt:fulltext-document"]
1971        ["claims"];
1972
1973    if claims_field.is_null() {
1974        return PatentClaims {
1975            patent_id: patent_id.to_string(),
1976            ..Default::default()
1977        };
1978    }
1979
1980    let language = claims_field["@lang"].as_str().map(str::to_string);
1981
1982    let mut claims = Vec::new();
1983    for item in iter_array_or_one(&claims_field["claim"]) {
1984        let num = item["@num"].as_str().map(str::to_string);
1985        let id = item["@id"].as_str().map(str::to_string);
1986        let claim_text = &item["claim-text"];
1987
1988        // EPO sometimes packs multiple numbered claims into a single
1989        // <claim> wrapper as an array of self-contained claim-text leaves
1990        // — distinct from the inline-formatting shape where the array
1991        // mixes strings with nested <sub>/<i>/<sup> tag objects. The
1992        // wrapper carrying its own @num/@id signals a single logical
1993        // claim with formatting; absence of both, plus a leaf-only array,
1994        // signals a packed claim set that must be split.
1995        if num.is_none()
1996            && id.is_none()
1997            && let Some(arr) = claim_text.as_array()
1998            && arr.len() > 1
1999            && arr.iter().all(is_leaf_claim_text)
2000        {
2001            for elem in arr {
2002                let text = flatten_claim_text(elem);
2003                if !text.is_empty() {
2004                    claims.push(Claim {
2005                        num: None,
2006                        id: None,
2007                        text,
2008                    });
2009                }
2010            }
2011        } else {
2012            let text = flatten_claim_text(claim_text);
2013            if !text.is_empty() {
2014                claims.push(Claim { num, id, text });
2015            }
2016        }
2017    }
2018
2019    let plain_text = claims
2020        .iter()
2021        .map(|c| c.text.as_str())
2022        .collect::<Vec<_>>()
2023        .join("\n\n");
2024
2025    PatentClaims {
2026        patent_id: patent_id.to_string(),
2027        language,
2028        claims,
2029        plain_text,
2030    }
2031}
2032
2033/// Walk an EPO `claim-text` value and concatenate every text fragment in
2034/// document order, flattening nested inline formatting (`<i>`, `<sub>`,
2035/// `<sup>`, math, …) which EPO encodes as nested JSON objects with their
2036/// own `$` payloads.
2037///
2038/// Handles bare string / single object / array shapes and recurses into
2039/// any object/array children. `$` is the canonical XML-text key
2040/// (visited first to keep document order); `@`-prefixed keys are
2041/// attributes and skipped. Intra-fragment whitespace is preserved so
2042/// inline formatting boundaries don't collapse words together; the
2043/// outer string is trimmed once at the end.
2044///
2045/// Use this when you want plain text. If you need to preserve formatting,
2046/// walk the raw JSON yourself.
2047fn flatten_claim_text(val: &serde_json::Value) -> String {
2048    let mut out = String::new();
2049    collect_text_fragments(val, &mut out);
2050    out.trim().to_string()
2051}
2052
2053/// `true` if `val` is a self-contained EPO claim-text leaf — a bare string
2054/// or an object whose only data key is `$` (the canonical XML-text key).
2055/// Used to distinguish a packed claim set (`claim-text` is an array of
2056/// such leaves, each a numbered claim) from an inline-formatting payload
2057/// (array mixes strings with `<sub>` / `<i>` / `<sup>` tag objects).
2058fn is_leaf_claim_text(val: &serde_json::Value) -> bool {
2059    match val {
2060        serde_json::Value::String(_) => true,
2061        serde_json::Value::Object(map) => map.keys().all(|k| k == "$" || k.starts_with('@')),
2062        _ => false,
2063    }
2064}
2065
2066/// Recursive helper for [`flatten_claim_text`]. Appends every string
2067/// value reachable from `val` to `out` in document order. Preserves
2068/// internal whitespace; trimming is the caller's responsibility.
2069fn collect_text_fragments(val: &serde_json::Value, out: &mut String) {
2070    match val {
2071        serde_json::Value::String(s) => out.push_str(s),
2072        serde_json::Value::Array(arr) => {
2073            for item in arr {
2074                collect_text_fragments(item, out);
2075            }
2076        }
2077        serde_json::Value::Object(map) => {
2078            if let Some(text) = map.get("$") {
2079                collect_text_fragments(text, out);
2080            }
2081            for (k, v) in map {
2082                if k == "$" || k.starts_with('@') {
2083                    continue;
2084                }
2085                collect_text_fragments(v, out);
2086            }
2087        }
2088        _ => {}
2089    }
2090}
2091
2092/// Parse INPADOC family response to extract family members.
2093///
2094/// Returns one entry per publication. EPO emits multiple publications per
2095/// `patent_id` (e.g. EP1000000 appears as both A1 in 2000-05-17 and B1 in
2096/// 2003-02-12 — same patent at different stages); both are returned, in the
2097/// order EPO sent them. Callers that want only the granted version per
2098/// country can filter on `kind`.
2099pub fn parse_family(json: &serde_json::Value) -> Vec<FamilyMember> {
2100    let members =
2101        iter_array_or_one(&json["ops:world-patent-data"]["ops:patent-family"]["ops:family-member"]);
2102
2103    let mut result: Vec<FamilyMember> = Vec::new();
2104
2105    for member in members {
2106        let pub_ref = &member["publication-reference"]["document-id"];
2107        let doc_ids = iter_array_or_one(pub_ref);
2108        if doc_ids.is_empty() {
2109            continue;
2110        }
2111
2112        // Pick the first epodoc/docdb id; that is the canonical publication ref.
2113        let mut chosen: Option<&serde_json::Value> = None;
2114        for doc_id in &doc_ids {
2115            let doc_type = doc_id["@document-id-type"].as_str().unwrap_or("");
2116            if doc_type == "epodoc" || doc_type == "docdb" {
2117                chosen = Some(doc_id);
2118                break;
2119            }
2120        }
2121        let Some(doc_id) = chosen else { continue };
2122
2123        let doc_type = doc_id["@document-id-type"].as_str().unwrap_or("");
2124        let country = doc_id["country"]["$"].as_str().unwrap_or("").to_string();
2125        let doc_number = doc_id["doc-number"]["$"].as_str().unwrap_or("");
2126        let kind = doc_id["kind"]["$"].as_str().unwrap_or("").to_string();
2127
2128        if doc_number.is_empty() {
2129            continue;
2130        }
2131
2132        let patent_id = if doc_type == "epodoc" {
2133            doc_number.to_string()
2134        } else {
2135            format!("{country}{doc_number}")
2136        };
2137
2138        result.push(FamilyMember {
2139            patent_id,
2140            country,
2141            kind,
2142            title: extract_text_by_lang(&member["invention-title"]),
2143            publication_date: extract_date(pub_ref),
2144        });
2145    }
2146
2147    result
2148}
2149
2150#[cfg(test)]
2151mod tests {
2152    use super::*;
2153
2154    #[test]
2155    fn test_parse_family_empty() {
2156        let json = serde_json::json!({});
2157        let result = parse_family(&json);
2158        assert!(result.is_empty());
2159    }
2160
2161    #[test]
2162    fn test_parse_family_single_member() {
2163        let json = serde_json::json!({
2164            "ops:world-patent-data": {
2165                "ops:patent-family": {
2166                    "ops:family-member": {
2167                        "publication-reference": {
2168                            "document-id": [
2169                                {
2170                                    "@document-id-type": "epodoc",
2171                                    "country": {"$": "EP"},
2172                                    "doc-number": {"$": "EP1234567"},
2173                                    "kind": {"$": "A1"}
2174                                }
2175                            ]
2176                        },
2177                        "invention-title": {"$": "Test invention", "@lang": "en"}
2178                    }
2179                }
2180            }
2181        });
2182        let result = parse_family(&json);
2183        assert_eq!(result.len(), 1);
2184        assert_eq!(result[0].patent_id, "EP1234567");
2185        assert_eq!(result[0].country, "EP");
2186        assert_eq!(result[0].kind, "A1");
2187        assert_eq!(result[0].title, "Test invention");
2188    }
2189
2190    #[test]
2191    fn test_parse_family_multiple_members() {
2192        let json = serde_json::json!({
2193            "ops:world-patent-data": {
2194                "ops:patent-family": {
2195                    "ops:family-member": [
2196                        {
2197                            "publication-reference": {
2198                                "document-id": [
2199                                    {
2200                                        "@document-id-type": "epodoc",
2201                                        "country": {"$": "EP"},
2202                                        "doc-number": {"$": "EP111"},
2203                                        "kind": {"$": "A1"}
2204                                    }
2205                                ]
2206                            },
2207                            "invention-title": {"$": "Invention A", "@lang": "en"}
2208                        },
2209                        {
2210                            "publication-reference": {
2211                                "document-id": [
2212                                    {
2213                                        "@document-id-type": "epodoc",
2214                                        "country": {"$": "US"},
2215                                        "doc-number": {"$": "US222"},
2216                                        "kind": {"$": "B1"}
2217                                    }
2218                                ]
2219                            },
2220                            "invention-title": {"$": "Invention B", "@lang": "en"}
2221                        }
2222                    ]
2223                }
2224            }
2225        });
2226        let result = parse_family(&json);
2227        assert_eq!(result.len(), 2);
2228        assert_eq!(result[0].patent_id, "EP111");
2229        assert_eq!(result[1].patent_id, "US222");
2230    }
2231
2232    #[test]
2233    fn test_parse_family_keeps_all_publications_per_patent() {
2234        // Same patent at A1 (application) and B1 (granted) — both returned,
2235        // in the order EPO sent them.
2236        let json = serde_json::json!({
2237            "ops:world-patent-data": {
2238                "ops:patent-family": {
2239                    "ops:family-member": [
2240                        {
2241                            "publication-reference": {
2242                                "document-id": [
2243                                    {
2244                                        "@document-id-type": "epodoc",
2245                                        "country": {"$": "EP"},
2246                                        "date": {"$": "20000517"},
2247                                        "doc-number": {"$": "EP111"},
2248                                        "kind": {"$": "A1"}
2249                                    }
2250                                ]
2251                            },
2252                            "invention-title": {"$": "Same patent", "@lang": "en"}
2253                        },
2254                        {
2255                            "publication-reference": {
2256                                "document-id": [
2257                                    {
2258                                        "@document-id-type": "epodoc",
2259                                        "country": {"$": "EP"},
2260                                        "date": {"$": "20030212"},
2261                                        "doc-number": {"$": "EP111"},
2262                                        "kind": {"$": "B1"}
2263                                    }
2264                                ]
2265                            },
2266                            "invention-title": {"$": "Same patent v2", "@lang": "en"}
2267                        }
2268                    ]
2269                }
2270            }
2271        });
2272        let result = parse_family(&json);
2273        assert_eq!(result.len(), 2);
2274        assert_eq!(result[0].kind, "A1");
2275        assert_eq!(result[0].publication_date.as_deref(), Some("2000-05-17"));
2276        assert_eq!(result[1].kind, "B1");
2277        assert_eq!(result[1].publication_date.as_deref(), Some("2003-02-12"));
2278    }
2279
2280    #[test]
2281    fn test_parse_family_keeps_all_publications_when_no_dates() {
2282        let json = serde_json::json!({
2283            "ops:world-patent-data": {
2284                "ops:patent-family": {
2285                    "ops:family-member": [
2286                        {
2287                            "publication-reference": {
2288                                "document-id": [{
2289                                    "@document-id-type": "epodoc",
2290                                    "country": {"$": "EP"},
2291                                    "doc-number": {"$": "EP111"},
2292                                    "kind": {"$": "A1"}
2293                                }]
2294                            }
2295                        },
2296                        {
2297                            "publication-reference": {
2298                                "document-id": [{
2299                                    "@document-id-type": "epodoc",
2300                                    "country": {"$": "EP"},
2301                                    "doc-number": {"$": "EP111"},
2302                                    "kind": {"$": "A2"}
2303                                }]
2304                            }
2305                        }
2306                    ]
2307                }
2308            }
2309        });
2310        let result = parse_family(&json);
2311        assert_eq!(result.len(), 2);
2312        assert_eq!(result[0].kind, "A1");
2313        assert_eq!(result[1].kind, "A2");
2314    }
2315
2316    #[test]
2317    fn test_parse_search_results_empty() {
2318        let json = serde_json::json!({});
2319        let result = parse_search_results(&json);
2320        assert_eq!(result.total_count, 0);
2321        assert!(result.patents.is_empty());
2322    }
2323
2324    #[test]
2325    fn test_parse_search_results_single() {
2326        let json = serde_json::json!({
2327            "ops:world-patent-data": {
2328                "ops:biblio-search": {
2329                    "@total-result-count": "1",
2330                    "ops:search-result": {
2331                        "ops:range": {"@begin": "1", "@end": "1"},
2332                        "exchange-documents": {
2333                            "exchange-document": {
2334                                "bibliographic-data": {
2335                                    "invention-title": [{"$": "Test title", "@lang": "en"}],
2336                                    "parties": {
2337                                        "applicants": {
2338                                            "applicant": [{"applicant-name": {"name": {"$": "Acme Corp"}}}]
2339                                        }
2340                                    },
2341                                    "publication-reference": {
2342                                        "document-id": [{
2343                                            "@document-id-type": "epodoc",
2344                                            "doc-number": {"$": "EP1234567"}
2345                                        }]
2346                                    },
2347                                    "application-reference": {
2348                                        "document-id": [{"date": {"$": "20200115"}}]
2349                                    },
2350                                    "classifications-ipcr": {
2351                                        "classification-ipcr": [{"text": {"$": "H01M 10/48"}}]
2352                                    }
2353                                },
2354                                "abstract": [{"$": "Test abstract text", "@lang": "en"}]
2355                            }
2356                        }
2357                    }
2358                }
2359            }
2360        });
2361        let result = parse_search_results(&json);
2362        assert_eq!(result.total_count, 1);
2363        assert_eq!(result.range, (1, 1));
2364        assert_eq!(result.patents.len(), 1);
2365        assert_eq!(result.patents[0].patent_id, "EP1234567");
2366        assert_eq!(result.patents[0].title, "Test title");
2367        assert_eq!(result.patents[0].abstract_text, "Test abstract text");
2368        assert_eq!(result.patents[0].assignee.as_deref(), Some("Acme Corp"));
2369        assert_eq!(result.patents[0].filing_date.as_deref(), Some("2020-01-15"));
2370    }
2371
2372    #[test]
2373    fn test_parse_search_results_multiple() {
2374        let json = serde_json::json!({
2375            "ops:world-patent-data": {
2376                "ops:biblio-search": {
2377                    "@total-result-count": "42",
2378                    "ops:search-result": {
2379                        "ops:range": {"@begin": "1", "@end": "2"},
2380                        "exchange-documents": {
2381                            "exchange-document": [
2382                                {
2383                                    "bibliographic-data": {
2384                                        "invention-title": [{"$": "Patent A", "@lang": "en"}],
2385                                        "publication-reference": {
2386                                            "document-id": [{
2387                                                "@document-id-type": "epodoc",
2388                                                "doc-number": {"$": "EP111"}
2389                                            }]
2390                                        }
2391                                    },
2392                                    "abstract": [{"$": "Abstract A", "@lang": "en"}]
2393                                },
2394                                {
2395                                    "bibliographic-data": {
2396                                        "invention-title": [{"$": "Patent B", "@lang": "en"}],
2397                                        "publication-reference": {
2398                                            "document-id": [{
2399                                                "@document-id-type": "epodoc",
2400                                                "doc-number": {"$": "US222"}
2401                                            }]
2402                                        }
2403                                    },
2404                                    "abstract": [{"$": "Abstract B", "@lang": "en"}]
2405                                }
2406                            ]
2407                        }
2408                    }
2409                }
2410            }
2411        });
2412        let result = parse_search_results(&json);
2413        assert_eq!(result.total_count, 42);
2414        assert_eq!(result.range, (1, 2));
2415        assert_eq!(result.patents.len(), 2);
2416        assert_eq!(result.patents[0].patent_id, "EP111");
2417        assert_eq!(result.patents[1].patent_id, "US222");
2418    }
2419
2420    #[test]
2421    fn test_parse_search_results_deduplicates() {
2422        let json = serde_json::json!({
2423            "ops:world-patent-data": {
2424                "ops:biblio-search": {
2425                    "@total-result-count": "2",
2426                    "ops:search-result": {
2427                        "ops:range": {"@begin": "1", "@end": "2"},
2428                        "exchange-documents": {
2429                            "exchange-document": [
2430                                {
2431                                    "bibliographic-data": {
2432                                        "invention-title": [{"$": "Same", "@lang": "en"}],
2433                                        "publication-reference": {
2434                                            "document-id": [{"@document-id-type": "epodoc", "doc-number": {"$": "EP111"}}]
2435                                        }
2436                                    }
2437                                },
2438                                {
2439                                    "bibliographic-data": {
2440                                        "invention-title": [{"$": "Same v2", "@lang": "en"}],
2441                                        "publication-reference": {
2442                                            "document-id": [{"@document-id-type": "epodoc", "doc-number": {"$": "EP111"}}]
2443                                        }
2444                                    }
2445                                }
2446                            ]
2447                        }
2448                    }
2449                }
2450            }
2451        });
2452        let result = parse_search_results(&json);
2453        assert_eq!(
2454            result.patents.len(),
2455            1,
2456            "Duplicate patent IDs should be deduplicated"
2457        );
2458    }
2459
2460    #[test]
2461    fn test_parse_biblio_empty() {
2462        let json = serde_json::json!({});
2463        let result = parse_biblio(&json);
2464        assert!(result.title.is_empty());
2465        assert!(result.abstract_text.is_empty());
2466        assert!(result.applicants.is_empty());
2467        assert!(result.inventors.is_empty());
2468        assert!(result.kind_code.is_none());
2469        assert!(result.family_id.is_none());
2470        assert!(result.cpc_classifications.is_empty());
2471    }
2472
2473    /// Mirrors the real EP1000000 response shape: two exchange-documents
2474    /// (A1 + B1), abstract nested under `p.$`, full IPC codes via two blocks
2475    /// (`classification-ipc` + `classifications-ipcr`), CPC under
2476    /// `patent-classifications`, inventors + priority claim. Verifies the
2477    /// parser picks B1, extracts every field, and produces full IPC codes.
2478    #[test]
2479    fn test_parse_biblio_real_shape_a1_b1() {
2480        let abstract_obj = serde_json::json!({
2481            "@lang": "en",
2482            "p": {"$": "An apparatus for manufacturing green bricks."}
2483        });
2484        let json = serde_json::json!({
2485            "ops:world-patent-data": {
2486                "exchange-documents": {
2487                    "exchange-document": [
2488                        {
2489                            "@country": "EP",
2490                            "@doc-number": "1000000",
2491                            "@family-id": "19768124",
2492                            "@kind": "A1",
2493                            "abstract": abstract_obj,
2494                            "bibliographic-data": {
2495                                "invention-title": [
2496                                    {"@lang": "de", "$": "DE title"},
2497                                    {"@lang": "en", "$": "Apparatus for manufacturing green bricks"}
2498                                ],
2499                                "publication-reference": {
2500                                    "document-id": [{
2501                                        "@document-id-type": "epodoc",
2502                                        "date": {"$": "20000517"},
2503                                        "doc-number": {"$": "EP1000000"}
2504                                    }]
2505                                },
2506                                "application-reference": {
2507                                    "document-id": [{
2508                                        "@document-id-type": "epodoc",
2509                                        "date": {"$": "19991108"},
2510                                        "doc-number": {"$": "EP19990203729"}
2511                                    }]
2512                                },
2513                                "references-cited": {
2514                                    "citation": [{
2515                                        "@cited-phase": "national-search-report",
2516                                        "patcit": {
2517                                            "document-id": [{
2518                                                "@document-id-type": "epodoc",
2519                                                "doc-number": {"$": "EP0680812"}
2520                                            }]
2521                                        }
2522                                    }]
2523                                }
2524                            }
2525                        },
2526                        {
2527                            "@country": "EP",
2528                            "@doc-number": "1000000",
2529                            "@family-id": "19768124",
2530                            "@kind": "B1",
2531                            "abstract": abstract_obj,
2532                            "bibliographic-data": {
2533                                "invention-title": [
2534                                    {"@lang": "en", "$": "Apparatus for manufacturing green bricks"}
2535                                ],
2536                                "publication-reference": {
2537                                    "document-id": [{
2538                                        "@document-id-type": "epodoc",
2539                                        "date": {"$": "20030212"},
2540                                        "doc-number": {"$": "EP1000000"}
2541                                    }]
2542                                },
2543                                "application-reference": {
2544                                    "document-id": [{
2545                                        "@document-id-type": "epodoc",
2546                                        "date": {"$": "19991108"},
2547                                        "doc-number": {"$": "EP19990203729"}
2548                                    }]
2549                                },
2550                                "classification-ipc": {
2551                                    "text": [
2552                                        {"$": "B28B1/29"},
2553                                        {"$": "B28B5/02"}
2554                                    ]
2555                                },
2556                                "classifications-ipcr": {
2557                                    "classification-ipcr": [
2558                                        {"@sequence": "1", "text": {"$": "B28B   1/    29            A I"}},
2559                                        {"@sequence": "2", "text": {"$": "H02P   6/    08            A I"}}
2560                                    ]
2561                                },
2562                                "patent-classifications": {
2563                                    "patent-classification": [{
2564                                        "@sequence": "1",
2565                                        "classification-scheme": {"@office": "EP", "@scheme": "CPCI"},
2566                                        "section": {"$": "B"},
2567                                        "class": {"$": "28"},
2568                                        "subclass": {"$": "B"},
2569                                        "main-group": {"$": "1"},
2570                                        "subgroup": {"$": "29"}
2571                                    }]
2572                                },
2573                                "priority-claims": {
2574                                    "priority-claim": {
2575                                        "document-id": [{
2576                                            "@document-id-type": "epodoc",
2577                                            "date": {"$": "19981112"},
2578                                            "doc-number": {"$": "NL19981010536"}
2579                                        }]
2580                                    }
2581                                },
2582                                "parties": {
2583                                    "applicants": {
2584                                        "applicant": [
2585                                            {
2586                                                "@data-format": "epodoc",
2587                                                "@sequence": "1",
2588                                                "applicant-name": {"name": {"$": "BOER BEHEER NIJMEGEN BV DE [NL]"}}
2589                                            },
2590                                            {
2591                                                "@data-format": "original",
2592                                                "@sequence": "1",
2593                                                "applicant-name": {"name": {"$": "BEHEERMAATSCHAPPIJ DE BOER NIJMEGEN B.V."}}
2594                                            }
2595                                        ]
2596                                    },
2597                                    "inventors": {
2598                                        "inventor": [{
2599                                            "@data-format": "epodoc",
2600                                            "inventor-name": {"name": {"$": "KOSMAN WILHELMUS JACOBUS MARIA [NL]"}}
2601                                        }]
2602                                    }
2603                                }
2604                            }
2605                        }
2606                    ]
2607                }
2608            }
2609        });
2610
2611        let r = parse_biblio(&json);
2612
2613        assert_eq!(r.title, "Apparatus for manufacturing green bricks");
2614        assert_eq!(
2615            r.abstract_text,
2616            "An apparatus for manufacturing green bricks."
2617        );
2618        assert_eq!(
2619            r.kind_code.as_deref(),
2620            Some("B1"),
2621            "should prefer B1 over A1"
2622        );
2623        assert_eq!(r.family_id.as_deref(), Some("19768124"));
2624        assert_eq!(
2625            r.publication_date.as_deref(),
2626            Some("2003-02-12"),
2627            "publication_date should come from chosen B1 doc, not A1"
2628        );
2629        assert_eq!(r.filing_date.as_deref(), Some("1999-11-08"));
2630        assert_eq!(r.priority_date.as_deref(), Some("1998-11-12"));
2631        assert_eq!(
2632            r.assignee.as_deref(),
2633            Some("BOER BEHEER NIJMEGEN BV DE [NL]")
2634        );
2635        assert_eq!(r.applicants, vec!["BOER BEHEER NIJMEGEN BV DE [NL]"]);
2636        assert_eq!(r.inventors, vec!["KOSMAN WILHELMUS JACOBUS MARIA [NL]"]);
2637        assert_eq!(
2638            r.classification,
2639            vec!["B28B1/29", "B28B5/02", "H02P6/08"],
2640            "should produce full IPC codes (compact form), de-duped across IPC + IPCR blocks"
2641        );
2642        assert_eq!(r.cpc_classifications, vec!["B28B1/29"]);
2643    }
2644
2645    /// EP1000000's A1 has citations, B1 doesn't. Parser must merge across
2646    /// docs so the caller sees A1's citations even though we chose B1 for
2647    /// biblio.
2648    #[test]
2649    fn test_parse_citations_merges_across_a1_b1() {
2650        let json = serde_json::json!({
2651            "ops:world-patent-data": {
2652                "exchange-documents": {
2653                    "exchange-document": [
2654                        {
2655                            "@kind": "A1",
2656                            "bibliographic-data": {
2657                                "references-cited": {
2658                                    "citation": {
2659                                        "@cited-phase": "search",
2660                                        "patcit": {
2661                                            "document-id": [{
2662                                                "@document-id-type": "epodoc",
2663                                                "doc-number": {"$": "EP0680812"}
2664                                            }]
2665                                        }
2666                                    }
2667                                }
2668                            }
2669                        },
2670                        {
2671                            "@kind": "B1",
2672                            "bibliographic-data": {}
2673                        }
2674                    ]
2675                }
2676            }
2677        });
2678        let r = parse_citations(&json);
2679        assert_eq!(r.cited.len(), 1);
2680        assert_eq!(r.cited[0].patent_id, "EP0680812");
2681    }
2682
2683    #[test]
2684    fn test_normalize_ipcr_drops_flags() {
2685        // 2-digit subgroup, basic shape.
2686        assert_eq!(
2687            normalize_ipcr_text("B28B   1/    29            A I"),
2688            "B28B1/29"
2689        );
2690        // Different section, same width.
2691        assert_eq!(
2692            normalize_ipcr_text("H02P   6/    08            A I"),
2693            "H02P6/08"
2694        );
2695        // 4-digit subgroup (subdivided main-group).
2696        assert_eq!(
2697            normalize_ipcr_text("G05B  19/  4093            A I"),
2698            "G05B19/4093"
2699        );
2700        // 5-digit subgroup — common in CPC-aligned IPCR for new subdivisions.
2701        assert_eq!(
2702            normalize_ipcr_text("G05B  19/  40938            A I"),
2703            "G05B19/40938"
2704        );
2705        // 3-digit subgroup with leading zero significance.
2706        assert_eq!(
2707            normalize_ipcr_text("G06T   7/   246            A I"),
2708            "G06T7/246"
2709        );
2710        // Single-digit subgroup with `00` padding still normalises cleanly.
2711        assert_eq!(
2712            normalize_ipcr_text("B28B   7/    00            A I"),
2713            "B28B7/00"
2714        );
2715        // Non-inventive flag (just `A` instead of `A I`).
2716        assert_eq!(
2717            normalize_ipcr_text("A61B   3/   113            A"),
2718            "A61B3/113"
2719        );
2720        // No flags at all (terse form).
2721        assert_eq!(normalize_ipcr_text("B28B 1/29"), "B28B1/29");
2722        // Empty / whitespace-only input shouldn't panic.
2723        assert_eq!(normalize_ipcr_text(""), "");
2724        assert_eq!(normalize_ipcr_text("   "), "");
2725    }
2726
2727    #[test]
2728    fn test_extract_text_by_lang_handles_p_wrapper() {
2729        // Abstract shape: { "@lang": "en", "p": { "$": "..." } }
2730        let val = serde_json::json!({"@lang": "en", "p": {"$": "Abstract body."}});
2731        assert_eq!(extract_text_by_lang(&val), "Abstract body.");
2732
2733        // Title shape: { "@lang": "en", "$": "..." }
2734        let val = serde_json::json!({"@lang": "en", "$": "Title body."});
2735        assert_eq!(extract_text_by_lang(&val), "Title body.");
2736    }
2737
2738    #[test]
2739    fn test_parse_citations_empty() {
2740        let json = serde_json::json!({});
2741        let result = parse_citations(&json);
2742        assert!(result.cited.is_empty());
2743        assert!(result.citing.is_empty());
2744    }
2745
2746    #[test]
2747    fn test_parse_citations_single() {
2748        let json = serde_json::json!({
2749            "ops:world-patent-data": {
2750                "exchange-documents": {
2751                    "exchange-document": {
2752                        "bibliographic-data": {
2753                            "references-cited": {
2754                                "citation": {
2755                                    "@cited-phase": "search",
2756                                    "patcit": {
2757                                        "document-id": [
2758                                            {
2759                                                "@document-id-type": "epodoc",
2760                                                "country": {"$": "US"},
2761                                                "doc-number": {"$": "US7654321"},
2762                                                "kind": {"$": "A1"}
2763                                            }
2764                                        ]
2765                                    }
2766                                }
2767                            }
2768                        }
2769                    }
2770                }
2771            }
2772        });
2773        let result = parse_citations(&json);
2774        assert_eq!(result.cited.len(), 1);
2775        assert_eq!(result.cited[0].patent_id, "US7654321");
2776        assert_eq!(result.cited[0].phase, "search");
2777    }
2778
2779    #[test]
2780    fn test_parse_citations_multiple() {
2781        let json = serde_json::json!({
2782            "ops:world-patent-data": {
2783                "exchange-documents": {
2784                    "exchange-document": {
2785                        "bibliographic-data": {
2786                            "references-cited": {
2787                                "citation": [
2788                                    {
2789                                        "@cited-phase": "search",
2790                                        "patcit": {
2791                                            "document-id": [
2792                                                {
2793                                                    "@document-id-type": "epodoc",
2794                                                    "country": {"$": "US"},
2795                                                    "doc-number": {"$": "US111"},
2796                                                    "kind": {"$": "A1"}
2797                                                }
2798                                            ]
2799                                        }
2800                                    },
2801                                    {
2802                                        "@cited-phase": "examination",
2803                                        "patcit": {
2804                                            "document-id": [
2805                                                {
2806                                                    "@document-id-type": "epodoc",
2807                                                    "country": {"$": "EP"},
2808                                                    "doc-number": {"$": "EP222"},
2809                                                    "kind": {"$": "B1"}
2810                                                }
2811                                            ]
2812                                        }
2813                                    }
2814                                ]
2815                            }
2816                        }
2817                    }
2818                }
2819            }
2820        });
2821        let result = parse_citations(&json);
2822        assert_eq!(result.cited.len(), 2);
2823        assert_eq!(result.cited[0].patent_id, "US111");
2824        assert_eq!(result.cited[1].patent_id, "EP222");
2825    }
2826
2827    #[test]
2828    fn test_parse_citations_deduplicates() {
2829        let json = serde_json::json!({
2830            "ops:world-patent-data": {
2831                "exchange-documents": {
2832                    "exchange-document": {
2833                        "bibliographic-data": {
2834                            "references-cited": {
2835                                "citation": [
2836                                    {
2837                                        "@cited-phase": "search",
2838                                        "patcit": {
2839                                            "document-id": [{
2840                                                "@document-id-type": "epodoc",
2841                                                "country": {"$": "US"},
2842                                                "doc-number": {"$": "US111"},
2843                                                "kind": {"$": "A1"}
2844                                            }]
2845                                        }
2846                                    },
2847                                    {
2848                                        "@cited-phase": "examination",
2849                                        "patcit": {
2850                                            "document-id": [{
2851                                                "@document-id-type": "epodoc",
2852                                                "country": {"$": "US"},
2853                                                "doc-number": {"$": "US111"},
2854                                                "kind": {"$": "A2"}
2855                                            }]
2856                                        }
2857                                    }
2858                                ]
2859                            }
2860                        }
2861                    }
2862                }
2863            }
2864        });
2865        let result = parse_citations(&json);
2866        assert_eq!(
2867            result.cited.len(),
2868            1,
2869            "Duplicate patent IDs should be deduplicated"
2870        );
2871    }
2872
2873    #[test]
2874    fn test_parse_throttling_idle() {
2875        let s = parse_throttling_header(
2876            "idle (images=green:200, inpadoc=green:60, retrieval=green:200, search=green:30)",
2877            None,
2878            None,
2879        )
2880        .unwrap();
2881        assert_eq!(s.load, ThrottlingLoad::Idle);
2882        assert_eq!(s.endpoints["search"].remaining_per_minute, 30);
2883        assert_eq!(s.endpoints["search"].color, ThrottlingColor::Green);
2884        assert!(!s.is_exhausted());
2885    }
2886
2887    #[test]
2888    fn test_parse_throttling_busy_real_shape() {
2889        // Verbatim from a real EPO biblio response.
2890        let header = "busy (images=green:100, inpadoc=green:45, other=green:1000, retrieval=green:100, search=green:15)";
2891        let s = parse_throttling_header(header, Some(20360), Some(1427448)).unwrap();
2892        assert_eq!(s.load, ThrottlingLoad::Busy);
2893        assert_eq!(s.inpadoc_remaining(), Some(45));
2894        assert_eq!(s.search_remaining(), Some(15));
2895        assert_eq!(s.retrieval_remaining(), Some(100));
2896        assert_eq!(s.hour_bytes_used, Some(20360));
2897        assert_eq!(s.week_bytes_used, Some(1427448));
2898    }
2899
2900    #[test]
2901    fn test_parse_throttling_color_progression() {
2902        let s = parse_throttling_header(
2903            "overloaded (search=yellow:5, retrieval=red:1, images=black:0)",
2904            None,
2905            None,
2906        )
2907        .unwrap();
2908        assert_eq!(s.load, ThrottlingLoad::Overloaded);
2909        assert_eq!(s.endpoints["search"].color, ThrottlingColor::Yellow);
2910        assert_eq!(s.endpoints["retrieval"].color, ThrottlingColor::Red);
2911        assert_eq!(s.endpoints["images"].color, ThrottlingColor::Black);
2912        assert!(s.is_exhausted(), "any black endpoint => exhausted");
2913    }
2914
2915    #[test]
2916    fn test_parse_throttling_zero_remaining_treated_as_exhausted() {
2917        // remaining=0 even with green color is effectively exhausted.
2918        let s = parse_throttling_header("idle (search=green:0)", None, None).unwrap();
2919        assert!(s.is_exhausted());
2920    }
2921
2922    #[test]
2923    fn test_parse_throttling_unknown_load_preserves_endpoints() {
2924        let s = parse_throttling_header("weird_state (inpadoc=green:45)", None, None).unwrap();
2925        assert_eq!(
2926            s.load,
2927            ThrottlingLoad::Other("weird_state".to_string()),
2928            "unknown load string should be preserved verbatim"
2929        );
2930        assert_eq!(s.inpadoc_remaining(), Some(45));
2931    }
2932
2933    #[test]
2934    fn test_parse_throttling_malformed_entries_skipped() {
2935        // The good entry survives; the malformed one is dropped silently.
2936        let s = parse_throttling_header(
2937            "idle (inpadoc=green:45, malformed-entry, search=:notanumber)",
2938            None,
2939            None,
2940        )
2941        .unwrap();
2942        assert_eq!(s.endpoints.len(), 1);
2943        assert_eq!(s.inpadoc_remaining(), Some(45));
2944    }
2945
2946    #[test]
2947    fn test_parse_description_empty_returns_patent_id() {
2948        let r = parse_description(&serde_json::json!({}), "EP1");
2949        assert_eq!(r.patent_id, "EP1");
2950        assert!(r.language.is_none());
2951        assert!(r.paragraphs.is_empty());
2952        assert!(r.plain_text.is_empty());
2953    }
2954
2955    #[test]
2956    fn test_parse_description_single_paragraph() {
2957        // EPO sometimes returns `p` as a single object, not an array.
2958        let json = serde_json::json!({
2959            "ops:world-patent-data": {
2960                "ftxt:fulltext-documents": {
2961                    "ftxt:fulltext-document": {
2962                        "description": {
2963                            "@lang": "en",
2964                            "p": {"@num": "0001", "$": "  The invention.  "}
2965                        }
2966                    }
2967                }
2968            }
2969        });
2970        let r = parse_description(&json, "EP1");
2971        assert_eq!(r.language.as_deref(), Some("en"));
2972        assert_eq!(r.paragraphs.len(), 1);
2973        assert_eq!(r.paragraphs[0].num.as_deref(), Some("0001"));
2974        assert_eq!(r.paragraphs[0].text, "The invention.");
2975        assert_eq!(r.plain_text, "The invention.");
2976    }
2977
2978    #[test]
2979    fn test_parse_description_multi_paragraph_joins_blank_skipped() {
2980        let json = serde_json::json!({
2981            "ops:world-patent-data": {
2982                "ftxt:fulltext-documents": {
2983                    "ftxt:fulltext-document": {
2984                        "description": {
2985                            "@lang": "en",
2986                            "p": [
2987                                {"@num": "0001", "$": "BACKGROUND OF THE INVENTION"},
2988                                {"@num": "0002", "$": ""},      // empty: dropped
2989                                {"@num": "0003", "$": "Para three."},
2990                                {"$": "Untagged."}              // no @num
2991                            ]
2992                        }
2993                    }
2994                }
2995            }
2996        });
2997        let r = parse_description(&json, "EP1000000");
2998        assert_eq!(r.paragraphs.len(), 3, "empty paragraph should be skipped");
2999        assert_eq!(r.paragraphs[0].text, "BACKGROUND OF THE INVENTION");
3000        assert_eq!(r.paragraphs[1].num.as_deref(), Some("0003"));
3001        assert_eq!(r.paragraphs[2].num, None);
3002        assert_eq!(
3003            r.plain_text,
3004            "BACKGROUND OF THE INVENTION\n\nPara three.\n\nUntagged."
3005        );
3006    }
3007
3008    #[test]
3009    fn test_parse_claims_empty_returns_patent_id() {
3010        let r = parse_claims(&serde_json::json!({}), "EP1");
3011        assert_eq!(r.patent_id, "EP1");
3012        assert!(r.language.is_none());
3013        assert!(r.claims.is_empty());
3014        assert!(r.plain_text.is_empty());
3015    }
3016
3017    #[test]
3018    fn test_parse_claims_single_claim() {
3019        let json = serde_json::json!({
3020            "ops:world-patent-data": {
3021                "ftxt:fulltext-documents": {
3022                    "ftxt:fulltext-document": {
3023                        "claims": {
3024                            "@lang": "en",
3025                            "claim": {
3026                                "@id": "claim001",
3027                                "@num": "0001",
3028                                "claim-text": {"$": "1. A widget comprising a sprocket."}
3029                            }
3030                        }
3031                    }
3032                }
3033            }
3034        });
3035        let r = parse_claims(&json, "EP1");
3036        assert_eq!(r.language.as_deref(), Some("en"));
3037        assert_eq!(r.claims.len(), 1);
3038        assert_eq!(r.claims[0].id.as_deref(), Some("claim001"));
3039        assert_eq!(r.claims[0].num.as_deref(), Some("0001"));
3040        assert_eq!(r.claims[0].text, "1. A widget comprising a sprocket.");
3041    }
3042
3043    #[test]
3044    fn test_parse_claims_flattens_inline_formatting() {
3045        // EPO encodes inline tags (italics, subscripts, superscripts, math)
3046        // as nested objects whose text lives under their own `$`. The
3047        // flatten must recurse and preserve intra-fragment whitespace.
3048        let json = serde_json::json!({
3049            "ops:world-patent-data": {
3050                "ftxt:fulltext-documents": {
3051                    "ftxt:fulltext-document": {
3052                        "claims": {
3053                            "@lang": "en",
3054                            "claim": {
3055                                "@num": "0001",
3056                                "claim-text": [
3057                                    "1. A widget with mass m",
3058                                    {"sub": {"$": "0"}},
3059                                    " comprising a lattice of ",
3060                                    {"i": {"$": "polycrystalline"}},
3061                                    " silicon doped at 10",
3062                                    {"sup": {"$": "15"}},
3063                                    " atoms/cm",
3064                                    {"sup": {"$": "3"}},
3065                                    "."
3066                                ]
3067                            }
3068                        }
3069                    }
3070                }
3071            }
3072        });
3073        let r = parse_claims(&json, "EP1");
3074        assert_eq!(r.claims.len(), 1);
3075        // Subscripts/superscripts/italics flattened in document order;
3076        // intra-fragment whitespace preserved.
3077        assert_eq!(
3078            r.claims[0].text,
3079            "1. A widget with mass m0 comprising a lattice of polycrystalline silicon doped at 1015 atoms/cm3."
3080        );
3081    }
3082
3083    #[test]
3084    fn test_parse_claims_packed_claim_set_in_single_wrapper() {
3085        // Real EP1000000 shape: one <claim> wrapper with no @num, holding
3086        // an array of 11 self-contained <claim-text> leaves — each a
3087        // numbered claim. Must split into separate Claim entries instead
3088        // of concatenating into one mega-claim.
3089        let json = serde_json::json!({
3090            "ops:world-patent-data": {
3091                "ftxt:fulltext-documents": {
3092                    "ftxt:fulltext-document": {
3093                        "claims": {
3094                            "@lang": "EN",
3095                            "claim": {
3096                                "claim-text": [
3097                                    {"$": "1. A widget."},
3098                                    {"$": "2. The widget of claim 1."},
3099                                    {"$": "3. The widget of claim 2."}
3100                                ]
3101                            }
3102                        }
3103                    }
3104                }
3105            }
3106        });
3107        let r = parse_claims(&json, "EP1");
3108        assert_eq!(r.claims.len(), 3, "packed claim set must split per leaf");
3109        assert_eq!(r.claims[0].text, "1. A widget.");
3110        assert_eq!(r.claims[1].text, "2. The widget of claim 1.");
3111        assert_eq!(r.claims[2].text, "3. The widget of claim 2.");
3112        // Inline formatting case must still be handled as one claim:
3113        // covered by test_parse_claims_flattens_inline_formatting.
3114    }
3115
3116    #[test]
3117    fn test_parse_claims_multi_with_array_text_and_formatting() {
3118        // claim-text as array with nested object (simulating inline formatting).
3119        let json = serde_json::json!({
3120            "ops:world-patent-data": {
3121                "ftxt:fulltext-documents": {
3122                    "ftxt:fulltext-document": {
3123                        "claims": {
3124                            "@lang": "en",
3125                            "claim": [
3126                                {
3127                                    "@num": "0001",
3128                                    "claim-text": [
3129                                        "1. A method comprising:",
3130                                        {"$": "applying force to a "},
3131                                        {"$": "widget"},
3132                                        "."
3133                                    ]
3134                                },
3135                                {
3136                                    "@num": "0002",
3137                                    "claim-text": "2. The method of claim 1, wherein the force is gradient."
3138                                },
3139                                {
3140                                    "@num": "0003",
3141                                    "claim-text": ""  // empty: dropped
3142                                }
3143                            ]
3144                        }
3145                    }
3146                }
3147            }
3148        });
3149        let r = parse_claims(&json, "EP1");
3150        assert_eq!(r.claims.len(), 2, "empty claim should be skipped");
3151        assert!(r.claims[0].text.contains("widget"));
3152        assert!(r.claims[0].text.contains("method"));
3153        assert_eq!(r.claims[1].num.as_deref(), Some("0002"));
3154        assert!(r.plain_text.contains("\n\n"));
3155    }
3156
3157    #[test]
3158    fn test_parse_citations_extracts_metadata() {
3159        // Real EPO shape: each citation carries @cited-by, @cited-phase,
3160        // category.$, document-id[].name.$, document-id[epodoc].date.$.
3161        let json = serde_json::json!({
3162            "ops:world-patent-data": {
3163                "exchange-documents": {
3164                    "exchange-document": {
3165                        "bibliographic-data": {
3166                            "references-cited": {
3167                                "citation": {
3168                                    "@cited-by": "examiner",
3169                                    "@cited-phase": "national-search-report",
3170                                    "category": {"$": "X"},
3171                                    "patcit": {
3172                                        "document-id": [
3173                                            {
3174                                                "@document-id-type": "epodoc",
3175                                                "date": {"$": "19951108"},
3176                                                "doc-number": {"$": "EP0680812"},
3177                                                "name": {"$": "BOER BEHEER NIJMEGEN BV DE"}
3178                                            },
3179                                            {
3180                                                "@document-id-type": "docdb",
3181                                                "country": {"$": "EP"},
3182                                                "doc-number": {"$": "0680812"},
3183                                                "kind": {"$": "A1"}
3184                                            }
3185                                        ]
3186                                    }
3187                                }
3188                            }
3189                        }
3190                    }
3191                }
3192            }
3193        });
3194        let r = parse_citations(&json);
3195        assert_eq!(r.cited.len(), 1);
3196        let c = &r.cited[0];
3197        assert_eq!(c.patent_id, "EP0680812");
3198        assert_eq!(c.phase, "national-search-report");
3199        assert_eq!(c.category.as_deref(), Some("X"));
3200        assert_eq!(c.cited_by.as_deref(), Some("examiner"));
3201        assert_eq!(c.date.as_deref(), Some("1995-11-08"));
3202        assert_eq!(c.name.as_deref(), Some("BOER BEHEER NIJMEGEN BV DE"));
3203    }
3204
3205    #[test]
3206    fn test_parse_citations_skips_non_patent() {
3207        let json = serde_json::json!({
3208            "ops:world-patent-data": {
3209                "exchange-documents": {
3210                    "exchange-document": {
3211                        "bibliographic-data": {
3212                            "references-cited": {
3213                                "citation": [
3214                                    {
3215                                        "nplcit": {
3216                                            "text": {"$": "Non-patent literature reference"}
3217                                        }
3218                                    },
3219                                    {
3220                                        "@cited-phase": "search",
3221                                        "patcit": {
3222                                            "document-id": [{
3223                                                "@document-id-type": "epodoc",
3224                                                "country": {"$": "US"},
3225                                                "doc-number": {"$": "US999"},
3226                                                "kind": {"$": "A1"}
3227                                            }]
3228                                        }
3229                                    }
3230                                ]
3231                            }
3232                        }
3233                    }
3234                }
3235            }
3236        });
3237        let result = parse_citations(&json);
3238        assert_eq!(result.cited.len(), 1);
3239        assert_eq!(result.cited[0].patent_id, "US999");
3240    }
3241
3242    // --- parse_search_results: malformed-doc + dedup counting ---
3243
3244    fn search_doc(country: &str, num: &str, kind: &str, title: &str) -> serde_json::Value {
3245        serde_json::json!({
3246            "bibliographic-data": {
3247                "publication-reference": {
3248                    "document-id": [{
3249                        "@document-id-type": "epodoc",
3250                        "country": {"$": country},
3251                        "doc-number": {"$": num},
3252                        "kind": {"$": kind}
3253                    }]
3254                },
3255                "invention-title": {"$": title, "@lang": "en"}
3256            }
3257        })
3258    }
3259
3260    /// Mirrors the real EPO shape: `ops:range` lives on `ops:biblio-search`
3261    /// (NOT on `ops:search-result`), and `exchange-documents` is an array
3262    /// of `{exchange-document: {…}}` wrappers, one per result. Pre-fix the
3263    /// parser silently returned `range: (0,0), patents: []` for live data.
3264    #[test]
3265    fn test_parse_search_results_real_shape() {
3266        let json = serde_json::json!({
3267            "ops:world-patent-data": {
3268                "ops:biblio-search": {
3269                    "@total-result-count": "194",
3270                    "ops:range": {"@begin": "1", "@end": "10"},
3271                    "ops:search-result": {
3272                        "exchange-documents": [
3273                            {
3274                                "exchange-document": {
3275                                    "@country": "CN",
3276                                    "@kind": "A",
3277                                    "@family-id": "89371517",
3278                                    "abstract": [
3279                                        {"@lang": "en", "p": {"$": "First abstract."}},
3280                                        {"@lang": "ol", "p": {"$": "其他语言"}}
3281                                    ],
3282                                    "bibliographic-data": {
3283                                        "invention-title": [
3284                                            {"@lang": "ol", "$": "其他"},
3285                                            {"@lang": "en", "$": "Hit one"}
3286                                        ],
3287                                        "publication-reference": {
3288                                            "document-id": [{
3289                                                "@document-id-type": "epodoc",
3290                                                "date": {"$": "20240105"},
3291                                                "doc-number": {"$": "CN111"}
3292                                            }]
3293                                        },
3294                                        "application-reference": {
3295                                            "document-id": [{
3296                                                "@document-id-type": "epodoc",
3297                                                "date": {"$": "20220526"},
3298                                                "doc-number": {"$": "CN20228037675"}
3299                                            }]
3300                                        },
3301                                        "classifications-ipcr": {
3302                                            "classification-ipcr": {
3303                                                "@sequence": "1",
3304                                                "text": {"$": "G01C  21/    34            A I"}
3305                                            }
3306                                        },
3307                                        "patent-classifications": {
3308                                            "patent-classification": [{
3309                                                "@sequence": "1",
3310                                                "classification-scheme": {"@scheme": "CPCI"},
3311                                                "section": {"$": "G"},
3312                                                "class": {"$": "01"},
3313                                                "subclass": {"$": "C"},
3314                                                "main-group": {"$": "21"},
3315                                                "subgroup": {"$": "34"}
3316                                            }]
3317                                        },
3318                                        "priority-claims": {
3319                                            "priority-claim": {
3320                                                "document-id": {
3321                                                    "@document-id-type": "epodoc",
3322                                                    "date": {"$": "20210702"},
3323                                                    "doc-number": {"$": "US202163218215P"}
3324                                                }
3325                                            }
3326                                        },
3327                                        "parties": {
3328                                            "applicants": {
3329                                                "applicant": [
3330                                                    {
3331                                                        "@data-format": "epodoc",
3332                                                        "@sequence": "1",
3333                                                        "applicant-name": {"name": {"$": "APPLE INC"}}
3334                                                    },
3335                                                    {
3336                                                        "@data-format": "original",
3337                                                        "@sequence": "1",
3338                                                        "applicant-name": {"name": {"$": "苹果公司"}}
3339                                                    }
3340                                                ]
3341                                            },
3342                                            "inventors": {
3343                                                "inventor": [{
3344                                                    "@data-format": "epodoc",
3345                                                    "@sequence": "1",
3346                                                    "inventor-name": {"name": {"$": "KIM YUN-JAE"}}
3347                                                }]
3348                                            }
3349                                        }
3350                                    }
3351                                }
3352                            },
3353                            {
3354                                "exchange-document": {
3355                                    "@country": "US",
3356                                    "@kind": "A1",
3357                                    "bibliographic-data": {
3358                                        "invention-title": {"@lang": "en", "$": "Hit two"},
3359                                        "publication-reference": {
3360                                            "document-id": [{
3361                                                "@document-id-type": "epodoc",
3362                                                "doc-number": {"$": "US222"}
3363                                            }]
3364                                        }
3365                                    }
3366                                }
3367                            }
3368                        ]
3369                    }
3370                }
3371            }
3372        });
3373        let r = parse_search_results(&json);
3374        assert_eq!(r.total_count, 194);
3375        assert_eq!(
3376            r.range,
3377            (1, 10),
3378            "range should come from biblio-search.ops:range"
3379        );
3380        assert_eq!(
3381            r.patents.len(),
3382            2,
3383            "should unwrap each `exchange-document` from the array"
3384        );
3385
3386        let p0 = &r.patents[0];
3387        assert_eq!(p0.patent_id, "CN111");
3388        assert_eq!(p0.title, "Hit one");
3389        assert_eq!(p0.abstract_text, "First abstract.");
3390        assert_eq!(p0.kind_code.as_deref(), Some("A"));
3391        assert_eq!(p0.family_id.as_deref(), Some("89371517"));
3392        assert_eq!(p0.publication_date.as_deref(), Some("2024-01-05"));
3393        assert_eq!(p0.filing_date.as_deref(), Some("2022-05-26"));
3394        assert_eq!(p0.priority_date.as_deref(), Some("2021-07-02"));
3395        assert_eq!(p0.assignee.as_deref(), Some("APPLE INC"));
3396        assert_eq!(p0.applicants, vec!["APPLE INC"]);
3397        assert_eq!(p0.inventors, vec!["KIM YUN-JAE"]);
3398        assert_eq!(p0.classification, vec!["G01C21/34"]);
3399        assert_eq!(p0.cpc_classifications, vec!["G01C21/34"]);
3400
3401        let p1 = &r.patents[1];
3402        assert_eq!(p1.patent_id, "US222");
3403        assert_eq!(p1.title, "Hit two");
3404        // p1 is the minimal/sparse variant — fields gracefully fall back to defaults.
3405        assert!(p1.applicants.is_empty());
3406        assert!(p1.inventors.is_empty());
3407        assert!(p1.priority_date.is_none());
3408    }
3409
3410    #[test]
3411    fn test_parse_search_results_skips_missing_id() {
3412        // Doc with no publication-reference must be dropped (malformed) so
3413        // an empty-id patent row never surfaces to the tool caller.
3414        let json = serde_json::json!({
3415            "ops:world-patent-data": {
3416                "ops:biblio-search": {
3417                    "@total-result-count": "2",
3418                    "ops:search-result": {
3419                        "ops:range": {"@begin": "1", "@end": "2"},
3420                        "exchange-documents": {
3421                            "exchange-document": [
3422                                search_doc("EP", "EP42", "A1", "Good"),
3423                                serde_json::json!({
3424                                    "bibliographic-data": {
3425                                        "invention-title": {"$": "Orphan", "@lang": "en"}
3426                                    }
3427                                }),
3428                            ]
3429                        }
3430                    }
3431                }
3432            }
3433        });
3434        let result = parse_search_results(&json);
3435        assert_eq!(result.patents.len(), 1);
3436        assert_eq!(result.patents[0].patent_id, "EP42");
3437    }
3438}