learner/retriever/
json.rs

1//! JSON response parser implementation.
2//!
3//! This module handles parsing of JSON API responses into Paper objects using
4//! configurable field mappings. It supports flexible path-based field extraction
5//! with optional transformations.
6//!
7//! # Example Configuration
8//!
9//! ```toml
10//! [response_format]
11//! type = "json"
12//!
13//! [response_format.field_maps]
14//! title = { path = "message/title/0" }
15//! abstract = { path = "message/abstract" }
16//! publication_date = { path = "message/published-print/date-parts/0" }
17//! authors = { path = "message/author" }
18//! ```
19
20use serde_json::Value;
21
22use super::*;
23
24/// Configuration for processing JSON API responses.
25///
26/// Provides field mapping rules to extract paper metadata from JSON responses
27/// using path-based access patterns.
28///
29/// # Examples
30///
31/// ```no_run
32/// # use std::collections::HashMap;
33/// # use learner::retriever::{json::JsonConfig, FieldMap};
34/// let config = JsonConfig {
35///   field_maps: HashMap::from([("title".to_string(), FieldMap {
36///     path:      "message/title/0".to_string(),
37///     transform: None,
38///   })]),
39/// };
40/// ```
41#[derive(Debug, Clone, Deserialize)]
42pub struct JsonConfig {
43  /// JSON path mappings for paper metadata fields
44  pub field_maps: HashMap<String, FieldMap>,
45}
46
47#[async_trait]
48impl ResponseProcessor for JsonConfig {
49  /// Processes a JSON API response into a Paper object.
50  ///
51  /// Extracts paper metadata from the JSON response using configured field mappings.
52  /// Required fields (title, abstract, publication date, authors) must be present
53  /// and valid.
54  ///
55  /// # Arguments
56  ///
57  /// * `data` - Raw JSON response bytes
58  ///
59  /// # Returns
60  ///
61  /// Returns a Result containing either:
62  /// - A populated Paper object
63  /// - A LearnerError if parsing fails or required fields are missing
64  ///
65  /// # Errors
66  ///
67  /// This method will return an error if:
68  /// - JSON parsing fails
69  /// - Required fields are missing
70  /// - Field values are invalid or cannot be transformed
71  async fn process_response(&self, data: &[u8]) -> Result<Paper> {
72    let json: Value = serde_json::from_slice(data)
73      .map_err(|e| LearnerError::ApiError(format!("Failed to parse JSON: {}", e)))?;
74
75    trace!("Processing JSON response: {}", serde_json::to_string_pretty(&json).unwrap());
76
77    let title = self.extract_field(&json, "title")?;
78    let abstract_text = self.extract_field(&json, "abstract")?;
79    let publication_date =
80      chrono::DateTime::parse_from_rfc3339(&self.extract_field(&json, "publication_date")?)
81        .map(|dt| dt.with_timezone(&Utc))
82        .map_err(|e| LearnerError::ApiError(format!("Invalid date format: {}", e)))?;
83
84    let authors = if let Some(map) = self.field_maps.get("authors") {
85      self.extract_authors(&json, map)?
86    } else {
87      return Err(LearnerError::ApiError("Missing authors mapping".to_string()));
88    };
89
90    let pdf_url = self.field_maps.get("pdf_url").and_then(|map| {
91      self.get_by_path(&json, &map.path).map(|url| {
92        if let Some(transform) = &map.transform {
93          apply_transform(&url, transform).ok().unwrap_or_else(|| url.clone())
94        } else {
95          url.clone()
96        }
97      })
98    });
99
100    let doi = self
101      .field_maps
102      .get("doi")
103      .and_then(|map| self.get_by_path(&json, &map.path))
104      .map(String::from);
105
106    Ok(Paper {
107      title,
108      authors,
109      abstract_text,
110      publication_date,
111      source: String::new(),
112      source_identifier: String::new(),
113      pdf_url,
114      doi,
115    })
116  }
117}
118
119impl JsonConfig {
120  /// Extracts a single field value using configured mapping.
121  ///
122  /// # Errors
123  ///
124  /// Returns error if:
125  /// - Field mapping is missing
126  /// - Field value cannot be found
127  /// - Value transformation fails
128  fn extract_field(&self, json: &Value, field: &str) -> Result<String> {
129    let map = self
130      .field_maps
131      .get(field)
132      .ok_or_else(|| LearnerError::ApiError(format!("Missing field mapping for {}", field)))?;
133
134    let value = self
135      .get_by_path(json, &map.path)
136      .ok_or_else(|| LearnerError::ApiError(format!("No content found for {}", field)))?;
137
138    if let Some(transform) = &map.transform {
139      apply_transform(&value, transform)
140    } else {
141      Ok(value)
142    }
143  }
144
145  /// Retrieves a value from JSON using slash-separated path.
146  ///
147  /// Supports both object key and array index access:
148  /// - "message/title" -> object access
149  /// - "authors/0/name" -> array access
150  ///
151  /// Handles string, array, and number values with appropriate conversion.
152  fn get_by_path(&self, json: &Value, path: &str) -> Option<String> {
153    let mut current = json;
154
155    for part in path.split('/') {
156      current = if let Ok(index) = part.parse::<usize>() {
157        // Handle numeric indices for arrays
158        current.as_array()?.get(index)?
159      } else {
160        // Handle regular object keys
161        current.get(part)?
162      };
163    }
164
165    match current {
166      Value::String(s) => Some(s.clone()),
167      Value::Array(arr) if !arr.is_empty() => arr[0].as_str().map(String::from),
168      Value::Number(n) => Some(n.to_string()),
169      _ => current.as_str().map(String::from),
170    }
171  }
172
173  /// Extracts and processes author information from JSON.
174  ///
175  /// Handles author objects with given/family name fields and optional
176  /// affiliation information. Expects authors as an array matching the
177  /// configured path.
178  ///
179  /// # Errors
180  ///
181  /// Returns error if no valid authors are found in the response.
182  fn extract_authors(&self, json: &Value, map: &FieldMap) -> Result<Vec<Author>> {
183    let authors = if let Some(Value::Array(arr)) = get_path_value(json, &map.path) {
184      arr
185        .iter()
186        .filter_map(|author| {
187          let name = match (author.get("given"), author.get("family")) {
188            (Some(given), Some(family)) => {
189              format!("{} {}", given.as_str().unwrap_or(""), family.as_str().unwrap_or(""))
190            },
191            (Some(given), None) => given.as_str()?.to_string(),
192            (None, Some(family)) => family.as_str()?.to_string(),
193            (None, None) => return None,
194          };
195
196          let affiliation = author
197            .get("affiliation")
198            .and_then(|a| a.as_array())
199            .and_then(|arr| arr.first())
200            .and_then(|aff| aff.get("name"))
201            .and_then(|n| n.as_str())
202            .map(String::from);
203
204          Some(Author { name, affiliation, email: None })
205        })
206        .collect()
207    } else {
208      Vec::new()
209    };
210
211    if authors.is_empty() {
212      Err(LearnerError::ApiError("No authors found".to_string()))
213    } else {
214      Ok(authors)
215    }
216  }
217}
218
219/// Helper function to navigate JSON structure using path.
220///
221/// Similar to get_by_path but returns raw JSON Value instead of
222/// converted string.
223fn get_path_value<'a>(json: &'a Value, path: &str) -> Option<&'a Value> {
224  let mut current = json;
225  for part in path.split('/') {
226    current = current.get(part)?;
227  }
228  Some(current)
229}