Skip to main content

clayers_repo/query/
mod.rs

1//! `XPath` queries on repository objects.
2//!
3//! Provides `QueryStore` trait, default `XPath` evaluation via xee-xpath,
4//! revision resolution, and cross-ref search.
5
6#[cfg(any(test, feature = "compliance"))]
7#[allow(clippy::missing_panics_doc)]
8pub mod tests;
9
10use std::collections::HashMap;
11use std::pin::pin;
12
13use async_trait::async_trait;
14use clayers_xml::ContentHash;
15use futures_core::Stream;
16use crate::error::{Error, Result};
17use crate::object::Object;
18use crate::refs;
19use crate::store::{ObjectStore, RefStore};
20
21// ---------------------------------------------------------------------------
22// Types
23// ---------------------------------------------------------------------------
24
/// Output mode for `XPath` queries.
///
/// Selects which [`QueryResult`] variant a query produces. The type is a
/// plain tag, so it is cheap to copy, compare, and use as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QueryMode {
    /// Return the count of matching nodes.
    Count,
    /// Return the text content of matching nodes.
    Text,
    /// Return the serialized XML of matching nodes.
    Xml,
}
35
/// Result of an `XPath` query.
///
/// Each variant corresponds to the [`QueryMode`] of the same name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QueryResult {
    /// Node count.
    Count(usize),
    /// Text content of each matching node.
    Text(Vec<String>),
    /// Serialized XML of each matching node.
    Xml(Vec<String>),
}

/// Namespace prefix-to-URI map for `XPath` evaluation.
///
/// Each element is a `(prefix, uri)` pair.
pub type NamespaceMap = Vec<(String, String)>;

/// Per-document query result, pairing a file path with its matches.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentQueryResult {
    /// File path within the tree (e.g., `overview.xml`).
    pub path: String,
    /// The query result for this document.
    pub result: QueryResult,
}
58
59// ---------------------------------------------------------------------------
60// QueryStore trait
61// ---------------------------------------------------------------------------
62
/// Trait for querying documents stored in the object store.
///
/// Backends can override this to use specialized indexing/search strategies.
/// The default implementation (`default_query_document` in this module)
/// collects the subtree, builds XML via export, and evaluates `XPath` using
/// xee-xpath.
///
/// Implementors must be `Send + Sync` so the store can be shared across
/// threads.
#[async_trait]
pub trait QueryStore: Send + Sync {
    /// Query a document by its hash.
    ///
    /// * `doc_hash` - hash of the document object to query.
    /// * `xpath` - the `XPath` expression to evaluate.
    /// * `mode` - requested output form (count, text, or XML).
    /// * `namespaces` - `(prefix, uri)` pairs made available to the expression.
    ///
    /// # Errors
    ///
    /// The failure modes are defined by each implementation; the default
    /// implementation fails when the document cannot be loaded or the
    /// `XPath` is invalid.
    async fn query_document(
        &self,
        doc_hash: ContentHash,
        xpath: &str,
        mode: QueryMode,
        namespaces: &NamespaceMap,
    ) -> Result<QueryResult>;
}
79
80// ---------------------------------------------------------------------------
81// Default implementation
82// ---------------------------------------------------------------------------
83
84/// Collect a stream into a `HashMap`.
85async fn try_collect_stream<S>(stream: S) -> Result<HashMap<ContentHash, Object>>
86where
87    S: Stream<Item = Result<(ContentHash, Object)>>,
88{
89    let mut stream = pin!(stream);
90    let mut map = HashMap::new();
91    while let Some(item) = std::future::poll_fn(|cx| stream.as_mut().poll_next(cx)).await {
92        let (hash, obj) = item?;
93        map.insert(hash, obj);
94    }
95    Ok(map)
96}
97
98/// Default query implementation: collect subtree, serialize to XML, evaluate
99/// `XPath` via xee-xpath.
100///
101/// # Errors
102///
103/// Returns an error if the document cannot be loaded or the `XPath` is invalid.
104pub async fn default_query_document(
105    store: &dyn ObjectStore,
106    doc_hash: ContentHash,
107    xpath: &str,
108    mode: QueryMode,
109    namespaces: &NamespaceMap,
110) -> Result<QueryResult> {
111    let objects = try_collect_stream(store.subtree(&doc_hash)).await?;
112
113    // Find the document root.
114    let root_hash = match objects.get(&doc_hash) {
115        Some(Object::Document(doc)) => doc.root,
116        Some(_) => return Err(Error::InvalidObject("expected Document object".into())),
117        None => return Err(Error::NotFound(doc_hash)),
118    };
119
120    // Serialize objects to XML string via the export module.
121    let xml_string = crate::export::build_xml_from_objects(&objects, root_hash)?;
122
123    // Evaluate XPath in a plain fn (all !Send xee-xpath types stay off the async stack).
124    let ns_refs: Vec<(&str, &str)> = namespaces
125        .iter()
126        .map(|(p, u)| (p.as_str(), u.as_str()))
127        .collect();
128    let xml_mode = match mode {
129        QueryMode::Count => clayers_xml::query::QueryMode::Count,
130        QueryMode::Text => clayers_xml::query::QueryMode::Text,
131        QueryMode::Xml => clayers_xml::query::QueryMode::Xml,
132    };
133    let result = clayers_xml::query::evaluate_xpath(&xml_string, xpath, xml_mode, &ns_refs)?;
134    Ok(match result {
135        clayers_xml::query::QueryResult::Count(n) => QueryResult::Count(n),
136        clayers_xml::query::QueryResult::Text(t) => QueryResult::Text(t),
137        clayers_xml::query::QueryResult::Xml(x) => QueryResult::Xml(x),
138    })
139}
140
141// ---------------------------------------------------------------------------
142// Revision resolution
143// ---------------------------------------------------------------------------
144
145/// Resolve a revspec string to a document `ContentHash`.
146///
147/// Handles: raw hex hash, `refs/heads/{name}`, `refs/tags/{name}`, `HEAD`,
148/// bare branch/tag names. Follows commits through trees to reach a document
149/// (uses the first tree entry's document for backwards compatibility).
150///
151/// # Errors
152///
153/// Returns an error if the revspec cannot be resolved.
154pub async fn resolve_to_document(
155    store: &dyn ObjectStore,
156    ref_store: &dyn RefStore,
157    revspec: &str,
158) -> Result<ContentHash> {
159    let hash = resolve_revspec(ref_store, revspec).await?;
160    // Follow commits/tags to reach a document (via tree).
161    follow_to_document(store, hash).await
162}
163
164/// Resolve a revspec string to a tree `ContentHash` and `TreeObject`.
165///
166/// # Errors
167///
168/// Returns an error if the revspec cannot be resolved or doesn't point to a tree.
169pub async fn resolve_to_tree(
170    store: &dyn ObjectStore,
171    ref_store: &dyn RefStore,
172    revspec: &str,
173) -> Result<(ContentHash, crate::object::TreeObject)> {
174    let hash = resolve_revspec(ref_store, revspec).await?;
175    let tree_hash = follow_to_tree(store, hash).await?;
176    let obj = store.get(&tree_hash).await?.ok_or(Error::NotFound(tree_hash))?;
177    let Object::Tree(t) = obj else {
178        return Err(Error::InvalidObject("expected Tree object".into()));
179    };
180    Ok((tree_hash, t))
181}
182
183/// Resolve a revspec string to a commit/tag/direct hash.
184///
185/// # Errors
186///
187/// Returns an error if the revspec cannot be resolved.
188pub async fn resolve_revspec(
189    ref_store: &dyn RefStore,
190    revspec: &str,
191) -> Result<ContentHash> {
192    if let Ok(h) = try_parse_hash(revspec) {
193        Ok(h)
194    } else if revspec == "HEAD" {
195        refs::resolve_head(ref_store)
196            .await?
197            .ok_or_else(|| Error::Ref("HEAD not set".into()))
198    } else if revspec.starts_with("refs/") {
199        ref_store
200            .get_ref(revspec)
201            .await?
202            .ok_or_else(|| Error::Ref(format!("ref not found: {revspec}")))
203    } else if let Some(h) = ref_store.get_ref(&refs::branch_ref(revspec)).await? {
204        Ok(h)
205    } else if let Some(h) = ref_store.get_ref(&refs::tag_ref(revspec)).await? {
206        Ok(h)
207    } else {
208        Err(Error::Ref(format!("cannot resolve revspec: {revspec}")))
209    }
210}
211
212/// Try to parse a hex string as a `ContentHash`.
213fn try_parse_hash(s: &str) -> Result<ContentHash> {
214    if s.len() != 64 {
215        return Err(Error::Ref("not a valid hash".into()));
216    }
217    let bytes: Vec<u8> = (0..64)
218        .step_by(2)
219        .map(|i| u8::from_str_radix(&s[i..i + 2], 16))
220        .collect::<std::result::Result<Vec<u8>, _>>()
221        .map_err(|_| Error::Ref("not a valid hex hash".into()))?;
222    let arr: [u8; 32] = bytes
223        .try_into()
224        .map_err(|_| Error::Ref("not 32 bytes".into()))?;
225    Ok(ContentHash(arr))
226}
227
228/// Follow Commit->tree->first entry document, Tag->target->recurse until we
229/// reach a Document hash. For backwards compatibility with single-document queries.
230async fn follow_to_document(
231    store: &dyn ObjectStore,
232    hash: ContentHash,
233) -> Result<ContentHash> {
234    let obj = store.get(&hash).await?.ok_or(Error::NotFound(hash))?;
235    match obj {
236        Object::Document(_) => Ok(hash),
237        Object::Tree(t) => {
238            // Return the first entry's document for backwards compatibility.
239            t.entries.first()
240                .map(|e| e.document)
241                .ok_or_else(|| Error::InvalidObject("empty tree has no documents".into()))
242        }
243        Object::Commit(c) => Box::pin(follow_to_document(store, c.tree)).await,
244        Object::Tag(t) => Box::pin(follow_to_document(store, t.target)).await,
245        _ => Err(Error::InvalidObject(
246            "revspec resolved to a non-versioning object".into(),
247        )),
248    }
249}
250
251/// Follow Commit->tree, Tag->target->recurse until we reach a Tree hash.
252///
253/// # Errors
254///
255/// Returns an error if objects cannot be loaded or the chain leads to a non-versioning object.
256pub async fn follow_to_tree(
257    store: &dyn ObjectStore,
258    hash: ContentHash,
259) -> Result<ContentHash> {
260    let obj = store.get(&hash).await?.ok_or(Error::NotFound(hash))?;
261    match obj {
262        Object::Tree(_) => Ok(hash),
263        Object::Commit(c) => Ok(c.tree),
264        Object::Tag(t) => Box::pin(follow_to_tree(store, t.target)).await,
265        _ => Err(Error::InvalidObject(
266            "revspec resolved to a non-versioning object".into(),
267        )),
268    }
269}
270
271// ---------------------------------------------------------------------------
272// Cross-ref search
273// ---------------------------------------------------------------------------
274
/// Result of querying across multiple refs.
///
/// `query_refs` produces one value per ref whose tree has not already been
/// queried under an earlier ref.
#[derive(Debug)]
pub struct RefQueryResult {
    /// The ref name (e.g., `refs/heads/main`).
    pub ref_name: String,
    /// The hash the ref points to, exactly as listed by the ref store.
    pub commit_hash: ContentHash,
    /// The document hash after following the commit: the document of the
    /// tree's first entry, or the tree hash itself when the tree is empty.
    pub doc_hash: ContentHash,
    /// The query result, aggregated over every document in the ref's tree.
    pub result: QueryResult,
}
287
288/// Query all refs matching a prefix, deduplicating on document hash.
289///
290/// # Errors
291///
292/// Returns an error if refs cannot be listed or queries fail.
293pub async fn query_refs(
294    store: &(dyn ObjectStore + Sync),
295    query_store: &dyn QueryStore,
296    ref_store: &dyn RefStore,
297    prefix: &str,
298    xpath: &str,
299    mode: QueryMode,
300    namespaces: &NamespaceMap,
301) -> Result<Vec<RefQueryResult>> {
302    let all_refs = ref_store.list_refs(prefix).await?;
303    let mut results = Vec::new();
304    let mut seen_docs = std::collections::HashSet::new();
305
306    for (ref_name, commit_hash) in all_refs {
307        let tree_hash = follow_to_tree(store, commit_hash).await?;
308        if !seen_docs.insert(tree_hash) {
309            continue; // Already queried this tree.
310        }
311        let tree_obj = store.get(&tree_hash).await?.ok_or(Error::NotFound(tree_hash))?;
312        let Object::Tree(tree) = tree_obj else {
313            return Err(Error::InvalidObject("expected Tree object".into()));
314        };
315        // Query each document in the tree, aggregate results.
316        // Skip documents where XPath compilation fails (unknown prefix).
317        let mut doc_results = Vec::new();
318        for entry in &tree.entries {
319            match query_store
320                .query_document(entry.document, xpath, mode, namespaces)
321                .await
322            {
323                Ok(result) => {
324                    doc_results.push(DocumentQueryResult {
325                        path: entry.path.clone(),
326                        result,
327                    });
328                }
329                Err(Error::Xml(ref e)) if e.to_string().contains("compile error") => {}
330                Err(e) => return Err(e),
331            }
332        }
333        let combined = aggregate_results(mode, doc_results);
334        let doc_hash = tree.entries.first()
335            .map_or(tree_hash, |e| e.document);
336        results.push(RefQueryResult {
337            ref_name,
338            commit_hash,
339            doc_hash,
340            result: combined,
341        });
342    }
343
344    Ok(results)
345}
346
347/// Resolve a revspec and query each document in the tree, returning
348/// per-document results with file paths.
349///
350/// When `files` is non-empty, only documents whose path matches one of the
351/// entries are queried (substring match on the tree entry path).
352///
353/// # Errors
354///
355/// Returns an error if resolution or query fails.
356#[allow(clippy::too_many_arguments)]
357pub async fn query_by_document(
358    store: &(dyn ObjectStore + Sync),
359    query_store: &dyn QueryStore,
360    ref_store: &dyn RefStore,
361    revspec: &str,
362    xpath: &str,
363    mode: QueryMode,
364    namespaces: &NamespaceMap,
365    files: &[String],
366) -> Result<Vec<DocumentQueryResult>> {
367    let hash = resolve_revspec(ref_store, revspec).await?;
368    let tree_hash = follow_to_tree(store, hash).await?;
369    let tree_obj = store.get(&tree_hash).await?.ok_or(Error::NotFound(tree_hash))?;
370    let Object::Tree(tree) = tree_obj else {
371        return Err(Error::InvalidObject("expected Tree object".into()));
372    };
373
374    let mut results = Vec::new();
375    for entry in &tree.entries {
376        // Apply file filter if specified.
377        if !files.is_empty() && !files.iter().any(|f| entry.path.contains(f.as_str())) {
378            continue;
379        }
380
381        match query_store
382            .query_document(entry.document, xpath, mode, namespaces)
383            .await
384        {
385            Ok(result) => {
386                // Skip documents with zero matches.
387                let has_matches = match &result {
388                    QueryResult::Count(0) => false,
389                    QueryResult::Count(_) => true,
390                    QueryResult::Text(t) => !t.is_empty(),
391                    QueryResult::Xml(x) => !x.is_empty(),
392                };
393                if has_matches {
394                    results.push(DocumentQueryResult {
395                        path: entry.path.clone(),
396                        result,
397                    });
398                }
399            }
400            Err(Error::Xml(ref e)) if e.to_string().contains("compile error") => {
401                // Document doesn't know the namespace prefix; skip it.
402            }
403            Err(e) => return Err(e),
404        }
405    }
406    Ok(results)
407}
408
409/// Convenience: resolve a revspec, query all documents, aggregate results.
410///
411/// # Errors
412///
413/// Returns an error if resolution or query fails.
414pub async fn query(
415    store: &(dyn ObjectStore + Sync),
416    query_store: &dyn QueryStore,
417    ref_store: &dyn RefStore,
418    revspec: &str,
419    xpath: &str,
420    mode: QueryMode,
421    namespaces: &NamespaceMap,
422) -> Result<QueryResult> {
423    let docs = query_by_document(
424        store, query_store, ref_store, revspec, xpath, mode, namespaces, &[],
425    )
426    .await?;
427    Ok(aggregate_results(mode, docs))
428}
429
430/// Aggregate per-document results into a single combined result.
431fn aggregate_results(mode: QueryMode, docs: Vec<DocumentQueryResult>) -> QueryResult {
432    let mut combined_count = 0usize;
433    let mut combined_texts = Vec::new();
434    let mut combined_xmls = Vec::new();
435    for doc in docs {
436        match doc.result {
437            QueryResult::Count(n) => combined_count += n,
438            QueryResult::Text(ts) => combined_texts.extend(ts),
439            QueryResult::Xml(xs) => combined_xmls.extend(xs),
440        }
441    }
442    match mode {
443        QueryMode::Count => QueryResult::Count(combined_count),
444        QueryMode::Text => QueryResult::Text(combined_texts),
445        QueryMode::Xml => QueryResult::Xml(combined_xmls),
446    }
447}
448