Skip to main content

synaptic_lark/loaders/
doc.rs

1use async_trait::async_trait;
2use serde_json::Value;
3use std::collections::HashMap;
4use synaptic_core::{Document, Loader, SynapticError};
5
6use crate::{auth::TokenCache, LarkConfig};
7
8/// Load Feishu/Lark documents and Wiki pages into Synaptic [`Document`]s.
9///
10/// Supports loading specific document tokens directly or traversing a Wiki space
11/// to discover all nodes automatically.
12///
13/// # Example
14///
15/// ```rust,no_run
16/// use synaptic_lark::{LarkConfig, LarkDocLoader};
17/// use synaptic_core::Loader;
18///
19/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
20/// let config = LarkConfig::new("cli_xxx", "secret_xxx");
21/// let loader = LarkDocLoader::new(config)
22///     .with_doc_tokens(vec!["doxcnAbcXxx".to_string()])
23///     .with_wiki_space_id("space_xxx");
24///
25/// let docs = loader.load().await?;
26/// for doc in &docs {
27///     println!("Title: {}", doc.metadata["title"]);
28///     println!("Content length: {}", doc.content.len());
29/// }
30/// # Ok(())
31/// # }
32/// ```
33pub struct LarkDocLoader {
34    token_cache: TokenCache,
35    base_url: String,
36    doc_tokens: Vec<String>,
37    wiki_space_id: Option<String>,
38    client: reqwest::Client,
39}
40
41impl LarkDocLoader {
42    /// Create a new loader using the given config.
43    pub fn new(config: LarkConfig) -> Self {
44        let base_url = config.base_url.clone();
45        Self {
46            token_cache: config.token_cache(),
47            base_url,
48            doc_tokens: vec![],
49            wiki_space_id: None,
50            client: reqwest::Client::new(),
51        }
52    }
53
54    /// Add specific document tokens to load (e.g. `"doxcnAbcXxx"`).
55    pub fn with_doc_tokens(mut self, tokens: Vec<String>) -> Self {
56        self.doc_tokens = tokens;
57        self
58    }
59
60    /// Traverse a Wiki space to load all documents within it.
61    pub fn with_wiki_space_id(mut self, space_id: impl Into<String>) -> Self {
62        self.wiki_space_id = Some(space_id.into());
63        self
64    }
65
66    async fn auth_header(&self) -> Result<String, SynapticError> {
67        let token = self.token_cache.get_token().await?;
68        Ok(format!("Bearer {token}"))
69    }
70
71    /// Fetch the raw text content of a document.
72    async fn fetch_doc_content(&self, doc_token: &str) -> Result<Document, SynapticError> {
73        let auth = self.auth_header().await?;
74        let url = format!(
75            "{}/docx/v1/documents/{}/raw_content",
76            self.base_url, doc_token
77        );
78        let resp = self
79            .client
80            .get(&url)
81            .header("Authorization", auth)
82            .send()
83            .await
84            .map_err(|e| SynapticError::Loader(format!("Lark doc fetch: {e}")))?;
85
86        let body: Value = resp
87            .json()
88            .await
89            .map_err(|e| SynapticError::Loader(format!("Lark doc parse: {e}")))?;
90
91        check_lark_code(&body, "fetch doc content")?;
92
93        let content = body["data"]["content"].as_str().unwrap_or("").to_string();
94        let title = body["data"]["title"].as_str().unwrap_or("").to_string();
95
96        let mut metadata = HashMap::new();
97        metadata.insert("doc_id".to_string(), Value::String(doc_token.to_string()));
98        metadata.insert("title".to_string(), Value::String(title));
99        metadata.insert(
100            "source".to_string(),
101            Value::String(format!("lark:doc:{doc_token}")),
102        );
103        metadata.insert(
104            "url".to_string(),
105            Value::String(format!("https://bytedance.feishu.cn/docx/{doc_token}")),
106        );
107        metadata.insert("doc_type".to_string(), Value::String("docx".to_string()));
108
109        Ok(Document {
110            id: doc_token.to_string(),
111            content,
112            metadata,
113        })
114    }
115
116    /// Discover all doc tokens under a Wiki space node (paginates automatically).
117    async fn list_wiki_nodes(&self, space_id: &str) -> Result<Vec<String>, SynapticError> {
118        let auth = self.auth_header().await?;
119        let mut tokens = Vec::new();
120        let mut page_token: Option<String> = None;
121
122        loop {
123            let mut url = format!(
124                "{}/wiki/v2/spaces/{}/nodes?page_size=50",
125                self.base_url, space_id
126            );
127            if let Some(ref pt) = page_token {
128                url.push_str(&format!("&page_token={pt}"));
129            }
130
131            let resp = self
132                .client
133                .get(&url)
134                .header("Authorization", auth.clone())
135                .send()
136                .await
137                .map_err(|e| SynapticError::Loader(format!("Lark wiki list: {e}")))?;
138
139            let body: Value = resp
140                .json()
141                .await
142                .map_err(|e| SynapticError::Loader(format!("Lark wiki parse: {e}")))?;
143
144            check_lark_code(&body, "list wiki nodes")?;
145
146            if let Some(items) = body["data"]["items"].as_array() {
147                for item in items {
148                    if let Some(obj_token) = item["obj_token"].as_str() {
149                        let obj_type = item["obj_type"].as_str().unwrap_or("");
150                        if obj_type == "docx" || obj_type == "doc" {
151                            tokens.push(obj_token.to_string());
152                        }
153                    }
154                }
155            }
156
157            let has_more = body["data"]["has_more"].as_bool().unwrap_or(false);
158            if !has_more {
159                break;
160            }
161            page_token = body["data"]["page_token"].as_str().map(|s| s.to_string());
162        }
163        Ok(tokens)
164    }
165}
166
167fn check_lark_code(body: &Value, ctx: &str) -> Result<(), SynapticError> {
168    let code = body["code"].as_i64().unwrap_or(-1);
169    if code != 0 {
170        return Err(SynapticError::Loader(format!(
171            "Lark API error ({ctx}) code={code}: {}",
172            body["msg"].as_str().unwrap_or("unknown")
173        )));
174    }
175    Ok(())
176}
177
178#[async_trait]
179impl Loader for LarkDocLoader {
180    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
181        let mut all_tokens = self.doc_tokens.clone();
182
183        if let Some(ref space_id) = self.wiki_space_id {
184            let wiki_tokens = self.list_wiki_nodes(space_id).await?;
185            all_tokens.extend(wiki_tokens);
186        }
187
188        let mut documents = Vec::new();
189        for token in &all_tokens {
190            match self.fetch_doc_content(token).await {
191                Ok(doc) => documents.push(doc),
192                Err(e) => {
193                    tracing::warn!("Failed to load Lark doc {token}: {e}");
194                }
195            }
196        }
197        Ok(documents)
198    }
199}