1use std::{
4 env, fs,
5 io::{self, Read},
6 path::{Path, PathBuf},
7 process::Command as ProcessCommand,
8 thread,
9 time::{Duration, Instant, SystemTime, UNIX_EPOCH},
10};
11
12use clap::{error::ErrorKind, Parser, Subcommand};
13use serde_json::Value;
14use zotron_rpc::{StdProviderCommandRunner, UreqProviderHttpTransport, ZoteroRpc};
15use zotron_types::{
16 bm25_score_chunks, build_embedding_provider_request, build_ocr_provider_request,
17 builtin_ocr_provider_specs, cosine_similarity, execute_embedding_provider_request,
18 is_zotron_evidence_artifact, machine_artifact_exists_for_item,
19 machine_artifact_exists_in_sidecar, machine_artifact_store_root,
20 ocr_provider_spec as raw_ocr_provider_spec, parse_embedding_provider_response,
21 parse_ocr_provider_response, read_machine_artifact_sidecar, rrf_merge,
22 write_machine_artifact_sidecar, ArtifactStorePlatform, EmbeddingChunkInput,
23 EmbeddingRequestInput, EmbeddingVector, MachineArtifactKind, OcrRequestInput,
24 ProviderCommandRunner, ProviderHttpInvocation, ProviderHttpTransport, StructureChunk,
25 DEFAULT_RPC_URL,
26};
27
/// Minimal JSON-RPC calling interface used by the command handlers in this file.
///
/// Handlers take `&mut impl RpcCaller`, so any client implementing this trait
/// (the real [`ZoteroRpc`] or a substitute) can be passed to `run_with_client`.
pub trait RpcCaller {
    /// Invokes `method` with optional JSON `params`.
    ///
    /// Returns the JSON result on success, or the failure rendered as a
    /// message string.
    fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String>;
}
31
/// Serializable description of one OCR provider as exposed by the CLI.
///
/// Built from `zotron_types::OcrProviderSpec` by `cli_ocr_provider_spec` and
/// emitted under the `"providers"` key of the `ocr providers` output.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliOcrProviderSpec {
    /// Provider identifier; filled from the upstream `provider_key`.
    pub id: &'static str,
    /// Provider key; filled from the same upstream `provider_key` as `id`.
    pub provider: &'static str,
    /// Upstream request style rendered via `as_str()`.
    pub request_style: &'static str,
    /// Upstream `auth` value, copied verbatim.
    pub auth: &'static str,
    /// Upstream `auth_header` value, copied verbatim.
    pub auth_header: &'static str,
    /// Upstream `supports_pdf_direct` flag, copied verbatim.
    pub supports_pdf_direct: bool,
    /// Upstream `key_field` value, copied verbatim.
    pub key_field: &'static str,
}
42
/// Serializable description of one embedding provider as exposed by the CLI.
///
/// Built by `embedding_provider_spec` from the spec returned by
/// `zotron_types::embedding_provider_spec`.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliEmbeddingProviderSpec {
    /// Upstream spec `id`.
    pub id: &'static str,
    /// Upstream `provider_key`.
    pub provider: &'static str,
    /// Request style; `"dashscope"` for the `"alibaba"` provider key,
    /// otherwise the upstream style rendered via `as_str()`.
    pub request_style: &'static str,
    /// Upstream default URL, or an empty string when the upstream spec has none.
    pub default_url: String,
    /// Upstream `default_model`.
    pub default_model: &'static str,
    /// Upstream `auth` value.
    pub auth: &'static str,
    /// Upstream `key_field` value.
    pub key_field: &'static str,
}
53
54pub fn ocr_provider_specs() -> Vec<CliOcrProviderSpec> {
55 builtin_ocr_provider_specs()
56 .into_iter()
57 .map(cli_ocr_provider_spec)
58 .collect()
59}
60
61pub fn ocr_provider_spec(provider: &str) -> Result<CliOcrProviderSpec, String> {
62 zotron_types::ocr_provider_spec(provider).map(cli_ocr_provider_spec)
63}
64
65pub fn embedding_provider_spec(provider: &str) -> Result<CliEmbeddingProviderSpec, String> {
66 let spec = zotron_types::embedding_provider_spec(provider)?;
67 Ok(CliEmbeddingProviderSpec {
68 id: spec.id,
69 provider: spec.provider_key,
70 request_style: if spec.provider_key == "alibaba" {
71 "dashscope"
72 } else {
73 spec.request_style.as_str()
74 },
75 default_url: spec.default_url.unwrap_or("").to_string(),
76 default_model: spec.default_model,
77 auth: spec.auth,
78 key_field: spec.key_field,
79 })
80}
81
82pub fn chunks_from_blocks(blocks: &[Value], max_chars: usize) -> Result<Vec<Value>, String> {
83 let typed = blocks
84 .iter()
85 .map(json_block_to_pdf_block)
86 .collect::<Result<Vec<_>, _>>()?;
87 let chunks = zotron_types::chunks_from_blocks(&typed, max_chars);
88 chunks
89 .into_iter()
90 .map(|chunk| chunk_to_cli_value(&chunk, &typed))
91 .collect()
92}
93
94fn cli_ocr_provider_spec(spec: zotron_types::OcrProviderSpec) -> CliOcrProviderSpec {
95 CliOcrProviderSpec {
96 id: spec.provider_key,
97 provider: spec.provider_key,
98 request_style: spec.request_style.as_str(),
99 auth: spec.auth,
100 auth_header: spec.auth_header,
101 supports_pdf_direct: spec.supports_pdf_direct,
102 key_field: spec.key_field,
103 }
104}
105
106fn json_block_to_pdf_block(value: &Value) -> Result<zotron_types::PdfEvidenceBlock, String> {
107 let block_key = value
108 .get("block_key")
109 .and_then(Value::as_str)
110 .ok_or_else(|| "block missing block_key".to_string())?
111 .to_string();
112 let item_key = value
113 .get("item_key")
114 .and_then(Value::as_str)
115 .ok_or_else(|| "block missing item_key".to_string())?
116 .to_string();
117 let attachment_key = value
118 .get("attachment_key")
119 .and_then(Value::as_str)
120 .ok_or_else(|| "block missing attachment_key".to_string())?
121 .to_string();
122 let page_idx = value
123 .get("page_idx")
124 .or_else(|| value.get("page"))
125 .and_then(Value::as_u64)
126 .unwrap_or(1);
127 let block_type = value
128 .get("type")
129 .or_else(|| value.get("block_type"))
130 .and_then(Value::as_str)
131 .unwrap_or("paragraph")
132 .to_string();
133 let section_path = value
134 .get("section_path")
135 .and_then(Value::as_array)
136 .map(|items| {
137 items
138 .iter()
139 .filter_map(Value::as_str)
140 .map(ToString::to_string)
141 .collect::<Vec<_>>()
142 })
143 .unwrap_or_default();
144 let text = value
145 .get("text")
146 .and_then(Value::as_str)
147 .unwrap_or("")
148 .to_string();
149 let bbox = value.get("bbox").and_then(value_bbox4);
150
151 Ok(zotron_types::PdfEvidenceBlock {
152 block_key,
153 item_key,
154 attachment_key,
155 page_idx,
156 block_type,
157 bbox,
158 section_path,
159 text,
160 })
161}
162
163fn chunk_to_cli_value(
164 chunk: &zotron_types::StructureChunk,
165 blocks: &[zotron_types::PdfEvidenceBlock],
166) -> Result<Value, String> {
167 let refs = chunk
168 .block_keys
169 .iter()
170 .filter_map(|key| blocks.iter().find(|block| &block.block_key == key))
171 .map(|block| {
172 serde_json::json!({
173 "block_key": block.block_key,
174 "page_idx": block.page_idx,
175 "bbox": block.bbox.map(|bbox| bbox.iter().map(|n| {
176 if n.fract() == 0.0 {
177 Value::from(*n as i64)
178 } else {
179 Value::from(*n)
180 }
181 }).collect::<Vec<_>>()),
182 })
183 })
184 .collect::<Vec<_>>();
185 Ok(serde_json::json!({
186 "chunk_key": chunk.chunk_key,
187 "item_key": chunk.item_key,
188 "attachment_key": chunk.attachment_key,
189 "block_keys": chunk.block_keys,
190 "section_path": chunk.section_path,
191 "text": chunk.text,
192 "page_start": chunk.page_start,
193 "page_end": chunk.page_end,
194 "evidence_refs": refs,
195 }))
196}
197
198fn value_bbox4(value: &Value) -> Option<[f64; 4]> {
199 let arr = value.as_array()?;
200 if arr.len() != 4 {
201 return None;
202 }
203 Some([
204 arr[0].as_f64()?,
205 arr[1].as_f64()?,
206 arr[2].as_f64()?,
207 arr[3].as_f64()?,
208 ])
209}
210
211impl RpcCaller for ZoteroRpc {
212 fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String> {
213 self.call(method, params).map_err(|err| err.to_string())
214 }
215}
216
// Root clap parser for the `zotron` binary; everything is dispatched through
// the `Command` subcommand.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Parser)]
#[command(name = "zotron", about = "Rust client + CLI for the Zotron XPI")]
struct Cli {
    // The selected top-level subcommand.
    #[command(subcommand)]
    command: Command,
}
223
// Subcommands under `zotron ocr`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum OcrCommand {
    // `ocr providers`: prints the built-in OCR provider specs; no arguments.
    Providers,
    // `ocr run`: sends one OCR request to a provider and prints the parsed blocks.
    #[command(name = "run")]
    Run {
        // Provider name looked up in the OCR provider specs.
        #[arg(long)]
        provider: String,
        // JSON request input; exactly one of --input / --file must be given.
        #[arg(long)]
        input: Option<String>,
        // File to build the request input from; exclusive with --input.
        #[arg(long)]
        file: Option<String>,
        // The next three options are only consumed together with --file.
        #[arg(long = "item-key")]
        item_key: Option<String>,
        #[arg(long = "attachment-key")]
        attachment_key: Option<String>,
        #[arg(long = "mime-type")]
        mime_type: Option<String>,
        // Overrides the URL of the provider's built request when set.
        #[arg(long)]
        endpoint: Option<String>,
        // Passed to the HTTP transport's auth setup; presumably names the
        // environment variable holding the API key. TODO confirm.
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    // `ocr status`: reports OCR status for a collection via the RPC endpoint.
    Status {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `ocr process`: runs the OCR pipeline for a parent item's attachment.
    #[command(name = "process")]
    Process {
        #[arg(long, default_value = "mineru")]
        provider: String,
        // Parent item key.
        #[arg(long)]
        parent: String,
        // Attachment key; when omitted the first PDF attachment of the parent
        // is resolved through the RPC client.
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long = "source-url")]
        source_url: Option<String>,
        // --result-dir and --result-zip are mutually exclusive for the mineru
        // providers.
        #[arg(long = "result-dir")]
        result_dir: Option<String>,
        #[arg(long = "result-zip")]
        result_zip: Option<String>,
        #[arg(long = "provider-endpoint")]
        provider_endpoint: Option<String>,
        #[arg(long = "api-key-env", default_value = "ZOTRON_MINERU_API_KEY")]
        api_key_env: String,
        #[arg(long = "poll-interval-seconds", default_value_t = 5)]
        poll_interval_seconds: u64,
        #[arg(long = "timeout-seconds", default_value_t = 900)]
        timeout_seconds: u64,
        // Forwarded as `chunk_chars`; presumably the maximum characters per
        // chunk. TODO confirm against the process handler.
        #[arg(long = "chunk-chars", default_value_t = 1200)]
        chunk_chars: usize,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
298
// Top-level `zotron` subcommands. Variant names map to kebab-case subcommand
// names through clap's derive defaults (e.g. `Ping` -> `ping`).
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum Command {
    // `ping`: takes only the RPC endpoint URL.
    Ping {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `rpc <method> [params_json]`: raw RPC call; params default to `{}`.
    Rpc {
        method: String,
        #[arg(default_value = "{}")]
        params_json: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        paginate: bool,
        #[arg(long, default_value_t = 100)]
        page_size: usize,
    },
    // `push <json_file>`: pushes item data from a JSON file (handler not
    // visible in this chunk).
    Push {
        json_file: String,
        #[arg(long)]
        pdf: Option<String>,
        #[arg(long)]
        collection: Option<String>,
        #[arg(long = "on-duplicate", default_value = "skip")]
        on_duplicate: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long = "dry-run")]
        dry_run: bool,
    },
    // `system ...`: see `SystemCommand`.
    System {
        #[command(subcommand)]
        command: SystemCommand,
    },
    // `search ...`: item search plus saved-search management; see `SearchArgs`.
    Search(SearchArgs),
    // `items ...`: see `ItemsCommand`.
    Items {
        #[command(subcommand)]
        command: ItemsCommand,
    },
    // `collections ...`: see `CollectionsCommand`.
    Collections {
        #[command(subcommand)]
        command: CollectionsCommand,
    },
    // `notes ...`: see `NotesCommand`.
    Notes {
        #[command(subcommand)]
        command: NotesCommand,
    },
    // `settings ...`: see `SettingsCommand`.
    Settings {
        #[command(subcommand)]
        command: SettingsCommand,
    },
    // `tags ...`: see `TagsCommand`.
    Tags {
        #[command(subcommand)]
        command: TagsCommand,
    },
    // `export ...`: see `ExportArgs`.
    Export(ExportArgs),
    // `annotations ...`: see `AnnotationsCommand`.
    Annotations {
        #[command(subcommand)]
        command: AnnotationsCommand,
    },
    // `ocr ...`: see `OcrCommand`.
    Ocr {
        #[command(subcommand)]
        command: OcrCommand,
    },
    // `rag ...`: see `RagCommand`.
    Rag {
        #[command(subcommand)]
        command: RagCommand,
    },
}
387
/// Options bundle for a RAG search.
///
/// The fields mirror the arguments of `RagCommand::Search` except for `url`.
/// The code that fills and consumes this struct is not visible in this chunk.
struct RagSearchOptions {
    /// Search query text.
    query: String,
    /// Optional collection filter.
    collection: Option<String>,
    /// Item keys given via `--key` / `--keys`.
    keys: Vec<String>,
    /// Value of the `--zotero` flag.
    zotero: bool,
    /// Value of `--top-spans-per-item` (CLI default 3).
    top_spans_per_item: u64,
    /// Value of the `--include-fulltext-spans` flag.
    include_fulltext_spans: bool,
    /// Value of `--limit` / `--top-k` (CLI default 50).
    top_k: u64,
    /// Output format; the CLI restricts it to `"json"` or `"jsonl"`.
    output: String,
}
398
// Subcommands under `zotron rag`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum RagCommand {
    // `rag providers`: no arguments.
    #[command(name = "providers")]
    Providers,
    // `rag embed`: run an embedding request against a provider (handler not
    // visible in this chunk).
    #[command(name = "embed")]
    Embed {
        #[arg(long)]
        provider: String,
        #[arg(long)]
        input: String,
        #[arg(long)]
        endpoint: Option<String>,
        #[arg(long)]
        model: Option<String>,
        #[arg(long = "input-type")]
        input_type: Option<String>,
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    // `rag status`: per-collection status via the RPC endpoint.
    Status {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `rag search <query>`: search with optional collection/key scoping.
    #[command(name = "search")]
    Search {
        query: String,
        #[arg(long)]
        collection: Option<String>,
        // Accepted as `--key` or `--keys`.
        #[arg(long = "key", alias = "keys")]
        keys: Vec<String>,
        #[arg(long)]
        zotero: bool,
        #[arg(long = "top-spans-per-item", default_value_t = 3)]
        top_spans_per_item: u64,
        #[arg(long = "include-fulltext-spans")]
        include_fulltext_spans: bool,
        // Accepted as `--limit` or `--top-k`.
        #[arg(long = "limit", alias = "top-k", default_value_t = 50)]
        top_k: u64,
        // Restricted to `json` or `jsonl` by the value parser.
        #[arg(long, default_value = "json", value_parser = ["json", "jsonl"])]
        output: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
455
// Subcommands under `zotron system`. Every variant carries the RPC endpoint
// `url`, which `command_url` reads to construct the client.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum SystemCommand {
    // `system version`
    Version {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `system libraries`
    Libraries {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `system library-stats [--library <id>]`
    #[command(name = "library-stats")]
    LibraryStats {
        #[arg(long)]
        library: Option<i64>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `system schema [--type <item type>]`
    Schema {
        #[arg(long = "type")]
        item_type: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `system current-collection`
    #[command(name = "current-collection")]
    CurrentCollection {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `system methods [method]`: optional positional method name.
    Methods {
        method: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
497
// Arguments of `zotron search`: an optional positional query plus filter
// flags, or a saved-search management subcommand.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, clap::Args)]
struct SearchArgs {
    // Optional positional query text.
    query: Option<String>,
    #[arg(long)]
    fulltext: bool,
    #[arg(long)]
    author: Option<String>,
    #[arg(long)]
    after: Option<String>,
    #[arg(long)]
    before: Option<String>,
    #[arg(long)]
    journal: Option<String>,
    #[arg(long)]
    tag: Option<String>,
    #[arg(long)]
    doi: Option<String>,
    #[arg(long)]
    isbn: Option<String>,
    #[arg(long)]
    issn: Option<String>,
    #[arg(long)]
    collection: Option<String>,
    #[arg(long, default_value_t = 50)]
    limit: u64,
    #[arg(long, default_value_t = 0)]
    offset: u64,
    // Used by `command_url` only when no management subcommand is given.
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
    // When present, `command_url` takes the URL from the subcommand instead.
    #[command(subcommand)]
    management: Option<SearchManagementCommand>,
}
541
// Saved-search management subcommands nested under `zotron search`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum SearchManagementCommand {
    // `search saved-searches`
    #[command(name = "saved-searches")]
    SavedSearches {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `search create-saved <name> --condition ...`; at least one --condition
    // is required.
    #[command(name = "create-saved")]
    CreateSaved {
        name: String,
        #[arg(long = "condition", required = true)]
        condition: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `search delete-saved <search_key>`
    #[command(name = "delete-saved")]
    DeleteSaved {
        search_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
571
// Subcommands under `zotron items`. Every variant carries the RPC endpoint
// `url`; mutating variants also take `--dry-run`. Handlers are not visible in
// this chunk, so the comments below describe only the argument surface.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum ItemsCommand {
    // `items add`: identifier, URL, file, or type/field based creation flags.
    Add {
        #[arg(long)]
        doi: Option<String>,
        #[arg(long)]
        isbn: Option<String>,
        #[arg(long = "from-url")]
        from_url: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long = "type")]
        item_type: Option<String>,
        // Repeatable `--field` values.
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        collection: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items update <key> --field ...`
    Update {
        key: String,
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items delete <key>`
    Delete {
        key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items trash <items>...`: positional list of items.
    Trash {
        items: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items restore <item>`
    Restore {
        item: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items merge-duplicates <keys>...`
    #[command(name = "merge-duplicates")]
    MergeDuplicates {
        keys: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items add-related <key> --target <key>`
    #[command(name = "add-related")]
    AddRelated {
        key: String,
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items remove-related <key> --target <key>`
    #[command(name = "remove-related")]
    RemoveRelated {
        key: String,
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items get <item>`
    Get {
        item: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items list`: paged listing with optional sort and trash flags.
    List {
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long)]
        sort: Option<String>,
        #[arg(long, default_value = "asc")]
        direction: String,
        #[arg(long)]
        trash: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items find-duplicates`
    #[command(name = "find-duplicates")]
    FindDuplicates {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items recent`: `--type` defaults to "added".
    Recent {
        #[arg(long, default_value_t = 20)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long = "type", default_value = "added")]
        recent_type: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items fulltext <key>`
    Fulltext {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items related <key>`
    Related {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items citation-key <key>`
    #[command(name = "citation-key")]
    CitationKey {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items path <key>`
    Path {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items attachments <key>`
    Attachments {
        key: String,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `items find-pdfs --collection <c>`; `--limit` defaults to 0.
    // NOTE(review): 0 presumably means "no limit" - confirm in the handler.
    #[command(name = "find-pdfs")]
    FindPdfs {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value_t = 0)]
        limit: usize,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
747
// Subcommands under `zotron settings`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum SettingsCommand {
    // `settings get <key>`
    Get {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `settings list`, also reachable through the visible alias `get-all`.
    #[command(visible_alias = "get-all")]
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `settings set [pairs]... [--file <path>]`
    // NOTE(review): `pairs` presumably are key=value strings - confirm in the
    // handler, which is not visible in this chunk.
    Set {
        pairs: Vec<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
775
// Subcommands under `zotron tags`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum TagsCommand {
    // `tags list`; `--limit` defaults to 200.
    List {
        #[arg(long, default_value_t = 200)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `tags rename <old> <new>`
    Rename {
        old: String,
        new: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `tags delete <tag>`
    Delete {
        tag: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `tags add <keys>... --tag <t>`; at least one --tag is required.
    Add {
        keys: Vec<String>,
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `tags remove <keys>... --tag <t>`; at least one --tag is required.
    Remove {
        keys: Vec<String>,
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
823
// Arguments of `zotron export`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, clap::Args)]
struct ExportArgs {
    // Positional item keys.
    keys: Vec<String>,
    // Export format name; defaults to "bibtex".
    #[arg(long, default_value = "bibtex")]
    format: String,
    #[arg(long)]
    collection: Option<String>,
    // Citation style URL; defaults to the APA style.
    #[arg(long, default_value = "http://www.zotero.org/styles/apa")]
    style: String,
    #[arg(long)]
    html: bool,
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
}
843
// Subcommands under `zotron annotations`. Handlers are not visible in this
// chunk, so the comments describe only the argument surface.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum AnnotationsCommand {
    // `annotations list <parent>`
    List {
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long)]
        context: Option<u32>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `annotations create <parent>`; `--color` defaults to "#ffd400".
    Create {
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long = "type")]
        annotation_type: Option<String>,
        #[arg(long)]
        position: Option<String>,
        #[arg(long)]
        quote: Option<String>,
        #[arg(long)]
        page: Option<u32>,
        #[arg(long = "sort-index")]
        sort_index: Option<String>,
        #[arg(long)]
        text: Option<String>,
        #[arg(long)]
        comment: Option<String>,
        #[arg(long, default_value = "#ffd400")]
        color: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `annotations create-batch <parent>` (name derived by clap from the
    // variant); batch input optionally comes from `--file`.
    CreateBatch {
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `annotations locate <parent> --quote <text>`; `--quote` is mandatory.
    Locate {
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long)]
        quote: String,
        #[arg(long)]
        page: Option<u32>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `annotations delete <annotation_key>`
    Delete {
        annotation_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
935
// Subcommands under `zotron notes`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum NotesCommand {
    // `notes list --parent <key>`: paged listing.
    List {
        #[arg(long)]
        parent: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `notes get <note_key>`
    Get {
        note_key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `notes create --parent <key> --content <text> [--tag <t>]...`
    Create {
        #[arg(long)]
        parent: String,
        #[arg(long)]
        content: String,
        #[arg(long = "tag")]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `notes update <note_key> --content <text>`
    Update {
        note_key: String,
        #[arg(long)]
        content: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `notes delete <note_key>`
    Delete {
        note_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `notes search <query>`
    Search {
        query: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
995
// Subcommands under `zotron collections`.
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///` doc
// comments become help text and would change the `--help` output.
#[derive(Debug, Subcommand)]
enum CollectionsCommand {
    // `collections list`
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `collections tree`
    Tree {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `collections get <name_or_id>`
    Get {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `collections get-items <name_or_id>`, visible alias `items`.
    #[command(name = "get-items", visible_alias = "items")]
    GetItems {
        name_or_id: String,
        #[arg(long)]
        limit: Option<u64>,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `collections stats <name_or_id>`
    Stats {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `collections rename <old_name> <new_name>`
    Rename {
        old_name: String,
        new_name: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // `collections create <name> [--parent <p>]`
    Create {
        name: String,
        #[arg(long)]
        parent: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // `collections delete <name_or_id>`
    Delete {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // `collections add-items <collection> <item_keys>...`
    #[command(name = "add-items")]
    AddItems {
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // `collections remove-items <collection> <item_keys>...`
    #[command(name = "remove-items")]
    RemoveItems {
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
}
1079
/// Selects how `format_json` renders a value.
///
/// `format_json` itself is not visible in this chunk, so the exact output of
/// each style is not documented here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum JsonStyle {
    /// Used for OCR command results other than the provider listing.
    /// NOTE(review): presumably matches Python's default `json.dumps`
    /// separators - confirm in `format_json`.
    PythonCompact,
    /// Used for the `ocr providers` listing.
    Pretty,
}
1087
/// Result of a successful `parse_cli` call.
enum ParseOutcome<T> {
    /// Arguments parsed into a command that should be executed.
    Command(T),
    /// clap produced help or version text; the rendered text is carried here
    /// and returned to the caller as normal output rather than as an error.
    Display(String),
}
1092
1093fn parse_cli<T>(
1094 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1095) -> Result<ParseOutcome<T>, String>
1096where
1097 T: Parser,
1098{
1099 match T::try_parse_from(args) {
1100 Ok(cli) => Ok(ParseOutcome::Command(cli)),
1101 Err(err)
1102 if matches!(
1103 err.kind(),
1104 ErrorKind::DisplayHelp | ErrorKind::DisplayVersion
1105 ) =>
1106 {
1107 Ok(ParseOutcome::Display(err.to_string()))
1108 }
1109 Err(err) => Err(err.to_string()),
1110 }
1111}
1112
1113pub fn format_error_json(message: &str) -> String {
1114 let message = message.trim_end();
1115 let (code, message) = split_error_code(message).unwrap_or(("RUNTIME_ERROR", message));
1116 serde_json::json!({"error": {"code": code, "message": message}}).to_string()
1117}
1118
/// Splits a leading `CODE:` prefix off an error message.
///
/// The text before the first `:` counts as a code only when it is non-empty and
/// consists solely of ASCII uppercase letters, ASCII digits and `_`. On success
/// returns the code and the remainder with leading whitespace removed;
/// otherwise returns `None`.
fn split_error_code(message: &str) -> Option<(&str, &str)> {
    let colon = message.find(':')?;
    let prefix = &message[..colon];

    let is_code_char = |ch: char| matches!(ch, 'A'..='Z' | '0'..='9' | '_');
    if prefix.is_empty() || !prefix.chars().all(is_code_char) {
        return None;
    }

    // ':' is a single byte, so the remainder starts right after it.
    Some((prefix, message[colon + 1..].trim_start()))
}
1131
1132pub fn run(
1133 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1134) -> Result<String, String> {
1135 if std::env::var("CLAUDECODE").as_deref() == Ok("1") {
1137 eprintln!(r#"<claude-code-hint v="1" type="plugin" value="zotron@dianzuan/zotron" />"#);
1138 }
1139
1140 let cli = match parse_cli::<Cli>(args)? {
1141 ParseOutcome::Command(cli) => cli,
1142 ParseOutcome::Display(output) => return Ok(output),
1143 };
1144 let url = command_url(&cli.command);
1145 let mut client = ZoteroRpc::new(url);
1146 run_command(cli.command, &mut client)
1147}
1148
1149pub fn run_with_client(
1150 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1151 client: &mut impl RpcCaller,
1152) -> Result<String, String> {
1153 let cli = match parse_cli::<Cli>(args)? {
1154 ParseOutcome::Command(cli) => cli,
1155 ParseOutcome::Display(output) => return Ok(output),
1156 };
1157 run_command(cli.command, client)
1158}
1159
1160fn rag_command_url(command: &RagCommand) -> String {
1161 match command {
1162 RagCommand::Providers => DEFAULT_RPC_URL.to_string(),
1163 RagCommand::Embed { .. } => DEFAULT_RPC_URL.to_string(),
1164 RagCommand::Status { url, .. } => url.clone(),
1165 RagCommand::Search { url, .. } => url.clone(),
1166 }
1167}
1168
1169fn command_url(command: &Command) -> String {
1170 match command {
1171 Command::Ping { url }
1172 | Command::Rpc { url, .. }
1173 | Command::Push { url, .. }
1174 => url.clone(),
1175 Command::Ocr { command } => match command {
1176 OcrCommand::Providers => DEFAULT_RPC_URL.to_string(),
1177 OcrCommand::Run { .. } => DEFAULT_RPC_URL.to_string(),
1178 OcrCommand::Status { url, .. } => url.clone(),
1179 OcrCommand::Process { url, .. } => url.clone(),
1180 },
1181 Command::Rag { command } => rag_command_url(command),
1182 Command::System { command } => match command {
1183 SystemCommand::Version { url }
1184 | SystemCommand::Libraries { url }
1185 | SystemCommand::LibraryStats { url, .. }
1186 | SystemCommand::Schema { url, .. }
1187 | SystemCommand::CurrentCollection { url }
1188 | SystemCommand::Methods { url, .. } => url.clone(),
1189 },
1190 Command::Search(ref args) => match &args.management {
1191 Some(SearchManagementCommand::SavedSearches { url })
1192 | Some(SearchManagementCommand::CreateSaved { url, .. })
1193 | Some(SearchManagementCommand::DeleteSaved { url, .. }) => url.clone(),
1194 None => args.url.clone(),
1195 },
1196 Command::Items { command } => match command {
1197 ItemsCommand::Add { url, .. }
1198 | ItemsCommand::Update { url, .. }
1199 | ItemsCommand::Delete { url, .. }
1200 | ItemsCommand::Trash { url, .. }
1201 | ItemsCommand::Restore { url, .. }
1202 | ItemsCommand::MergeDuplicates { url, .. }
1203 | ItemsCommand::AddRelated { url, .. }
1204 | ItemsCommand::RemoveRelated { url, .. }
1205 | ItemsCommand::Get { url, .. }
1206 | ItemsCommand::List { url, .. }
1207 | ItemsCommand::FindDuplicates { url }
1208 | ItemsCommand::Recent { url, .. }
1209 | ItemsCommand::Fulltext { url, .. }
1210 | ItemsCommand::Related { url, .. }
1211 | ItemsCommand::CitationKey { url, .. }
1212 | ItemsCommand::Path { url, .. }
1213 | ItemsCommand::Attachments { url, .. }
1214 | ItemsCommand::FindPdfs { url, .. } => url.clone(),
1215 },
1216 Command::Collections { command } => match command {
1217 CollectionsCommand::List { url }
1218 | CollectionsCommand::Tree { url }
1219 | CollectionsCommand::Get { url, .. }
1220 | CollectionsCommand::GetItems { url, .. }
1221 | CollectionsCommand::Stats { url, .. }
1222 | CollectionsCommand::Rename { url, .. }
1223 | CollectionsCommand::Create { url, .. }
1224 | CollectionsCommand::Delete { url, .. }
1225 | CollectionsCommand::AddItems { url, .. }
1226 | CollectionsCommand::RemoveItems { url, .. } => url.clone(),
1227 },
1228 Command::Notes { command } => match command {
1229 NotesCommand::List { url, .. }
1230 | NotesCommand::Get { url, .. }
1231 | NotesCommand::Create { url, .. }
1232 | NotesCommand::Update { url, .. }
1233 | NotesCommand::Delete { url, .. }
1234 | NotesCommand::Search { url, .. } => url.clone(),
1235 },
1236 Command::Settings { command } => match command {
1237 SettingsCommand::Get { url, .. }
1238 | SettingsCommand::List { url }
1239 | SettingsCommand::Set { url, .. } => url.clone(),
1240 },
1241 Command::Tags { command } => match command {
1242 TagsCommand::List { url, .. }
1243 | TagsCommand::Rename { url, .. }
1244 | TagsCommand::Delete { url, .. }
1245 | TagsCommand::Add { url, .. }
1246 | TagsCommand::Remove { url, .. } => url.clone(),
1247 },
1248 Command::Export(ref args) => args.url.clone(),
1249 Command::Annotations { command } => match command {
1250 AnnotationsCommand::List { url, .. }
1251 | AnnotationsCommand::Create { url, .. }
1252 | AnnotationsCommand::CreateBatch { url, .. }
1253 | AnnotationsCommand::Locate { url, .. }
1254 | AnnotationsCommand::Delete { url, .. } => url.clone(),
1255 },
1256 }
1257}
1258
1259fn run_ocr_command(command: OcrCommand, client: &mut impl RpcCaller) -> Result<String, String> {
1260 if let OcrCommand::Providers = &command {
1261 return format_json(
1262 &serde_json::json!({ "providers": ocr_provider_specs() }),
1263 JsonStyle::Pretty,
1264 );
1265 }
1266 let value = match command {
1267 OcrCommand::Providers => unreachable!(),
1268 OcrCommand::Run {
1269 provider,
1270 input,
1271 file,
1272 item_key,
1273 attachment_key,
1274 mime_type,
1275 endpoint,
1276 api_key_env,
1277 } => run_ocr_run_command(OcrRunOptions {
1278 provider,
1279 input,
1280 file,
1281 item_key,
1282 attachment_key,
1283 mime_type,
1284 endpoint,
1285 api_key_env,
1286 })?,
1287 OcrCommand::Status { collection, .. } => run_ocr_status_command(client, collection)?,
1288 OcrCommand::Process {
1289 provider,
1290 parent,
1291 attachment,
1292 source_url,
1293 result_dir,
1294 result_zip,
1295 provider_endpoint,
1296 api_key_env,
1297 poll_interval_seconds,
1298 timeout_seconds,
1299 chunk_chars,
1300 ..
1301 } => run_ocr_process_command(
1302 client,
1303 OcrProcessOptions {
1304 provider,
1305 parent,
1306 attachment,
1307 source_url,
1308 result_dir,
1309 result_zip,
1310 provider_endpoint,
1311 api_key_env,
1312 poll_interval_seconds,
1313 timeout_seconds,
1314 chunk_chars,
1315 },
1316 )?,
1317 };
1318 format_json(&value, JsonStyle::PythonCompact)
1319}
1320
/// Arguments of `ocr process`, bundled for `run_ocr_process_command`.
///
/// Mirrors `OcrCommand::Process` without the `url` field.
struct OcrProcessOptions {
    /// OCR provider name (CLI default `"mineru"`).
    provider: String,
    /// Parent item key.
    parent: String,
    /// Attachment key; when `None`, the handler resolves the parent's first
    /// PDF attachment and stores the key back into this field.
    attachment: Option<String>,
    /// Value of `--source-url`.
    source_url: Option<String>,
    /// Value of `--result-dir`; rejected together with `result_zip` for the
    /// mineru providers.
    result_dir: Option<String>,
    /// Value of `--result-zip`.
    result_zip: Option<String>,
    /// Value of `--provider-endpoint`.
    provider_endpoint: Option<String>,
    /// Value of `--api-key-env` (CLI default `"ZOTRON_MINERU_API_KEY"`).
    api_key_env: String,
    /// Value of `--poll-interval-seconds` (CLI default 5).
    poll_interval_seconds: u64,
    /// Value of `--timeout-seconds` (CLI default 900).
    timeout_seconds: u64,
    /// Value of `--chunk-chars` (CLI default 1200).
    chunk_chars: usize,
}
1334
/// Arguments of `ocr run`, bundled for `run_ocr_run_command`.
struct OcrRunOptions {
    /// OCR provider name.
    provider: String,
    /// JSON request input source; exactly one of `input` / `file` must be set.
    input: Option<String>,
    /// File to build the request input from.
    file: Option<String>,
    /// Only consumed together with `file`.
    item_key: Option<String>,
    /// Only consumed together with `file`.
    attachment_key: Option<String>,
    /// Only consumed together with `file`.
    mime_type: Option<String>,
    /// Overrides the URL of the provider's built request when set.
    endpoint: Option<String>,
    /// Forwarded to the HTTP transport's auth setup.
    api_key_env: Option<String>,
}
1345
1346fn run_ocr_run_command(options: OcrRunOptions) -> Result<Value, String> {
1347 let input: OcrRequestInput = match (options.input, options.file) {
1348 (Some(input), None) => read_json_input(&input)?,
1349 (None, Some(file)) => ocr_input_from_file(
1350 file,
1351 options.item_key,
1352 options.attachment_key,
1353 options.mime_type,
1354 )?,
1355 (Some(_), Some(_)) => {
1356 return Err("INVALID_ARGS: use either --input or --file, not both".to_string())
1357 }
1358 (None, None) => return Err("INVALID_ARGS: provide --input JSON or --file".to_string()),
1359 };
1360 let request = build_ocr_provider_request(&options.provider, &input)?;
1361 let payload = if request.command.is_empty() {
1362 let method = request
1363 .method
1364 .ok_or_else(|| format!("OCR provider {} missing HTTP method", request.provider))?;
1365 let auth_scheme = raw_ocr_provider_spec(&options.provider)?.auth;
1366 let mut transport =
1367 provider_http_transport_with_auth(options.api_key_env.as_deref(), auth_scheme)?;
1368 transport.post_json(&ProviderHttpInvocation {
1369 provider: request.provider.to_string(),
1370 style: request.style.to_string(),
1371 method: method.to_string(),
1372 url: options
1373 .endpoint
1374 .or_else(|| request.url.map(ToString::to_string)),
1375 auth_header_name: request.auth_header.map(ToString::to_string),
1376 auth_header_value: None,
1377 body: request.body,
1378 })?
1379 } else {
1380 let mut command_runner = StdProviderCommandRunner;
1381 command_runner.run_json(&request.command)?
1382 };
1383 let blocks = match parse_ocr_provider_response(
1384 request.provider,
1385 &payload,
1386 &input.item_key,
1387 &input.attachment_key,
1388 ) {
1389 Ok(blocks) => blocks,
1390 Err(err) => {
1391 if let Some(task) = ocr_async_task_result(request.provider, &payload) {
1392 return Ok(task);
1393 }
1394 return Err(err);
1395 }
1396 };
1397
1398 Ok(serde_json::json!({
1399 "provider": request.provider,
1400 "blocks": blocks,
1401 }))
1402}
1403
1404fn run_ocr_process_command(
1405 client: &mut impl RpcCaller,
1406 mut options: OcrProcessOptions,
1407) -> Result<Value, String> {
1408 let spec = raw_ocr_provider_spec(&options.provider)?;
1409
1410 let attachment = match options.attachment.take() {
1411 Some(key) => key,
1412 None => resolve_first_pdf_attachment_key(client, &options.parent)?,
1413 };
1414 options.attachment = Some(attachment.clone());
1415
1416 let attachment_path = resolve_attachment_path(client, &attachment)?;
1417 let storage_dir = attachment_path
1418 .parent()
1419 .ok_or_else(|| {
1420 format!(
1421 "ATTACHMENT_PATH_INVALID: attachment path has no parent directory: {}",
1422 attachment_path.display()
1423 )
1424 })?
1425 .to_path_buf();
1426
1427 match spec.provider_key {
1428 "mineru" | "mineru-cli" => {
1429 if options.result_dir.is_some() && options.result_zip.is_some() {
1430 return Err("INVALID_ARGS: use either --result-dir or --result-zip, not both".to_string());
1431 }
1432 if options.source_url.is_some()
1433 && (options.result_dir.is_some() || options.result_zip.is_some())
1434 {
1435 return Err(
1436 "INVALID_ARGS: --source-url cannot be combined with --result-dir/--result-zip"
1437 .to_string(),
1438 );
1439 }
1440 let file_name = attachment_path
1441 .file_name()
1442 .and_then(|name| name.to_str())
1443 .unwrap_or("document.pdf")
1444 .to_string();
1445 let source = load_mineru_result_source(&options, &attachment_path, &file_name)?;
1446 let artifacts = persist_mineru_result_sidecars(
1447 &storage_dir, &options.parent, &attachment,
1448 &options.provider, &source, options.chunk_chars,
1449 )?;
1450 Ok(serde_json::json!({
1451 "provider": spec.provider_key,
1452 "status": "indexed",
1453 "item_key": options.parent,
1454 "attachment_key": attachment,
1455 "attachment_path": attachment_path,
1456 "storage_dir": storage_dir,
1457 "task_id": source.task_id,
1458 "state": source.state,
1459 "blocks": artifacts.block_count,
1460 "chunks": artifacts.chunk_count,
1461 "artifacts": artifacts.artifacts,
1462 }))
1463 }
1464 _ => {
1465 run_ocr_process_sync(
1466 client, &options, spec.provider_key,
1467 &attachment, &attachment_path, &storage_dir,
1468 )
1469 }
1470 }
1471}
1472
1473fn run_ocr_process_sync(
1474 client: &mut impl RpcCaller,
1475 options: &OcrProcessOptions,
1476 provider: &str,
1477 attachment_key: &str,
1478 attachment_path: &Path,
1479 storage_dir: &Path,
1480) -> Result<Value, String> {
1481 let api_url = if let Some(endpoint) = &options.provider_endpoint {
1482 endpoint.clone()
1483 } else {
1484 let settings = client.call("settings.getAll", None)?;
1485 settings.get("ocr.apiUrl")
1486 .and_then(Value::as_str)
1487 .unwrap_or("")
1488 .to_string()
1489 };
1490 if api_url.is_empty() {
1491 return Err(format!("MISSING_CONFIG: ocr.apiUrl not configured for provider {provider}"));
1492 }
1493
1494 let api_key = {
1495 let from_env = if !options.api_key_env.is_empty() {
1496 env::var(&options.api_key_env).ok().filter(|v| !v.is_empty())
1497 } else {
1498 None
1499 };
1500 from_env.unwrap_or_else(|| {
1501 client.call("settings.getRaw", Some(serde_json::json!({"key": "ocr.apiKey"})))
1502 .ok()
1503 .and_then(|raw| raw.get("ocr.apiKey").and_then(Value::as_str).map(String::from))
1504 .unwrap_or_default()
1505 })
1506 };
1507
1508 let pdf_bytes = fs::read(attachment_path)
1509 .map_err(|e| format!("READ_PDF_FAILED: {}: {e}", attachment_path.display()))?;
1510
1511 const MAX_PDF_SIZE: usize = 100 * 1024 * 1024; if pdf_bytes.len() > MAX_PDF_SIZE {
1513 return Err(format!(
1514 "PDF_TOO_LARGE: {} is {} MB, max {} MB",
1515 attachment_path.display(),
1516 pdf_bytes.len() / (1024 * 1024),
1517 MAX_PDF_SIZE / (1024 * 1024),
1518 ));
1519 }
1520
1521 let base64_pdf = format!("data:application/pdf;base64,{}", base64_encode(&pdf_bytes));
1522
1523 let input = OcrRequestInput {
1524 content_base64: base64_pdf,
1525 file_name: attachment_path
1526 .file_name()
1527 .and_then(|n| n.to_str())
1528 .unwrap_or("document.pdf")
1529 .to_string(),
1530 mime_type: "application/pdf".to_string(),
1531 item_key: options.parent.clone(),
1532 attachment_key: attachment_key.to_string(),
1533 source_url: None,
1534 local_path: Some(attachment_path.to_string_lossy().to_string()),
1535 output_dir: None,
1536 };
1537 let request = build_ocr_provider_request(provider, &input)?;
1538
1539 let payload = if request.command.is_empty() {
1540 let method = request
1541 .method
1542 .ok_or_else(|| format!("OCR provider {provider} missing HTTP method"))?;
1543 let spec = raw_ocr_provider_spec(provider)?;
1544
1545 let mut transport = if !api_key.is_empty() {
1546 match spec.auth {
1547 "bearer" => UreqProviderHttpTransport::with_bearer_token(&api_key),
1548 "token" => UreqProviderHttpTransport::with_api_key(format!("token {api_key}")),
1549 _ => UreqProviderHttpTransport::new(),
1550 }
1551 } else {
1552 UreqProviderHttpTransport::new()
1553 };
1554
1555 transport.post_json(&ProviderHttpInvocation {
1556 provider: request.provider.to_string(),
1557 style: request.style.to_string(),
1558 method: method.to_string(),
1559 url: Some(api_url),
1560 auth_header_name: request.auth_header.map(ToString::to_string),
1561 auth_header_value: None,
1562 body: request.body,
1563 })?
1564 } else {
1565 let mut runner = StdProviderCommandRunner;
1566 runner.run_json(&request.command)?
1567 };
1568
1569 let blocks = parse_ocr_provider_response(provider, &payload, &options.parent, attachment_key)?;
1570 let chunks = zotron_types::chunks_from_blocks(&blocks, options.chunk_chars);
1571
1572 let artifacts = vec![
1573 write_sidecar_json(
1574 storage_dir, &options.parent, attachment_key,
1575 MachineArtifactKind::OcrRaw, &payload,
1576 )?,
1577 write_sidecar_jsonl(
1578 storage_dir, &options.parent, attachment_key,
1579 MachineArtifactKind::Blocks, &blocks,
1580 )?,
1581 write_sidecar_jsonl(
1582 storage_dir, &options.parent, attachment_key,
1583 MachineArtifactKind::Chunks, &chunks,
1584 )?,
1585 ];
1586
1587 let embedding_count = embed_sidecar_chunks(client, storage_dir, &options.parent, attachment_key, &chunks);
1588
1589 Ok(serde_json::json!({
1590 "provider": provider,
1591 "status": "indexed",
1592 "item_key": options.parent,
1593 "attachment_key": attachment_key,
1594 "embeddings": embedding_count,
1595 "attachment_path": attachment_path,
1596 "storage_dir": storage_dir,
1597 "blocks": blocks.len(),
1598 "chunks": chunks.len(),
1599 "artifacts": artifacts,
1600 }))
1601}
1602
1603fn embed_sidecar_chunks(
1604 client: &mut impl RpcCaller,
1605 storage_dir: &Path,
1606 item_key: &str,
1607 _attachment_key: &str,
1608 chunks: &[zotron_types::StructureChunk],
1609) -> usize {
1610 let Ok((provider, model, api_url, api_key)) = fetch_embedding_settings(client) else {
1611 return 0;
1612 };
1613 if provider.is_empty() || (api_key.is_empty() && provider != "ollama") {
1614 return 0;
1615 }
1616 let emb_chunks: Vec<EmbeddingChunkInput> = chunks
1617 .iter()
1618 .map(|c| EmbeddingChunkInput {
1619 chunk_key: c.chunk_key.clone(),
1620 text: c.text.clone(),
1621 })
1622 .collect();
1623 if emb_chunks.is_empty() {
1624 return 0;
1625 }
1626 let batch_size = 20;
1628 let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
1629 for batch in emb_chunks.chunks(batch_size) {
1630 let input = EmbeddingRequestInput {
1631 item_key: item_key.to_string(),
1632 chunks: batch.to_vec(),
1633 model: if model.is_empty() { None } else { Some(model.clone()) },
1634 url: if api_url.is_empty() { None } else { Some(api_url.clone()) },
1635 input_type: Some("document".to_string()),
1636 };
1637 let Ok(request) = build_embedding_provider_request(&provider, &input) else {
1638 break;
1639 };
1640 let Some(url) = request.url.as_deref() else { break };
1641 let mut http = ureq::post(url).set("Content-Type", "application/json");
1642 if let Some(auth) = request.auth_header {
1643 if !api_key.is_empty() {
1644 http = http.set(auth, &format!("Bearer {api_key}"));
1645 }
1646 }
1647 let Ok(resp) = http.send_json(&request.body) else { break };
1648 let Ok(payload): Result<Value, _> = resp.into_json() else { break };
1649 let Ok(vectors) = parse_embedding_provider_response(&provider, &payload, item_key, batch)
1650 else {
1651 break;
1652 };
1653 all_vectors.extend(vectors);
1654 }
1655 let count = all_vectors.len();
1656 if count > 0 {
1657 let filename = embedding_vector_filename(&provider, &model);
1658 let vectors_dir = storage_dir.join(".zotron").join("embeddings");
1659 fs::create_dir_all(&vectors_dir).map_err(|e| {
1660 eprintln!("warning: cannot create embeddings dir {}: {e}", vectors_dir.display());
1661 e
1662 }).ok();
1663 let vectors_path = vectors_dir.join(&filename);
1664 let mut out = String::new();
1665 for v in &all_vectors {
1666 if let Ok(line) = serde_json::to_string(v) {
1667 out.push_str(&line);
1668 out.push('\n');
1669 }
1670 }
1671 if let Err(e) = fs::write(&vectors_path, &out) {
1672 eprintln!("warning: failed to persist embeddings to {}: {e}", vectors_path.display());
1673 }
1674 }
1675 count
1676}
1677
1678struct MineruResultSource {
1679 task_id: Option<String>,
1680 state: String,
1681 result_dir: PathBuf,
1682 raw_zip_bytes: Option<Vec<u8>>,
1683 task_status: Option<Value>,
1684 payload: Value,
1685 content_list_file: Option<PathBuf>,
1686 markdown: Option<String>,
1687}
1688
1689struct PersistedOcrArtifacts {
1690 block_count: usize,
1691 chunk_count: usize,
1692 artifacts: Vec<Value>,
1693}
1694
1695fn resolve_attachment_path(
1696 client: &mut impl RpcCaller,
1697 attachment_key: &str,
1698) -> Result<PathBuf, String> {
1699 let payload = client.call(
1700 "attachments.getPath",
1701 Some(serde_json::json!({"key": attachment_key})),
1702 )?;
1703 let raw_path = payload
1704 .get("path")
1705 .and_then(Value::as_str)
1706 .filter(|path| !path.trim().is_empty())
1707 .ok_or_else(|| {
1708 format!("ATTACHMENT_PATH_NOT_FOUND: attachment {attachment_key} has no local PDF path")
1709 })?;
1710 Ok(PathBuf::from(local_path_from_zotero_path(raw_path)))
1711}
1712
1713fn resolve_first_pdf_attachment_key(
1715 client: &mut impl RpcCaller,
1716 parent_key: &str,
1717) -> Result<String, String> {
1718 let response = client.call(
1719 "attachments.list",
1720 Some(serde_json::json!({"parentKey": parent_key})),
1721 )?;
1722 let attachments = response
1724 .get("items")
1725 .and_then(Value::as_array)
1726 .or_else(|| response.as_array())
1727 .ok_or_else(|| {
1728 format!("NO_PDF_ATTACHMENT: no attachments found for item {parent_key}")
1729 })?;
1730 for attachment in attachments {
1731 if is_pdf_attachment(attachment) {
1732 if let Some(key) = attachment.get("key").and_then(Value::as_str) {
1733 return Ok(key.to_string());
1734 }
1735 }
1736 }
1737 Err(format!(
1738 "NO_PDF_ATTACHMENT: no PDF attachment found for item {parent_key}"
1739 ))
1740}
1741
1742fn load_mineru_result_source(
1743 options: &OcrProcessOptions,
1744 attachment_path: &Path,
1745 file_name: &str,
1746) -> Result<MineruResultSource, String> {
1747 if let Some(result_dir) = options.result_dir.as_deref() {
1748 return mineru_result_source_from_dir(PathBuf::from(result_dir), None, None, None);
1749 }
1750 if let Some(result_zip) = options.result_zip.as_deref() {
1751 let zip_path = PathBuf::from(result_zip);
1752 let zip_bytes = fs::read(&zip_path)
1753 .map_err(|err| format!("read MinerU result zip {}: {err}", zip_path.display()))?;
1754 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1755 return mineru_result_source_from_dir(result_dir, Some(zip_bytes), None, None);
1756 }
1757
1758 let Some(source_url) = options
1759 .source_url
1760 .as_deref()
1761 .filter(|value| !value.trim().is_empty())
1762 else {
1763 return submit_mineru_local_file(options, attachment_path, file_name);
1764 };
1765 let input = OcrRequestInput {
1766 item_key: options.parent.clone(),
1767 attachment_key: options.attachment.clone().expect("attachment resolved"),
1768 file_name: file_name.to_string(),
1769 mime_type: "application/pdf".to_string(),
1770 content_base64: format!("url:{source_url}"),
1771 source_url: Some(source_url.to_string()),
1772 local_path: None,
1773 output_dir: None,
1774 };
1775 let task = submit_mineru_task(
1776 &options.provider,
1777 &input,
1778 options.provider_endpoint.clone(),
1779 &options.api_key_env,
1780 )?;
1781 let task_id = task
1782 .get("data")
1783 .and_then(|data| data.get("task_id"))
1784 .and_then(Value::as_str)
1785 .ok_or_else(|| "MinerU submit response missing data.task_id".to_string())?
1786 .to_string();
1787 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1788 let status = poll_mineru_task(
1789 options.provider_endpoint.as_deref(),
1790 &task_id,
1791 &auth_header,
1792 options.poll_interval_seconds,
1793 options.timeout_seconds,
1794 )?;
1795 let zip_url = status
1796 .pointer("/data/full_zip_url")
1797 .or_else(|| status.pointer("/data/result/full_zip_url"))
1798 .and_then(Value::as_str)
1799 .ok_or_else(|| "MinerU completed task missing data.full_zip_url".to_string())?;
1800 let zip_bytes = download_bytes(zip_url)?;
1801 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1802 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(task_id))
1803}
1804
1805fn submit_mineru_local_file(
1806 options: &OcrProcessOptions,
1807 attachment_path: &Path,
1808 file_name: &str,
1809) -> Result<MineruResultSource, String> {
1810 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1811 let upload_request = create_mineru_file_upload(
1812 options.provider_endpoint.as_deref(),
1813 file_name,
1814 options.attachment.as_deref().expect("attachment resolved"),
1815 &auth_header,
1816 )?;
1817 let upload_url = upload_request
1818 .pointer("/data/file_urls/0")
1819 .or_else(|| upload_request.pointer("/data/fileUrls/0"))
1820 .and_then(Value::as_str)
1821 .ok_or_else(|| "MinerU upload URL response missing data.file_urls[0]".to_string())?;
1822 let batch_id = upload_request
1823 .pointer("/data/batch_id")
1824 .or_else(|| upload_request.pointer("/data/batchId"))
1825 .and_then(Value::as_str)
1826 .ok_or_else(|| "MinerU upload URL response missing data.batch_id".to_string())?
1827 .to_string();
1828 let bytes = fs::read(attachment_path)
1829 .map_err(|err| format!("read attachment PDF {}: {err}", attachment_path.display()))?;
1830 put_bytes(upload_url, &bytes)?;
1831 let status = poll_mineru_batch(
1832 options.provider_endpoint.as_deref(),
1833 &batch_id,
1834 &auth_header,
1835 options.poll_interval_seconds,
1836 options.timeout_seconds,
1837 )?;
1838 let zip_url = mineru_batch_zip_url(&status)
1839 .ok_or_else(|| "MinerU completed batch missing full_zip_url".to_string())?;
1840 let zip_bytes = download_bytes(&zip_url)?;
1841 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1842 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(batch_id))
1843}
1844
1845fn create_mineru_file_upload(
1846 endpoint: Option<&str>,
1847 file_name: &str,
1848 data_id: &str,
1849 auth_header: &str,
1850) -> Result<Value, String> {
1851 let url = mineru_file_urls_url(endpoint);
1852 let body = serde_json::json!({
1853 "files": [{"name": file_name, "data_id": data_id}],
1854 "model_version": "vlm",
1855 "is_ocr": false,
1856 "enable_formula": true,
1857 "enable_table": true,
1858 "language": "ch",
1859 "page_ranges": "1-200",
1860 });
1861 ureq::post(&url)
1862 .set("Authorization", auth_header)
1863 .send_json(body)
1864 .map_err(|err| format!("POST {url} failed: {err}"))?
1865 .into_json::<Value>()
1866 .map_err(|err| format!("POST {url} returned invalid JSON: {err}"))
1867}
1868
1869fn put_bytes(url: &str, bytes: &[u8]) -> Result<(), String> {
1870 ureq::put(url)
1871 .send_bytes(bytes)
1872 .map_err(|err| format!("PUT {url} failed: {err}"))?;
1873 Ok(())
1874}
1875
1876fn submit_mineru_task(
1877 provider: &str,
1878 input: &OcrRequestInput,
1879 endpoint: Option<String>,
1880 api_key_env: &str,
1881) -> Result<Value, String> {
1882 let request = build_ocr_provider_request(provider, input)?;
1883 let method = request
1884 .method
1885 .ok_or_else(|| "MinerU provider missing HTTP method".to_string())?;
1886 let mut transport = provider_http_transport_with_auth(Some(api_key_env), "bearer")?;
1887 transport.post_json(&ProviderHttpInvocation {
1888 provider: request.provider.to_string(),
1889 style: request.style.to_string(),
1890 method: method.to_string(),
1891 url: endpoint.or_else(|| request.url.map(ToString::to_string)),
1892 auth_header_name: request.auth_header.map(ToString::to_string),
1893 auth_header_value: None,
1894 body: request.body,
1895 })
1896}
1897
1898fn poll_mineru_task(
1899 endpoint: Option<&str>,
1900 task_id: &str,
1901 auth_header: &str,
1902 poll_interval_seconds: u64,
1903 timeout_seconds: u64,
1904) -> Result<Value, String> {
1905 let url = mineru_task_status_url(endpoint, task_id);
1906 let started = Instant::now();
1907 loop {
1908 let status = get_json_with_auth(&url, auth_header)?;
1909 let state = status
1910 .pointer("/data/state")
1911 .or_else(|| status.pointer("/data/status"))
1912 .and_then(Value::as_str)
1913 .unwrap_or("unknown");
1914 match state {
1915 "done" | "finished" | "success" => return Ok(status),
1916 "failed" | "error" => return Err(format!("MinerU task {task_id} failed: {status}")),
1917 _ => {
1918 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1919 return Err(format!(
1920 "MinerU task {task_id} timed out after {timeout_seconds}s with state {state}"
1921 ));
1922 }
1923 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
1924 }
1925 }
1926 }
1927}
1928
/// Builds the status URL for a MinerU task.
///
/// Trailing slashes of the endpoint are ignored. An endpoint that already ends
/// in `/extract/task` only gets the task id appended; any other endpoint gets
/// `/extract/task/<task_id>` appended. Without an endpoint the public MinerU
/// v4 task URL is used.
fn mineru_task_status_url(endpoint: Option<&str>, task_id: &str) -> String {
    let root = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    match root.strip_suffix("/extract/task") {
        Some(_) => format!("{root}/{task_id}"),
        None => format!("{root}/extract/task/{task_id}"),
    }
}
1939
1940fn mineru_file_urls_url(endpoint: Option<&str>) -> String {
1941 let base = mineru_api_base(endpoint);
1942 format!("{base}/file-urls/batch")
1943}
1944
1945fn mineru_batch_status_url(endpoint: Option<&str>, batch_id: &str) -> String {
1946 let base = mineru_api_base(endpoint);
1947 format!("{base}/extract-results/batch/{batch_id}")
1948}
1949
/// Derives the MinerU API base URL from an endpoint.
///
/// Trailing slashes are dropped first, then a trailing `/extract/task` or,
/// failing that, a trailing `/extract` is removed. An endpoint with neither
/// suffix is returned as is (minus trailing slashes). Without an endpoint the
/// public MinerU v4 task URL is used as the starting point.
fn mineru_api_base(endpoint: Option<&str>) -> String {
    let trimmed = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    // The longer suffix is tried first.
    ["/extract/task", "/extract"]
        .iter()
        .find_map(|&suffix| trimmed.strip_suffix(suffix))
        .unwrap_or(trimmed)
        .to_string()
}
1962
1963fn poll_mineru_batch(
1964 endpoint: Option<&str>,
1965 batch_id: &str,
1966 auth_header: &str,
1967 poll_interval_seconds: u64,
1968 timeout_seconds: u64,
1969) -> Result<Value, String> {
1970 let url = mineru_batch_status_url(endpoint, batch_id);
1971 let started = Instant::now();
1972 loop {
1973 let status = get_json_with_auth(&url, auth_header)?;
1974 let state = mineru_batch_state(&status).unwrap_or("unknown");
1975 match state {
1976 "done" | "finished" | "success" => return Ok(status),
1977 "failed" | "error" => return Err(format!("MinerU batch {batch_id} failed: {status}")),
1978 _ => {
1979 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1980 return Err(format!(
1981 "MinerU batch {batch_id} timed out after {timeout_seconds}s with state {state}"
1982 ));
1983 }
1984 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
1985 }
1986 }
1987 }
1988}
1989
1990fn mineru_batch_state(status: &Value) -> Option<&str> {
1991 status
1992 .pointer("/data/extract_result/0/state")
1993 .or_else(|| status.pointer("/data/extractResult/0/state"))
1994 .or_else(|| status.pointer("/data/state"))
1995 .and_then(Value::as_str)
1996}
1997
1998fn mineru_batch_zip_url(status: &Value) -> Option<String> {
1999 status
2000 .pointer("/data/extract_result/0/full_zip_url")
2001 .or_else(|| status.pointer("/data/extractResult/0/full_zip_url"))
2002 .or_else(|| status.pointer("/data/full_zip_url"))
2003 .and_then(Value::as_str)
2004 .map(ToString::to_string)
2005}
2006
/// Reads a credential from the environment variable `api_key_env` and turns it
/// into an authorization header value.
///
/// The value is trimmed. For the `bearer` scheme it is prefixed with
/// `Bearer ` and for the `token` scheme with `token `, unless it already
/// starts with that prefix. Any other scheme returns the trimmed value as is.
///
/// # Errors
///
/// Returns an error when the variable is not set (or not valid Unicode) or
/// when it is blank after trimming.
fn provider_auth_header_value(api_key_env: &str, auth_scheme: &str) -> Result<String, String> {
    let raw = match env::var(api_key_env) {
        Ok(value) => value,
        Err(_) => return Err(format!("missing provider credential env var {api_key_env}")),
    };
    let credential = raw.trim();
    if credential.is_empty() {
        return Err(format!(
            "provider credential env var {api_key_env} is empty"
        ));
    }

    let scheme_prefix = match auth_scheme {
        "bearer" => Some("Bearer "),
        "token" => Some("token "),
        _ => None,
    };
    let header_value = match scheme_prefix {
        Some(prefix) if !credential.starts_with(prefix) => format!("{prefix}{credential}"),
        _ => credential.to_string(),
    };
    Ok(header_value)
}
2024
2025fn get_json_with_auth(url: &str, auth_header: &str) -> Result<Value, String> {
2026 ureq::get(url)
2027 .set("Authorization", auth_header)
2028 .call()
2029 .map_err(|err| format!("GET {url} failed: {err}"))?
2030 .into_json::<Value>()
2031 .map_err(|err| format!("GET {url} returned invalid JSON: {err}"))
2032}
2033
2034fn download_bytes(url: &str) -> Result<Vec<u8>, String> {
2035 let response = ureq::get(url)
2036 .call()
2037 .map_err(|err| format!("download {url} failed: {err}"))?;
2038 let mut bytes = Vec::new();
2039 response
2040 .into_reader()
2041 .read_to_end(&mut bytes)
2042 .map_err(|err| format!("read download {url}: {err}"))?;
2043 Ok(bytes)
2044}
2045
2046fn extract_zip_bytes_to_temp(prefix: &str, zip_bytes: &[u8]) -> Result<PathBuf, String> {
2047 let dir = unique_temp_path(prefix);
2048 fs::create_dir_all(&dir).map_err(|err| format!("create temp dir {}: {err}", dir.display()))?;
2049 let zip_path = dir.with_extension("zip");
2050 fs::write(&zip_path, zip_bytes)
2051 .map_err(|err| format!("write temp zip {}: {err}", zip_path.display()))?;
2052 let output = ProcessCommand::new("unzip")
2053 .arg("-q")
2054 .arg("-o")
2055 .arg(&zip_path)
2056 .arg("-d")
2057 .arg(&dir)
2058 .output()
2059 .map_err(|err| format!("run unzip: {err}"))?;
2060 if !output.status.success() {
2061 return Err(format!(
2062 "unzip {} failed: {}",
2063 zip_path.display(),
2064 String::from_utf8_lossy(&output.stderr).trim()
2065 ));
2066 }
2067 Ok(dir)
2068}
2069
/// Returns a path inside the system temp directory that has not been handed
/// out before by this process.
///
/// The file name is `<prefix>-<pid>-<nanos>-<seq>`, where `nanos` is the time
/// since the Unix epoch (0 if the clock is before the epoch) and `seq` is a
/// process-wide counter. The counter keeps two calls within the same clock
/// tick from producing the same path. Nothing is created on disk.
fn unique_temp_path(prefix: &str) -> PathBuf {
    // Relaxed ordering suffices: only the uniqueness of the value matters.
    static SEQUENCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|duration| duration.as_nanos())
        .unwrap_or(0);
    let seq = SEQUENCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    env::temp_dir().join(format!("{prefix}-{}-{nanos}-{seq}", std::process::id()))
}
2077
2078fn mineru_result_source_from_dir(
2079 result_dir: PathBuf,
2080 raw_zip_bytes: Option<Vec<u8>>,
2081 task_status: Option<Value>,
2082 task_id: Option<String>,
2083) -> Result<MineruResultSource, String> {
2084 let (payload, content_list_file) = mineru_payload_from_result_dir(&result_dir)?;
2085 let markdown = find_first_file_by_name(&result_dir, "full.md")
2086 .map(|path| {
2087 fs::read_to_string(&path)
2088 .map_err(|err| format!("read native markdown {}: {err}", path.display()))
2089 })
2090 .transpose()?;
2091 Ok(MineruResultSource {
2092 task_id,
2093 state: "done".to_string(),
2094 result_dir,
2095 raw_zip_bytes,
2096 task_status,
2097 payload,
2098 content_list_file,
2099 markdown,
2100 })
2101}
2102
2103fn mineru_payload_from_result_dir(result_dir: &Path) -> Result<(Value, Option<PathBuf>), String> {
2104 let v2 = find_first_file_with_suffix(result_dir, "_content_list_v2.json");
2105 if let Some(path) = v2 {
2106 let value = read_json_file(&path)?;
2107 return Ok((serde_json::json!({"content_list_v2": value}), Some(path)));
2108 }
2109 let content_list = find_first_file_with_suffix(result_dir, "_content_list.json");
2110 if let Some(path) = content_list {
2111 let value = read_json_file(&path)?;
2112 return Ok((serde_json::json!({"content_list": value}), Some(path)));
2113 }
2114 let layout = find_first_file_by_name(result_dir, "layout.json");
2115 if let Some(path) = layout {
2116 return Ok((read_json_file(&path)?, Some(path)));
2117 }
2118 let markdown = find_first_file_by_name(result_dir, "full.md");
2119 if let Some(path) = markdown {
2120 let text = fs::read_to_string(&path)
2121 .map_err(|err| format!("read native markdown {}: {err}", path.display()))?;
2122 return Ok((serde_json::json!({"result": text}), Some(path)));
2123 }
2124 Err(format!(
2125 "MinerU result directory {} missing content_list_v2/content_list/layout/full.md",
2126 result_dir.display()
2127 ))
2128}
2129
2130fn read_json_file(path: &Path) -> Result<Value, String> {
2131 let raw = fs::read_to_string(path).map_err(|err| format!("read {}: {err}", path.display()))?;
2132 serde_json::from_str(&raw).map_err(|err| format!("parse JSON {}: {err}", path.display()))
2133}
2134
2135fn persist_mineru_result_sidecars(
2136 storage_dir: &Path,
2137 item_key: &str,
2138 attachment_key: &str,
2139 provider: &str,
2140 source: &MineruResultSource,
2141 chunk_chars: usize,
2142) -> Result<PersistedOcrArtifacts, String> {
2143 let blocks = parse_ocr_provider_response(provider, &source.payload, item_key, attachment_key)?;
2144 let chunks = zotron_types::chunks_from_blocks(&blocks, chunk_chars);
2145 let assets = copy_mineru_assets(&source.result_dir, storage_dir)?;
2146 let raw_bundle = serde_json::json!({
2147 "provider": provider,
2148 "item_key": item_key,
2149 "attachment_key": attachment_key,
2150 "task_id": source.task_id,
2151 "state": source.state,
2152 "task_status": source.task_status,
2153 "content_list_file": source.content_list_file,
2154 "payload": source.payload,
2155 });
2156
2157 let mut artifacts = Vec::new();
2158 artifacts.push(write_sidecar_json(
2159 storage_dir,
2160 item_key,
2161 attachment_key,
2162 MachineArtifactKind::OcrRaw,
2163 &raw_bundle,
2164 )?);
2165 artifacts.push(write_sidecar_jsonl(
2166 storage_dir,
2167 item_key,
2168 attachment_key,
2169 MachineArtifactKind::Blocks,
2170 &blocks,
2171 )?);
2172 artifacts.push(write_sidecar_jsonl(
2173 storage_dir,
2174 item_key,
2175 attachment_key,
2176 MachineArtifactKind::Chunks,
2177 &chunks,
2178 )?);
2179 if let Some(markdown) = source.markdown.as_deref() {
2180 artifacts.push(write_sidecar_bytes(
2181 storage_dir,
2182 item_key,
2183 attachment_key,
2184 MachineArtifactKind::OcrNativeMarkdown,
2185 markdown.as_bytes(),
2186 )?);
2187 }
2188 artifacts.push(write_sidecar_json(
2189 storage_dir,
2190 item_key,
2191 attachment_key,
2192 MachineArtifactKind::OcrNativeAssets,
2193 &assets,
2194 )?);
2195 if let Some(bytes) = source.raw_zip_bytes.as_deref() {
2196 artifacts.push(write_extra_sidecar_bytes(
2197 storage_dir,
2198 ".zotron/ocr/latest.raw.zip",
2199 bytes,
2200 )?);
2201 }
2202
2203 Ok(PersistedOcrArtifacts {
2204 block_count: blocks.len(),
2205 chunk_count: chunks.len(),
2206 artifacts,
2207 })
2208}
2209
2210fn write_sidecar_json(
2211 storage_dir: &Path,
2212 item_key: &str,
2213 attachment_key: &str,
2214 kind: MachineArtifactKind,
2215 value: &Value,
2216) -> Result<Value, String> {
2217 let bytes = serde_json::to_vec_pretty(value).map_err(|err| err.to_string())?;
2218 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, &bytes)
2219}
2220
2221fn write_sidecar_jsonl<T: serde::Serialize>(
2222 storage_dir: &Path,
2223 item_key: &str,
2224 attachment_key: &str,
2225 kind: MachineArtifactKind,
2226 values: &[T],
2227) -> Result<Value, String> {
2228 let mut out = String::new();
2229 for value in values {
2230 out.push_str(&serde_json::to_string(value).map_err(|err| err.to_string())?);
2231 out.push('\n');
2232 }
2233 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, out.as_bytes())
2234}
2235
2236fn write_sidecar_bytes(
2237 storage_dir: &Path,
2238 item_key: &str,
2239 attachment_key: &str,
2240 kind: MachineArtifactKind,
2241 bytes: &[u8],
2242) -> Result<Value, String> {
2243 let record = write_machine_artifact_sidecar(storage_dir, item_key, attachment_key, kind, bytes)
2244 .map_err(|err| format!("write sidecar {:?}: {err}", kind))?;
2245 Ok(serde_json::json!({
2246 "kind": kind,
2247 "relative_path": record.relative_path,
2248 "absolute_path": record.absolute_path,
2249 }))
2250}
2251
2252fn write_extra_sidecar_bytes(
2253 storage_dir: &Path,
2254 relative_path: &str,
2255 bytes: &[u8],
2256) -> Result<Value, String> {
2257 let absolute_path = storage_dir.join(relative_path);
2258 if let Some(parent) = absolute_path.parent() {
2259 fs::create_dir_all(parent).map_err(|err| format!("create {}: {err}", parent.display()))?;
2260 }
2261 fs::write(&absolute_path, bytes)
2262 .map_err(|err| format!("write sidecar {}: {err}", absolute_path.display()))?;
2263 Ok(serde_json::json!({
2264 "kind": "ocr_raw_zip",
2265 "relative_path": relative_path,
2266 "absolute_path": absolute_path,
2267 }))
2268}
2269
2270fn copy_mineru_assets(result_dir: &Path, storage_dir: &Path) -> Result<Value, String> {
2271 let mut images = Vec::new();
2272 for file in collect_files(result_dir)? {
2273 if !is_image_file(&file) {
2274 continue;
2275 }
2276 let relative = file.strip_prefix(result_dir).unwrap_or(&file).to_path_buf();
2277 let destination = storage_dir.join(".zotron").join("ocr").join(&relative);
2278 if let Some(parent) = destination.parent() {
2279 fs::create_dir_all(parent)
2280 .map_err(|err| format!("create {}: {err}", parent.display()))?;
2281 }
2282 fs::copy(&file, &destination).map_err(|err| {
2283 format!(
2284 "copy MinerU asset {} to {}: {err}",
2285 file.display(),
2286 destination.display()
2287 )
2288 })?;
2289 images.push(serde_json::json!({
2290 "source_relative": relative,
2291 "sidecar_relative": PathBuf::from(".zotron").join("ocr").join(&relative),
2292 "absolute_path": destination,
2293 }));
2294 }
2295 Ok(serde_json::json!({
2296 "provider": "mineru",
2297 "images": images,
2298 }))
2299}
2300
/// Tells whether the extension of `path` is one of `png`, `jpg`, `jpeg`,
/// `webp` or `gif`, compared without regard to ASCII case.
///
/// Paths without an extension, or with a non-UTF-8 one, are not images.
fn is_image_file(path: &Path) -> bool {
    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
        return false;
    };
    ["png", "jpg", "jpeg", "webp", "gif"]
        .iter()
        .any(|known| extension.eq_ignore_ascii_case(known))
}
2311
2312fn find_first_file_with_suffix(root: &Path, suffix: &str) -> Option<PathBuf> {
2313 collect_files(root).ok()?.into_iter().find(|path| {
2314 path.file_name()
2315 .and_then(|name| name.to_str())
2316 .is_some_and(|name| name.ends_with(suffix))
2317 })
2318}
2319
2320fn find_first_file_by_name(root: &Path, name: &str) -> Option<PathBuf> {
2321 collect_files(root).ok()?.into_iter().find(|path| {
2322 path.file_name()
2323 .and_then(|file_name| file_name.to_str())
2324 .is_some_and(|file_name| file_name == name)
2325 })
2326}
2327
2328fn collect_files(root: &Path) -> Result<Vec<PathBuf>, String> {
2329 let mut files = Vec::new();
2330 collect_files_into(root, &mut files)?;
2331 files.sort();
2332 Ok(files)
2333}
2334
/// Appends every regular file below `root` to `files`, descending into
/// sub-directories depth-first.
///
/// Entries that are neither a directory nor a regular file are ignored. The
/// walk stops at the first directory or entry that cannot be read.
fn collect_files_into(root: &Path, files: &mut Vec<PathBuf>) -> Result<(), String> {
    let entries = match fs::read_dir(root) {
        Ok(entries) => entries,
        Err(err) => return Err(format!("read dir {}: {err}", root.display())),
    };
    for entry in entries {
        let entry = match entry {
            Ok(entry) => entry,
            Err(err) => return Err(format!("read dir entry {}: {err}", root.display())),
        };
        let path = entry.path();
        let kind = match entry.file_type() {
            Ok(kind) => kind,
            Err(err) => return Err(format!("stat {}: {err}", path.display())),
        };
        match (kind.is_dir(), kind.is_file()) {
            (true, _) => collect_files_into(&path, files)?,
            (false, true) => files.push(path),
            (false, false) => {}
        }
    }
    Ok(())
}
2350
2351fn ocr_async_task_result(provider: &str, payload: &Value) -> Option<Value> {
2352 let data = payload.get("data")?;
2353 let task_id = data.get("task_id").and_then(Value::as_str)?;
2354 Some(serde_json::json!({
2355 "provider": provider,
2356 "status": "submitted",
2357 "task_id": task_id,
2358 "state": data.get("state").and_then(Value::as_str).unwrap_or("submitted"),
2359 "result_url": data.get("full_zip_url").or_else(|| data.get("markdown_url")).cloned(),
2360 "raw": payload,
2361 }))
2362}
2363
2364fn ocr_input_from_file(
2365 file: String,
2366 item_key: Option<String>,
2367 attachment_key: Option<String>,
2368 mime_type: Option<String>,
2369) -> Result<OcrRequestInput, String> {
2370 let item_key = item_key
2371 .ok_or_else(|| "INVALID_ARGS: --item-key is required when using --file".to_string())?;
2372 let attachment_key = attachment_key.ok_or_else(|| {
2373 "INVALID_ARGS: --attachment-key is required when using --file".to_string()
2374 })?;
2375 let path = PathBuf::from(&file);
2376 let bytes = fs::read(&path).map_err(|err| format!("read {file}: {err}"))?;
2377 let file_name = path
2378 .file_name()
2379 .and_then(|name| name.to_str())
2380 .unwrap_or("document.pdf")
2381 .to_string();
2382 let mime_type = mime_type.unwrap_or_else(|| guess_mime_type(&path).to_string());
2383 Ok(OcrRequestInput {
2384 item_key,
2385 attachment_key,
2386 file_name,
2387 mime_type,
2388 content_base64: base64_encode(&bytes),
2389 source_url: None,
2390 local_path: Some(file),
2391 output_dir: None,
2392 })
2393}
2394
/// Guesses a MIME type from the extension of `path` (ASCII case-insensitive).
///
/// Recognised image extensions map to their `image/*` type; everything else,
/// including paths without an extension, falls back to `application/pdf`.
///
/// `gif` is mapped to `image/gif` so that this function agrees with
/// `is_image_file`, which already treats `.gif` as an image; previously a GIF
/// was reported as `application/pdf`.
fn guess_mime_type(path: &Path) -> &'static str {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();
    match extension.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "webp" => "image/webp",
        "gif" => "image/gif",
        _ => "application/pdf",
    }
}
2409
/// Encodes `bytes` with the standard base64 alphabet (`A-Z a-z 0-9 + /`),
/// padding the output with `=` to a multiple of four characters.
fn base64_encode(bytes: &[u8]) -> String {
    const ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const PAD: char = '=';

    let mut encoded = String::with_capacity(bytes.len().div_ceil(3) * 4);
    for group in bytes.chunks(3) {
        // Pack up to three bytes into the top 24 bits of a word; missing
        // trailing bytes stay zero.
        let packed = group
            .iter()
            .enumerate()
            .fold(0u32, |acc, (index, &byte)| {
                acc | (u32::from(byte) << (16 - 8 * index))
            });
        let sextet = |shift: u32| ALPHABET[((packed >> shift) & 0x3f) as usize] as char;

        encoded.push(sextet(18));
        encoded.push(sextet(12));
        encoded.push(if group.len() > 1 { sextet(6) } else { PAD });
        encoded.push(if group.len() > 2 { sextet(0) } else { PAD });
    }
    encoded
}
2432
2433fn run_ocr_status_command(
2434 client: &mut impl RpcCaller,
2435 collection: String,
2436) -> Result<Value, String> {
2437 let collection_key = find_collection_in_tree(client, &collection)?
2438 .and_then(|node| node.get("key").cloned())
2439 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
2440 let raw = paginate_rpc(
2441 client,
2442 "collections.getItems",
2443 serde_json::json!({"key": collection_key}),
2444 500,
2445 )?;
2446 let items = raw
2447 .get("items")
2448 .and_then(Value::as_array)
2449 .or_else(|| raw.as_array())
2450 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
2451 .clone();
2452
2453 let mut has_ocr = 0usize;
2454 for item in &items {
2455 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
2456 if has_ocr_artifact(client, &item_key)? || has_ocr_note(client, &item_key)? {
2457 has_ocr += 1;
2458 }
2459 }
2460
2461 Ok(serde_json::json!({
2462 "collection": collection,
2463 "total": items.len(),
2464 "has_ocr": has_ocr,
2465 "missing_ocr": items.len() - has_ocr,
2466 }))
2467}
2468
2469fn has_ocr_artifact(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2470 if let Some(item_key) = item_key.as_str() {
2471 if machine_artifact_exists_for_item(
2474 machine_artifact_store_root(),
2475 item_key,
2476 MachineArtifactKind::Chunks,
2477 ) {
2478 return Ok(true);
2479 }
2480 }
2481
2482 let attachments = client.call(
2483 "attachments.list",
2484 Some(serde_json::json!({"parentKey": item_key.clone()})),
2485 )?;
2486 Ok(attachments.as_array().is_some_and(|attachments| {
2487 attachments.iter().any(|attachment| {
2488 let has_sidecar_chunks = attachment
2489 .get("path")
2490 .and_then(Value::as_str)
2491 .map(local_path_from_zotero_path)
2492 .as_deref()
2493 .map(Path::new)
2494 .and_then(Path::parent)
2495 .is_some_and(|dir| {
2496 machine_artifact_exists_in_sidecar(dir, MachineArtifactKind::Chunks)
2497 });
2498 if has_sidecar_chunks {
2499 return true;
2500 }
2501
2502 attachment
2504 .get("title")
2505 .and_then(Value::as_str)
2506 .is_some_and(|title| title.ends_with("zotron-chunks.jsonl"))
2507 })
2508 }))
2509}
2510
2511fn local_path_from_zotero_path(path: &str) -> String {
2512 if is_wsl() && path.as_bytes().get(1) == Some(&b':') {
2513 return ProcessCommand::new("wslpath")
2514 .arg("-u")
2515 .arg(path)
2516 .output()
2517 .ok()
2518 .filter(|output| output.status.success())
2519 .and_then(|output| String::from_utf8(output.stdout).ok())
2520 .map(|converted| converted.trim().to_string())
2521 .filter(|converted| !converted.is_empty())
2522 .unwrap_or_else(|| path.to_string());
2523 }
2524 path.to_string()
2525}
2526
2527fn has_ocr_note(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2528 let notes = client.call(
2529 "notes.list",
2530 Some(serde_json::json!({"parentKey": item_key.clone()})),
2531 )?;
2532 Ok(notes.as_array().is_some_and(|notes| {
2533 notes.iter().any(|note| {
2534 note.get("tags")
2535 .and_then(Value::as_array)
2536 .is_some_and(|tags| tags.iter().any(tag_is_ocr))
2537 })
2538 }))
2539}
2540
2541fn tag_is_ocr(tag: &Value) -> bool {
2542 tag.as_str() == Some("ocr")
2543 || tag
2544 .get("tag")
2545 .and_then(Value::as_str)
2546 .is_some_and(|tag| tag == "ocr")
2547}
2548
2549fn find_collection_in_tree(
2550 client: &mut impl RpcCaller,
2551 collection: &str,
2552) -> Result<Option<Value>, String> {
2553 let tree = client.call("collections.tree", None)?;
2554 let nodes = tree
2555 .as_array()
2556 .ok_or_else(|| "collections.tree returned non-array result".to_string())?;
2557 Ok(search_collection_tree(nodes, collection).cloned())
2558}
2559
2560fn search_collection_tree<'a>(nodes: &'a [Value], collection: &str) -> Option<&'a Value> {
2561 for node in nodes {
2562 if node.get("key").and_then(Value::as_str) == Some(collection)
2563 || node.get("name").and_then(Value::as_str) == Some(collection)
2564 {
2565 return Some(node);
2566 }
2567 if let Some(children) = node.get("children").and_then(Value::as_array) {
2568 if let Some(found) = search_collection_tree(children, collection) {
2569 return Some(found);
2570 }
2571 }
2572 }
2573 None
2574}
2575
2576fn run_command(command: Command, client: &mut impl RpcCaller) -> Result<String, String> {
2577 if let Command::Export(args) = command {
2578 return run_export(args, client);
2579 }
2580
2581 let (value, style) = match command {
2582 Command::Ping { .. } => (
2583 call_json(client, "system.ping", None)?,
2584 JsonStyle::PythonCompact,
2585 ),
2586 Command::Rpc {
2587 method,
2588 params_json,
2589 paginate,
2590 page_size,
2591 ..
2592 } => {
2593 let params = serde_json::from_str::<Value>(¶ms_json)
2594 .map_err(|err| format!("INVALID_JSON: params must be a JSON object: {err}"))?;
2595 if !params.is_object() {
2596 return Err("INVALID_JSON: params must be a JSON object".to_string());
2597 }
2598 if paginate {
2599 (
2600 paginate_rpc(client, &method, params, page_size)?,
2601 JsonStyle::Pretty,
2602 )
2603 } else {
2604 (call_json(client, &method, Some(params))?, JsonStyle::Pretty)
2605 }
2606 }
2607 Command::Push {
2608 json_file,
2609 pdf,
2610 collection,
2611 on_duplicate,
2612 dry_run,
2613 ..
2614 } => return run_push_command(json_file, pdf, collection, on_duplicate, dry_run, client),
2615 Command::System { command } => run_system_command(command, client)?,
2616 Command::Search(args) => {
2617 if let Some(mgmt) = args.management {
2618 run_search_management_command(mgmt, client)?
2619 } else {
2620 run_search(args, client)?
2621 }
2622 }
2623 Command::Items { command } => run_items_command(command, client)?,
2624 Command::Collections { command } => run_collections_command(command, client)?,
2625 Command::Notes { command } => run_notes_command(command, client)?,
2626 Command::Settings { command } => run_settings_command(command, client)?,
2627 Command::Tags { command } => run_tags_command(command, client)?,
2628 Command::Annotations { command } => run_annotations_command(command, client)?,
2629 Command::Ocr { command } => {
2630 return run_ocr_command(command, client);
2631 }
2632 Command::Rag { command } => {
2633 return run_rag_command(command, client);
2634 }
2635 Command::Export(_) => unreachable!("export commands return raw output above"),
2636 };
2637
2638 format_json(&value, style)
2639}
2640
2641fn run_rag_command(command: RagCommand, client: &mut impl RpcCaller) -> Result<String, String> {
2642 match command {
2643 RagCommand::Providers => format_json(
2644 &serde_json::json!({
2645 "providers": [
2646 embedding_provider_spec("volcengine")?,
2647 embedding_provider_spec("alibaba")?,
2648 embedding_provider_spec("custom")?,
2649 ],
2650 }),
2651 JsonStyle::Pretty,
2652 ),
2653 RagCommand::Embed {
2654 provider,
2655 input,
2656 endpoint,
2657 model,
2658 input_type,
2659 api_key_env,
2660 } => {
2661 let value = run_embedding_provider_json_command(
2662 provider,
2663 input,
2664 endpoint,
2665 model,
2666 input_type,
2667 api_key_env,
2668 )?;
2669 format_json(&value, JsonStyle::PythonCompact)
2670 }
2671 RagCommand::Status { collection, .. } => {
2672 let value = rag_status_value(client, &collection)?;
2673 format_json(&value, JsonStyle::PythonCompact)
2674 }
2675 RagCommand::Search {
2676 query,
2677 collection,
2678 keys,
2679 zotero,
2680 top_spans_per_item,
2681 include_fulltext_spans,
2682 top_k,
2683 output,
2684 ..
2685 } => run_rag_search_command(
2686 client,
2687 RagSearchOptions {
2688 query,
2689 collection,
2690 keys,
2691 zotero,
2692 top_spans_per_item,
2693 include_fulltext_spans,
2694 top_k,
2695 output,
2696 },
2697 ),
2698 }
2699}
2700
2701fn run_embedding_provider_json_command(
2702 provider: String,
2703 input: String,
2704 endpoint: Option<String>,
2705 model: Option<String>,
2706 input_type: Option<String>,
2707 api_key_env: Option<String>,
2708) -> Result<Value, String> {
2709 let mut input: EmbeddingRequestInput = read_json_input(&input)?;
2710 if endpoint.is_some() {
2711 input.url = endpoint;
2712 }
2713 if model.is_some() {
2714 input.model = model;
2715 }
2716 if input_type.is_some() {
2717 input.input_type = input_type;
2718 }
2719 let mut transport = provider_http_transport(api_key_env.as_deref())?;
2720 let vectors = execute_embedding_provider_request(&provider, &input, &mut transport)?;
2721
2722 Ok(serde_json::json!({
2723 "provider": provider,
2724 "vectors": vectors,
2725 }))
2726}
2727
2728fn provider_http_transport(api_key_env: Option<&str>) -> Result<UreqProviderHttpTransport, String> {
2729 provider_http_transport_with_auth(api_key_env, "bearer")
2730}
2731
2732fn provider_http_transport_with_auth(
2733 api_key_env: Option<&str>,
2734 auth_scheme: &str,
2735) -> Result<UreqProviderHttpTransport, String> {
2736 let Some(env_name) = api_key_env else {
2737 return Ok(UreqProviderHttpTransport::new());
2738 };
2739 let token = env::var(env_name)
2740 .map_err(|_| format!("missing provider credential env var {env_name}"))?;
2741 if token.trim().is_empty() {
2742 return Err(format!("provider credential env var {env_name} is empty"));
2743 }
2744 let token = token.trim();
2745 match auth_scheme {
2746 "token" if token.starts_with("token ") => {
2747 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2748 }
2749 "token" => Ok(UreqProviderHttpTransport::with_api_key(format!(
2750 "token {token}"
2751 ))),
2752 "bearer" if token.starts_with("Bearer ") => {
2753 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2754 }
2755 "bearer" => Ok(UreqProviderHttpTransport::with_bearer_token(token)),
2756 "none" => Ok(UreqProviderHttpTransport::new()),
2757 other => Err(format!("unsupported provider auth scheme {other}")),
2758 }
2759}
2760
2761fn read_json_input<T: serde::de::DeserializeOwned>(path: &str) -> Result<T, String> {
2762 let payload = if path == "-" {
2763 let mut input = String::new();
2764 io::stdin()
2765 .read_to_string(&mut input)
2766 .map_err(|err| format!("read stdin: {err}"))?;
2767 input
2768 } else {
2769 fs::read_to_string(path).map_err(|err| format!("read {path}: {err}"))?
2770 };
2771 serde_json::from_str::<T>(&payload)
2772 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))
2773}
2774
2775fn fetch_embedding_settings(
2776 client: &mut impl RpcCaller,
2777) -> Result<(String, String, String, String), String> {
2778 let settings = client.call("settings.getAll", None)?;
2779 let provider = settings
2780 .get("embedding.provider")
2781 .and_then(Value::as_str)
2782 .unwrap_or("ollama")
2783 .to_string();
2784 let model = settings
2785 .get("embedding.model")
2786 .and_then(Value::as_str)
2787 .unwrap_or("")
2788 .to_string();
2789 let api_url = settings
2790 .get("embedding.apiUrl")
2791 .and_then(Value::as_str)
2792 .unwrap_or("")
2793 .to_string();
2794 let raw = client.call("settings.getRaw", Some(serde_json::json!({"key": "embedding.apiKey"})))?;
2795 let api_key = raw
2796 .get("embedding.apiKey")
2797 .and_then(Value::as_str)
2798 .unwrap_or("")
2799 .to_string();
2800 Ok((provider, model, api_url, api_key))
2801}
2802
2803fn fetch_retrieval_mode(client: &mut impl RpcCaller) -> String {
2804 client
2805 .call(
2806 "settings.get",
2807 Some(serde_json::json!({"key": "rag.retrievalMode"})),
2808 )
2809 .ok()
2810 .and_then(|v| {
2811 v.get("rag.retrievalMode")
2812 .and_then(Value::as_str)
2813 .map(String::from)
2814 })
2815 .unwrap_or_else(|| "hybrid".to_string())
2816}
2817
2818fn resolve_sidecar_paths(
2819 client: &mut impl RpcCaller,
2820 collection: Option<&str>,
2821 keys: &[String],
2822) -> Result<Vec<(String, String, PathBuf)>, String> {
2823 let items = if !keys.is_empty() {
2824 let mut items = Vec::new();
2825 for key in keys {
2826 let item = client.call("items.get", Some(serde_json::json!({"key": key})))?;
2827 items.push(item);
2828 }
2829 items
2830 } else if let Some(col) = collection {
2831 let col_key = resolve_collection(client, col)?;
2832 let response = client.call(
2833 "collections.getItems",
2834 Some(serde_json::json!({"key": col_key})),
2835 )?;
2836 collection_items(&response)
2837 } else {
2838 return Err("INVALID_ARGS: --collection or --key required".into());
2839 };
2840
2841 let mut results = Vec::new();
2842 for item in &items {
2843 let item_key = item.get("key").and_then(Value::as_str).unwrap_or_default();
2844 let attachments = client.call(
2845 "attachments.list",
2846 Some(serde_json::json!({"parentKey": item_key})),
2847 )?;
2848 let att_list = attachments
2849 .get("items")
2850 .and_then(Value::as_array)
2851 .or_else(|| attachments.as_array())
2852 .cloned()
2853 .unwrap_or_default();
2854 for att in &att_list {
2855 let content_type = att
2856 .get("contentType")
2857 .and_then(Value::as_str)
2858 .unwrap_or("");
2859 if content_type != "application/pdf" {
2860 continue;
2861 }
2862 let att_key = att.get("key").and_then(Value::as_str).unwrap_or_default();
2863 let path = att.get("path").and_then(Value::as_str).unwrap_or_default();
2864 if path.is_empty() {
2865 continue;
2866 }
2867 let local_path = local_path_from_zotero_path(path);
2868 let pdf_path = PathBuf::from(&local_path);
2869 if let Some(parent) = pdf_path.parent() {
2870 let sidecar_root = parent.join(".zotron");
2871 if sidecar_root.exists() {
2872 results.push((item_key.to_string(), att_key.to_string(), sidecar_root));
2873 }
2874 }
2875 }
2876 }
2877 Ok(results)
2878}
2879
2880fn load_sidecar_chunks(sidecar_root: &Path) -> Vec<StructureChunk> {
2881 let chunks_path = sidecar_root.join("chunks").join("chunks.v1.jsonl");
2882 let Ok(content) = fs::read_to_string(&chunks_path) else {
2883 return Vec::new();
2884 };
2885 content
2886 .lines()
2887 .filter(|line| !line.trim().is_empty())
2888 .filter_map(|line| serde_json::from_str::<StructureChunk>(line).ok())
2889 .collect()
2890}
2891
/// Derives the JSONL file name under which vectors for a provider/model pair
/// are stored.
///
/// Both parts are trimmed, lower-cased and have `/` replaced by `-`, then
/// joined as `<provider>--<model>.jsonl`. When both parts are empty after
/// normalisation the name is `vectors.jsonl`.
fn embedding_vector_filename(provider: &str, model: &str) -> String {
    /// Normalises one component of the file name.
    fn slug(raw: &str) -> String {
        raw.trim().to_lowercase().replace('/', "-")
    }

    match (slug(provider), slug(model)) {
        (p, m) if p.is_empty() && m.is_empty() => "vectors.jsonl".to_string(),
        (p, m) => format!("{p}--{m}.jsonl"),
    }
}
2900
2901fn load_sidecar_vectors(sidecar_root: &Path, provider: &str, model: &str) -> Vec<EmbeddingVector> {
2902 let embeddings_dir = sidecar_root.join("embeddings");
2903 let target = embedding_vector_filename(provider, model);
2904 let target_path = embeddings_dir.join(&target);
2905 if let Ok(content) = fs::read_to_string(&target_path) {
2906 let vecs: Vec<EmbeddingVector> = content
2907 .lines()
2908 .filter(|line| !line.trim().is_empty())
2909 .filter_map(|line| serde_json::from_str(line).ok())
2910 .collect();
2911 if !vecs.is_empty() {
2912 return vecs;
2913 }
2914 }
2915 for legacy in &["vectors.v1.jsonl", "vectors.jsonl"] {
2917 let path = embeddings_dir.join(legacy);
2918 if let Ok(content) = fs::read_to_string(&path) {
2919 let vecs: Vec<EmbeddingVector> = content
2920 .lines()
2921 .filter(|line| !line.trim().is_empty())
2922 .filter_map(|line| serde_json::from_str::<EmbeddingVector>(line).ok())
2923 .filter(|v| v.source_provider == provider || provider.is_empty())
2924 .collect();
2925 if !vecs.is_empty() {
2926 return vecs;
2927 }
2928 }
2929 }
2930 Vec::new()
2931}
2932
2933fn embed_query_text(
2934 query: &str,
2935 provider: &str,
2936 model: &str,
2937 api_url: &str,
2938 api_key: &str,
2939) -> Result<Vec<f64>, String> {
2940 let input = EmbeddingRequestInput {
2941 item_key: "query".to_string(),
2942 chunks: vec![EmbeddingChunkInput {
2943 chunk_key: "q0".to_string(),
2944 text: query.to_string(),
2945 }],
2946 model: if model.is_empty() {
2947 None
2948 } else {
2949 Some(model.to_string())
2950 },
2951 url: if api_url.is_empty() {
2952 None
2953 } else {
2954 Some(api_url.to_string())
2955 },
2956 input_type: Some("query".to_string()),
2957 };
2958 let request = build_embedding_provider_request(provider, &input)?;
2959 let url = request
2960 .url
2961 .as_deref()
2962 .ok_or("no embedding URL configured")?;
2963 let mut http = ureq::post(url).set("Content-Type", "application/json");
2964 if let Some(auth) = request.auth_header {
2965 if !api_key.is_empty() {
2966 http = http.set(auth, &format!("Bearer {api_key}"));
2967 }
2968 }
2969 let resp = http
2970 .send_json(&request.body)
2971 .map_err(|e| format!("embedding request failed: {e}"))?;
2972 let payload: Value = resp
2973 .into_json()
2974 .map_err(|e| format!("embedding response parse: {e}"))?;
2975 let vectors =
2976 parse_embedding_provider_response(provider, &payload, "query", &input.chunks)?;
2977 vectors
2978 .into_iter()
2979 .next()
2980 .map(|v| v.vector)
2981 .ok_or_else(|| "no embedding vector returned".to_string())
2982}
2983
2984fn run_rag_search_xpi_fallback(
2985 client: &mut impl RpcCaller,
2986 options: &RagSearchOptions,
2987) -> Result<String, String> {
2988 let mut params = serde_json::json!({
2989 "query": options.query,
2990 "limit": options.top_k,
2991 "top_spans_per_item": options.top_spans_per_item,
2992 "include_fulltext_spans": options.include_fulltext_spans,
2993 });
2994 if let Some(map) = params.as_object_mut() {
2995 if let Some(col) = &options.collection {
2996 map.insert("collection".into(), Value::String(col.clone()));
2997 }
2998 if !options.keys.is_empty() {
2999 map.insert(
3000 "keys".into(),
3001 Value::Array(options.keys.iter().map(|k| Value::String(k.clone())).collect()),
3002 );
3003 }
3004 }
3005 let payload = client.call("rag.searchHits", Some(params))?;
3006 let hits = payload
3007 .get("hits")
3008 .and_then(Value::as_array)
3009 .cloned()
3010 .unwrap_or_default();
3011 if options.output == "jsonl" {
3012 let mut out = String::new();
3013 for hit in &hits {
3014 out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
3015 out.push('\n');
3016 }
3017 Ok(out)
3018 } else {
3019 let total = hits.len() as u64;
3020 format_json(
3021 &normalize_list_envelope(
3022 serde_json::json!({"items": hits, "total": total}),
3023 "items",
3024 Some(options.top_k),
3025 0,
3026 ),
3027 JsonStyle::Pretty,
3028 )
3029 }
3030}
3031
/// Runs `rag search`, preferring local sidecar data and falling back to the
/// `rag.searchHits` RPC.
///
/// Flow:
/// 1. With `options.zotero` set, validate the scope and delegate straight to
///    `run_rag_search_xpi_fallback`.
/// 2. Otherwise resolve the `.zotron` sidecars for the requested collection or
///    keys, and load their chunks and stored vectors. If no sidecar or no
///    chunk is found, use the RPC fallback.
/// 3. Rank chunks lexically (`bm25_score_chunks`) and/or densely (cosine
///    similarity against an embedding of the query), depending on the
///    `rag.retrievalMode` setting, and merge both rankings with `rrf_merge`
///    when both are non-empty.
/// 4. Cap the number of hits per item, attach item metadata fetched with
///    `items.get`, and render as JSONL or as a pretty list envelope.
///
/// # Errors
/// * `INVALID_ARGS` when neither a collection nor keys are given.
/// * The `COLLECTION_NOT_FOUND` error from sidecar resolution is surfaced
///   as-is; other resolution errors trigger the RPC fallback instead.
/// * Errors from reading embedding settings, serialisation, formatting, or the
///   fallback itself.
fn run_rag_search_command(
    client: &mut impl RpcCaller,
    options: RagSearchOptions,
) -> Result<String, String> {
    if options.zotero {
        // Explicit request to search inside Zotero: skip all local work.
        if options.collection.is_none() && options.keys.is_empty() {
            return Err(
                "INVALID_ARGS: --collection or --key is required".to_string(),
            );
        }
        return run_rag_search_xpi_fallback(client, &options);
    }

    if options.collection.is_none() && options.keys.is_empty() {
        // A search scope is mandatory for the local path as well.
        return Err("INVALID_ARGS: --collection or --key required".to_string());
    }

    let sidecars = resolve_sidecar_paths(
        // Find the `.zotron` directories belonging to the scoped items.
        client,
        options.collection.as_deref(),
        &options.keys,
    );

    let sidecars = match sidecars {
        // Only a non-empty result is searched locally. An unknown collection
        // is a hard error; any other failure, or an empty result, defers to
        // the RPC fallback.
        Ok(ref s) if !s.is_empty() => s,
        Err(ref e) if e.contains("COLLECTION_NOT_FOUND") => return Err(e.clone()),
        _ => return run_rag_search_xpi_fallback(client, &options),
    };

    let (emb_provider, emb_model, emb_url, emb_key) = fetch_embedding_settings(client)?;
    // Pool chunks and vectors from every sidecar into flat lists; positions in
    // `all_chunks` are the indices used by the rankings below.
    let mut all_chunks: Vec<StructureChunk> = Vec::new();
    let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
    for (_item_key, _att_key, sidecar_root) in sidecars {
        all_chunks.extend(load_sidecar_chunks(sidecar_root));
        all_vectors.extend(load_sidecar_vectors(sidecar_root, &emb_provider, &emb_model));
    }

    if all_chunks.is_empty() {
        return run_rag_search_xpi_fallback(client, &options);
    }

    let mode = fetch_retrieval_mode(client);
    // `mode` selects the rankers: "dense" disables the lexical ranking and
    // "lexical" disables the dense one; any other value enables both.

    let bm25_ranked = if mode != "dense" {
        // 1.2 and 0.75 are presumably the BM25 `k1` and `b` parameters —
        // TODO confirm against `bm25_score_chunks`.
        bm25_score_chunks(&all_chunks, &options.query, 1.2, 0.75)
    } else {
        Vec::new()
    };

    let dense_ranked = if mode != "lexical" && !all_vectors.is_empty() {
        // Dense ranking needs stored vectors and an embedding of the query. A
        // failed embedding request silently yields an empty dense ranking.
        match embed_query_text(&options.query, &emb_provider, &emb_model, &emb_url, &emb_key) {
            Ok(query_vec) => {
                // Index stored vectors by chunk key; for duplicate keys the
                // last vector collected wins.
                let vec_map: std::collections::HashMap<&str, &[f64]> = all_vectors
                    .iter()
                    .map(|v| (v.chunk_key.as_str(), v.vector.as_slice()))
                    .collect();
                // Score every chunk that has a stored vector, keep strictly
                // positive similarities, and sort best-first.
                let mut scores: Vec<(usize, f64)> = all_chunks
                    .iter()
                    .enumerate()
                    .filter_map(|(i, chunk)| {
                        vec_map.get(chunk.chunk_key.as_str()).map(|stored| {
                            (i, cosine_similarity(&query_vec, stored))
                        })
                    })
                    .filter(|(_, s)| *s > 0.0)
                    .collect();
                scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
                scores
            }
            Err(_) => Vec::new(),
        }
    } else {
        Vec::new()
    };

    let limit = options.top_k as usize;
    // Merge both rankings when available, else use whichever one is non-empty.
    // 60.0 is presumably the reciprocal-rank-fusion constant — TODO confirm
    // against `rrf_merge`.
    // NOTE(review): the list is cut to `limit` here, before the per-item cap
    // below, so the final output can hold fewer than `top_k` hits even when
    // more eligible chunks exist — confirm this is intended.
    let ranked = if !bm25_ranked.is_empty() && !dense_ranked.is_empty() {
        rrf_merge(&bm25_ranked, &dense_ranked, 60.0, limit)
    } else if !bm25_ranked.is_empty() {
        bm25_ranked.into_iter().take(limit).collect()
    } else {
        dense_ranked.into_iter().take(limit).collect()
    };

    let mut per_item_count: std::collections::HashMap<&str, u64> =
        // Number of hits already selected per item key.
        std::collections::HashMap::new();
    let mut selected: Vec<(usize, f64)> = Vec::new();
    for (idx, score) in &ranked {
        let item_key = all_chunks[*idx].item_key.as_str();
        let count = per_item_count.entry(item_key).or_insert(0);
        // Keep at most `top_spans_per_item` chunks per item, in rank order.
        if *count < options.top_spans_per_item {
            *count += 1;
            selected.push((*idx, *score));
        }
    }

    let mut meta_cache: std::collections::HashMap<String, Value> =
        // Item metadata is fetched once per item key and reused.
        std::collections::HashMap::new();
    let mut hits: Vec<Value> = Vec::new();
    for (idx, score) in &selected {
        let chunk = &all_chunks[*idx];
        let meta = if let Some(cached) = meta_cache.get(&chunk.item_key) {
            cached.clone()
        } else {
            // A failed lookup is cached as `null`, so the hit is still emitted
            // with empty metadata.
            let fetched = client
                .call(
                    "items.get",
                    Some(serde_json::json!({"key": chunk.item_key})),
                )
                .unwrap_or(Value::Null);
            meta_cache.insert(chunk.item_key.clone(), fetched.clone());
            fetched
        };
        let title = meta
            .get("title")
            .and_then(Value::as_str)
            .unwrap_or("")
            .to_string();
        // Creators are rendered as last name immediately followed by first
        // name (no separator), joined with ", "; creators with neither part
        // are omitted.
        let authors = meta
            .get("creators")
            .and_then(Value::as_array)
            .map(|creators| {
                creators
                    .iter()
                    .filter_map(|c| {
                        let last = c.get("lastName").and_then(Value::as_str).unwrap_or("");
                        let first = c.get("firstName").and_then(Value::as_str).unwrap_or("");
                        if last.is_empty() && first.is_empty() {
                            None
                        } else {
                            Some(format!("{last}{first}"))
                        }
                    })
                    .collect::<Vec<_>>()
                    .join(", ")
            })
            .unwrap_or_default();
        // `year` carries the item's `date` string unmodified.
        let year = meta.get("date").and_then(Value::as_str).unwrap_or("");
        let mut hit = serde_json::json!({
            "item_key": chunk.item_key,
            "chunk_key": chunk.chunk_key,
            "title": title,
            "authors": authors,
            "year": year,
            "text": chunk.text,
            "page_range": chunk.page_range,
            "section_path": chunk.section_path,
            "score": score,
        });
        if options.include_fulltext_spans {
            // `hit` was just built as a JSON object, so `as_object_mut` cannot
            // return `None` here.
            hit.as_object_mut().unwrap().insert(
                "attachment_key".to_string(),
                Value::String(chunk.attachment_key.clone()),
            );
        }
        hits.push(hit);
    }

    if options.output == "jsonl" {
        // One compact JSON document per line.
        let mut out = String::new();
        for hit in &hits {
            out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
            out.push('\n');
        }
        Ok(out)
    } else {
        let total = hits.len() as u64;
        format_json(
            &normalize_list_envelope(
                serde_json::json!({"items": hits, "total": total}),
                "items",
                Some(options.top_k),
                0,
            ),
            JsonStyle::Pretty,
        )
    }
}
3223
3224fn rag_status_value(client: &mut impl RpcCaller, collection: &str) -> Result<Value, String> {
3225 let raw_store_path = rag_store_path(collection);
3226 if raw_store_path.exists() {
3227 return rag_status_from_store(collection, &raw_store_path);
3228 }
3229
3230 let mut store_candidates = Vec::new();
3231 let collection_match = find_collection_in_tree(client, collection)?;
3232 if let Some(collection_node) = collection_match.as_ref() {
3233 if let Some(name) = collection_node.get("name").and_then(Value::as_str) {
3234 store_candidates.push(rag_store_path(name));
3235 }
3236 if let Some(key) = collection_node.get("key").and_then(Value::as_str) {
3237 store_candidates.push(rag_store_path(key));
3238 }
3239 }
3240 for store_path in unique_paths(store_candidates) {
3241 if store_path.exists() {
3242 return rag_status_from_store(collection, &store_path);
3243 }
3244 }
3245
3246 rag_status_from_zotero_sidecars(client, collection, collection_match)
3247}
3248
/// Removes duplicate paths while keeping the first occurrence of each in its
/// original position.
fn unique_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
    paths.into_iter().fold(Vec::new(), |mut kept, candidate| {
        if !kept.contains(&candidate) {
            kept.push(candidate);
        }
        kept
    })
}
3258
3259fn rag_status_from_store(collection: &str, store_path: &Path) -> Result<Value, String> {
3260 let raw = fs::read_to_string(store_path)
3261 .map_err(|err| format!("read RAG store {}: {err}", store_path.display()))?;
3262 let store: Value = serde_json::from_str(&raw)
3263 .map_err(|err| format!("parse RAG store {}: {err}", store_path.display()))?;
3264 let chunks = store
3265 .get("chunks")
3266 .and_then(Value::as_array)
3267 .cloned()
3268 .unwrap_or_default();
3269 let mut item_keys = Vec::<Value>::new();
3270 for chunk in &chunks {
3271 let Some(item_key) = chunk.get("item_key") else {
3272 continue;
3273 };
3274 if !item_keys.iter().any(|seen| seen == item_key) {
3275 item_keys.push(item_key.clone());
3276 }
3277 }
3278 Ok(serde_json::json!({
3279 "status": "indexed",
3280 "collection": store.get("collection").and_then(Value::as_str).unwrap_or(collection),
3281 "collection_key": store.get("collection_key").cloned().unwrap_or(Value::Null),
3282 "model": store.get("model").cloned().unwrap_or(Value::String("unknown".to_string())),
3283 "total_chunks": chunks.len(),
3284 "total_items": item_keys.len(),
3285 "store_path": store_path.to_string_lossy(),
3286 }))
3287}
3288
3289fn rag_status_from_zotero_sidecars(
3290 client: &mut impl RpcCaller,
3291 collection: &str,
3292 collection_match: Option<Value>,
3293) -> Result<Value, String> {
3294 let collection_key = collection_match
3295 .as_ref()
3296 .and_then(|node| node.get("key").cloned())
3297 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
3298 let raw = paginate_rpc(
3299 client,
3300 "collections.getItems",
3301 serde_json::json!({"key": collection_key}),
3302 500,
3303 )?;
3304 let items = raw
3305 .get("items")
3306 .and_then(Value::as_array)
3307 .or_else(|| raw.as_array())
3308 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
3309 .clone();
3310
3311 let mut indexed_items = 0usize;
3312 let mut total_chunks = 0usize;
3313 for item in &items {
3314 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
3315 let chunk_count = sidecar_chunk_count_for_item(client, &item_key)?;
3316 if chunk_count > 0 {
3317 indexed_items += 1;
3318 total_chunks += chunk_count;
3319 }
3320 }
3321
3322 if indexed_items == 0 {
3323 return Ok(serde_json::json!({
3324 "status": "not indexed",
3325 "collection": collection,
3326 "total_items": items.len(),
3327 "indexed_items": 0,
3328 }));
3329 }
3330
3331 Ok(serde_json::json!({
3332 "status": "indexed",
3333 "collection": collection,
3334 "total_chunks": total_chunks,
3335 "total_items": indexed_items,
3336 "collection_items": items.len(),
3337 "source": "zotero-sidecar",
3338 }))
3339}
3340
3341fn sidecar_chunk_count_for_item(
3342 client: &mut impl RpcCaller,
3343 item_key: &Value,
3344) -> Result<usize, String> {
3345 let attachments = client.call(
3346 "attachments.list",
3347 Some(serde_json::json!({"parentKey": item_key.clone()})),
3348 )?;
3349 let Some(attachments) = attachments.as_array() else {
3350 return Ok(0);
3351 };
3352
3353 let mut count = 0usize;
3354 for attachment in attachments {
3355 let Some(path) = attachment.get("path").and_then(Value::as_str) else {
3356 continue;
3357 };
3358 let local = local_path_from_zotero_path(path);
3359 let Some(dir) = Path::new(&local).parent() else {
3360 continue;
3361 };
3362 let Ok(bytes) = read_machine_artifact_sidecar(dir, MachineArtifactKind::Chunks) else {
3363 continue;
3364 };
3365 let text = String::from_utf8_lossy(&bytes);
3366 count += text.lines().filter(|line| !line.trim().is_empty()).count();
3367 }
3368 Ok(count)
3369}
3370
3371fn rag_store_path(collection: &str) -> PathBuf {
3372 rag_store_root().join(format!("{collection}.json"))
3373}
3374
3375fn rag_store_root() -> PathBuf {
3376 let xdg_data_home = env::var_os("XDG_DATA_HOME")
3377 .filter(|path| !path.is_empty())
3378 .map(PathBuf::from);
3379 let appdata = env::var_os("APPDATA")
3380 .filter(|path| !path.is_empty())
3381 .map(PathBuf::from);
3382 let userprofile = env::var_os("USERPROFILE")
3383 .filter(|path| !path.is_empty())
3384 .map(PathBuf::from);
3385 let home = env::var_os("HOME")
3386 .filter(|path| !path.is_empty())
3387 .map(PathBuf::from);
3388
3389 rag_store_root_for_platform(
3390 ArtifactStorePlatform::current(),
3391 xdg_data_home.as_deref(),
3392 appdata.as_deref(),
3393 userprofile.as_deref(),
3394 home.as_deref(),
3395 )
3396}
3397
3398fn rag_store_root_for_platform(
3399 platform: ArtifactStorePlatform,
3400 xdg_data_home: Option<&Path>,
3401 appdata: Option<&Path>,
3402 userprofile: Option<&Path>,
3403 home: Option<&Path>,
3404) -> PathBuf {
3405 match platform {
3406 ArtifactStorePlatform::Windows => {
3407 if let Some(path) = appdata {
3408 return path.join("Zotron").join("rag");
3409 }
3410 if let Some(path) = userprofile {
3411 return path
3412 .join("AppData")
3413 .join("Roaming")
3414 .join("Zotron")
3415 .join("rag");
3416 }
3417 if let Some(path) = home {
3418 return path
3419 .join("AppData")
3420 .join("Roaming")
3421 .join("Zotron")
3422 .join("rag");
3423 }
3424 PathBuf::from(".zotron").join("rag")
3425 }
3426 ArtifactStorePlatform::Macos => {
3427 if let Some(path) = home {
3428 return path
3429 .join("Library")
3430 .join("Application Support")
3431 .join("Zotron")
3432 .join("rag");
3433 }
3434 if let Some(path) = xdg_data_home {
3435 return path.join("zotron").join("rag");
3436 }
3437 PathBuf::from(".zotron").join("rag")
3438 }
3439 ArtifactStorePlatform::Linux | ArtifactStorePlatform::Other => xdg_data_home
3440 .map(|path| path.join("zotron").join("rag"))
3441 .or_else(|| {
3442 home.map(|path| path.join(".local").join("share").join("zotron").join("rag"))
3443 })
3444 .unwrap_or_else(|| PathBuf::from(".zotron").join("rag")),
3445 }
3446}
3447
3448fn run_push_command(
3449 json_file: String,
3450 pdf: Option<String>,
3451 collection: Option<String>,
3452 on_duplicate: String,
3453 dry_run: bool,
3454 client: &mut impl RpcCaller,
3455) -> Result<String, String> {
3456 if !matches!(on_duplicate.as_str(), "skip" | "update" | "create") {
3457 return Err(format!(
3458 "INVALID_ARGS: --on-duplicate must be skip|update|create, got {on_duplicate:?}"
3459 ));
3460 }
3461
3462 let payload = if json_file == "-" {
3463 let mut input = String::new();
3464 io::stdin()
3465 .read_to_string(&mut input)
3466 .map_err(|err| format!("read stdin: {err}"))?;
3467 input
3468 } else {
3469 fs::read_to_string(&json_file).map_err(|err| format!("read {json_file}: {err}"))?
3470 };
3471 let item_json = serde_json::from_str::<Value>(&payload)
3472 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
3473
3474 match item_json.get("itemType").and_then(Value::as_str) {
3476 Some(s) if !s.is_empty() => {}
3477 _ => return Err("INVALID_ARGS: input JSON must include a non-empty \"itemType\" field".to_string()),
3478 }
3479
3480 if dry_run {
3481 let collection_key = collection
3482 .as_deref()
3483 .map(|name| resolve_collection(client, name))
3484 .transpose()?;
3485 return format_json(
3486 &serde_json::json!({
3487 "ok": true,
3488 "dryRun": true,
3489 "wouldPush": {
3490 "title": item_json.get("title").cloned().unwrap_or(Value::Null),
3491 "itemType": item_json.get("itemType").cloned().unwrap_or(Value::Null),
3492 "collectionKey": collection_key,
3493 "pdfPath": pdf,
3494 "onDuplicate": on_duplicate,
3495 }
3496 }),
3497 JsonStyle::PythonCompact,
3498 );
3499 }
3500
3501 let result = push_item(
3502 client,
3503 &item_json,
3504 pdf.as_deref(),
3505 collection.as_deref(),
3506 &on_duplicate,
3507 )?;
3508 format_json(&result, JsonStyle::PythonCompact)
3509}
3510
3511fn push_item(
3512 client: &mut impl RpcCaller,
3513 item_json: &Value,
3514 pdf_path: Option<&str>,
3515 collection: Option<&str>,
3516 on_duplicate: &str,
3517) -> Result<Value, String> {
3518 let pdf_size = if let Some(path) = pdf_path {
3519 validate_pdf_magic(path)?
3520 } else {
3521 0
3522 };
3523
3524 let collection_key = match collection {
3525 Some(name) => resolve_collection(client, name)?,
3526 None => resolve_current_collection(client)?,
3527 };
3528
3529 let dup_id = find_duplicate(client, item_json)?;
3530 if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "skip") {
3531 if !is_library_root(&collection_key) {
3532 client.call(
3533 "collections.addItems",
3534 Some(serde_json::json!({"key": collection_key, "keys": [dup_id]})),
3535 )?;
3536 }
3537 let mut pdf_attached = false;
3538 if let Some(path) = pdf_path {
3539 if !item_has_pdf_attachment(client, dup_id)? {
3540 attach_pdf(client, dup_id, path)?;
3541 pdf_attached = true;
3542 }
3543 }
3544 return Ok(push_result(
3545 "skipped_duplicate",
3546 Some(dup_id.to_string()),
3547 pdf_attached,
3548 if pdf_attached { pdf_size } else { 0 },
3549 Value::Null,
3550 ));
3551 }
3552
3553 let xpi_payload = to_xpi_payload(item_json, Some(&collection_key));
3554 let (item_key, status) =
3555 if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "update") {
3556 let mut params = serde_json::Map::new();
3557 params.insert("key".to_string(), Value::String(dup_id.to_string()));
3558 params.insert(
3559 "fields".to_string(),
3560 xpi_payload
3561 .get("fields")
3562 .cloned()
3563 .unwrap_or_else(|| serde_json::json!({})),
3564 );
3565 if let Some(creators) = xpi_payload.get("creators") {
3566 params.insert("creators".to_string(), creators.clone());
3567 }
3568 if let Some(tags) = xpi_payload.get("tags") {
3569 params.insert("tags".to_string(), tags.clone());
3570 }
3571 client.call("items.update", Some(Value::Object(params)))?;
3572 (dup_id.to_string(), "updated")
3573 } else {
3574 let created = client.call("items.create", Some(xpi_payload))?;
3575 let key = created
3576 .get("key")
3577 .and_then(Value::as_str)
3578 .ok_or_else(|| format!("items.create returned unexpected shape: {created:?}"))?;
3579 (key.to_string(), "created")
3580 };
3581
3582 let mut pdf_attached = false;
3583 if let Some(path) = pdf_path {
3584 if status != "updated" || !item_has_pdf_attachment(client, &item_key)? {
3585 attach_pdf(client, &item_key, path)?;
3586 pdf_attached = true;
3587 }
3588 }
3589
3590 if status == "updated" && !is_library_root(&collection_key) {
3591 client.call(
3592 "collections.addItems",
3593 Some(serde_json::json!({"key": collection_key, "keys": [item_key]})),
3594 )?;
3595 }
3596
3597 Ok(push_result(
3598 status,
3599 Some(item_key),
3600 pdf_attached,
3601 if pdf_attached { pdf_size } else { 0 },
3602 Value::Null,
3603 ))
3604}
3605
/// Checks that `path` is a readable file starting with the `%PDF-` magic bytes
/// and returns its size in bytes.
///
/// Only the first few bytes are read; the size comes from the file metadata,
/// so large PDFs are not loaded into memory just to be validated.
///
/// # Errors
/// Returns an `INVALID_PDF` error string when the file cannot be opened or
/// read, or when it does not begin with `%PDF-` (including files shorter than
/// the magic sequence).
fn validate_pdf_magic(path: &str) -> Result<u64, String> {
    const PDF_MAGIC: &[u8] = b"%PDF-";
    let unreadable = |e: io::Error| format!("INVALID_PDF: cannot read {path}: {e}");

    let file = fs::File::open(path).map_err(&unreadable)?;
    let size = file.metadata().map_err(&unreadable)?.len();

    // `take` caps the read at the magic length; a shorter file simply yields
    // fewer bytes and fails the comparison below.
    let mut header = Vec::with_capacity(PDF_MAGIC.len());
    file.take(PDF_MAGIC.len() as u64)
        .read_to_end(&mut header)
        .map_err(&unreadable)?;

    if header.as_slice() != PDF_MAGIC {
        return Err(format!(
            "INVALID_PDF: {path} does not start with %PDF- magic bytes"
        ));
    }
    Ok(size)
}
3616
3617fn resolve_current_collection(client: &mut impl RpcCaller) -> Result<Value, String> {
3618 let selected = client.call("system.currentCollection", None)?;
3619 Ok(selected
3620 .get("key")
3621 .cloned()
3622 .unwrap_or_else(|| Value::Number(0.into())))
3623}
3624
3625fn find_duplicate(
3626 client: &mut impl RpcCaller,
3627 item_json: &Value,
3628) -> Result<Option<String>, String> {
3629 if let Some(doi) = item_json
3630 .get("DOI")
3631 .and_then(Value::as_str)
3632 .filter(|doi| !doi.is_empty())
3633 {
3634 let hits = client.call("search.byIdentifier", Some(serde_json::json!({"doi": doi})))?;
3635 if let Some(key) = first_hit_key(&hits) {
3636 return Ok(Some(key));
3637 }
3638 }
3639
3640 if let Some(title) = item_json
3641 .get("title")
3642 .and_then(Value::as_str)
3643 .filter(|title| title.len() >= 10)
3644 {
3645 let hits = client.call(
3646 "search.quick",
3647 Some(serde_json::json!({"query": title, "limit": 20})),
3648 )?;
3649 if let Some(items) = response_items(&hits) {
3650 for item in items {
3651 if item.get("title").and_then(Value::as_str) == Some(title) {
3652 if let Some(key) = item.get("key").and_then(Value::as_str) {
3653 return Ok(Some(key.to_string()));
3654 }
3655 }
3656 }
3657 }
3658 }
3659
3660 Ok(None)
3661}
3662
3663fn first_hit_key(response: &Value) -> Option<String> {
3664 response_items(response)?
3665 .first()?
3666 .get("key")?
3667 .as_str()
3668 .map(ToString::to_string)
3669}
3670
3671fn response_items(response: &Value) -> Option<&Vec<Value>> {
3672 response
3673 .get("items")
3674 .and_then(Value::as_array)
3675 .or_else(|| response.as_array())
3676}
3677
3678fn to_xpi_payload(item_json: &Value, collection_key: Option<&Value>) -> Value {
3679 const NON_FIELD_KEYS: &[&str] = &[
3680 "itemType",
3681 "creators",
3682 "tags",
3683 "collections",
3684 "attachments",
3685 "relations",
3686 "notes",
3687 "id",
3688 "key",
3689 "version",
3690 ];
3691
3692 let mut fields = serde_json::Map::new();
3693 if let Some(item) = item_json.as_object() {
3694 for (key, value) in item {
3695 if !NON_FIELD_KEYS.contains(&key.as_str()) && !value.is_null() && value != "" {
3696 fields.insert(key.clone(), value.clone());
3697 }
3698 }
3699 }
3700
3701 let mut payload = serde_json::Map::new();
3702 payload.insert(
3703 "itemType".to_string(),
3704 item_json
3705 .get("itemType")
3706 .cloned()
3707 .unwrap_or_else(|| Value::String("journalArticle".to_string())),
3708 );
3709 payload.insert("fields".to_string(), Value::Object(fields));
3710
3711 if let Some(creators) = item_json.get("creators").and_then(Value::as_array) {
3712 if !creators.is_empty() {
3713 payload.insert(
3714 "creators".to_string(),
3715 Value::Array(
3716 creators
3717 .iter()
3718 .map(|creator| {
3719 let mut c = serde_json::json!({
3720 "firstName": creator.get("firstName").and_then(Value::as_str).unwrap_or(""),
3721 "lastName": creator.get("lastName").and_then(Value::as_str).unwrap_or(""),
3722 "creatorType": creator.get("creatorType").and_then(Value::as_str).unwrap_or("author"),
3723 });
3724 if let Some(fm) = creator.get("fieldMode").and_then(Value::as_u64) {
3725 c["fieldMode"] = Value::from(fm);
3726 }
3727 c
3728 })
3729 .collect(),
3730 ),
3731 );
3732 }
3733 }
3734
3735 if let Some(tags) = item_json.get("tags").and_then(Value::as_array) {
3736 if !tags.is_empty() {
3737 payload.insert(
3738 "tags".to_string(),
3739 Value::Array(
3740 tags.iter()
3741 .map(|tag| tag.get("tag").cloned().unwrap_or_else(|| tag.clone()))
3742 .collect(),
3743 ),
3744 );
3745 }
3746 }
3747
3748 if let Some(collection_key) = collection_key.filter(|key| !is_library_root(key)) {
3749 payload.insert(
3750 "collections".to_string(),
3751 Value::Array(vec![collection_key.clone()]),
3752 );
3753 }
3754
3755 Value::Object(payload)
3756}
3757
3758fn item_has_pdf_attachment(client: &mut impl RpcCaller, item_key: &str) -> Result<bool, String> {
3759 let attachments = client.call(
3760 "attachments.list",
3761 Some(serde_json::json!({"parentKey": item_key})),
3762 )?;
3763 Ok(has_pdf_attachment(&attachments))
3764}
3765
3766fn attach_pdf(client: &mut impl RpcCaller, item_key: &str, path: &str) -> Result<(), String> {
3767 client.call(
3768 "attachments.add",
3769 Some(serde_json::json!({
3770 "parentKey": item_key,
3771 "path": zotero_path(path),
3772 "title": "Full Text PDF",
3773 })),
3774 )?;
3775 Ok(())
3776}
3777
3778fn zotero_path(path: &str) -> String {
3779 let path = Path::new(path)
3780 .canonicalize()
3781 .unwrap_or_else(|_| Path::new(path).to_path_buf())
3782 .to_string_lossy()
3783 .into_owned();
3784 if is_wsl() {
3785 return ProcessCommand::new("wslpath")
3786 .arg("-w")
3787 .arg(&path)
3788 .output()
3789 .ok()
3790 .filter(|output| output.status.success())
3791 .and_then(|output| String::from_utf8(output.stdout).ok())
3792 .map(|converted| converted.trim().to_string())
3793 .filter(|converted| !converted.is_empty())
3794 .unwrap_or(path);
3795 }
3796 path
3797}
3798
/// Detects whether the process runs under WSL.
///
/// True when `WSL_DISTRO_NAME` is set, or when
/// `/proc/sys/kernel/osrelease` is readable and contains `microsoft`
/// (ASCII case-insensitive).
fn is_wsl() -> bool {
    env::var_os("WSL_DISTRO_NAME").is_some()
        || matches!(
            fs::read_to_string("/proc/sys/kernel/osrelease"),
            Ok(release) if release.to_ascii_lowercase().contains("microsoft")
        )
}
3807
3808fn is_library_root(value: &Value) -> bool {
3809 value.as_i64() == Some(0) || value.as_u64() == Some(0)
3810}
3811
3812fn push_result(
3813 status: &str,
3814 zotero_item_key: Option<String>,
3815 pdf_attached: bool,
3816 pdf_size_bytes: u64,
3817 error: Value,
3818) -> Value {
3819 serde_json::json!({
3820 "status": status,
3821 "zotero_item_key": zotero_item_key,
3822 "pdf_attached": pdf_attached,
3823 "pdf_size_bytes": pdf_size_bytes,
3824 "error": error,
3825 })
3826}
3827
3828fn run_search(
3829 args: SearchArgs,
3830 client: &mut impl RpcCaller,
3831) -> Result<(Value, JsonStyle), String> {
3832 let SearchArgs {
3833 query, fulltext, author, after, before, journal, tag,
3834 doi, isbn, issn, collection, limit, offset, ..
3835 } = args;
3836
3837 let has_identifier = doi.is_some() || isbn.is_some() || issn.is_some();
3838 if has_identifier {
3839 let mut params = serde_json::Map::new();
3840 if let Some(doi) = doi { params.insert("doi".into(), Value::String(doi)); }
3841 if let Some(isbn) = isbn { params.insert("isbn".into(), Value::String(isbn)); }
3842 if let Some(issn) = issn { params.insert("issn".into(), Value::String(issn)); }
3843 let value = client.call("search.byIdentifier", Some(Value::Object(params)))?;
3844 return Ok((normalize_list_envelope(value, "items", None, 0), JsonStyle::Pretty));
3845 }
3846
3847 if fulltext {
3848 let query = query.ok_or("INVALID_ARGS: --fulltext requires a search query")?;
3849 let mut params = serde_json::json!({"query": query, "limit": limit});
3850 if let (Some(col), Some(map)) = (collection, params.as_object_mut()) {
3851 map.insert("collection".into(), resolve_collection(client, &col)?);
3852 }
3853 let value = client.call("search.fulltext", Some(params))?;
3854 return Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty));
3855 }
3856
3857 let has_filters = author.is_some() || after.is_some() || before.is_some()
3858 || journal.is_some() || tag.is_some();
3859 if has_filters {
3860 let mut conditions: Vec<Value> = Vec::new();
3861 if let Some(query) = &query {
3862 conditions.push(serde_json::json!({
3863 "field": "quicksearch-titleCreatorYear",
3864 "operator": "contains",
3865 "value": query,
3866 }));
3867 }
3868 if let Some(author) = author {
3869 conditions.push(serde_json::json!({
3870 "field": "creator", "operator": "contains", "value": author,
3871 }));
3872 }
3873 if let Some(after) = after {
3874 conditions.push(serde_json::json!({
3875 "field": "date", "operator": "isAfter", "value": after,
3876 }));
3877 }
3878 if let Some(before) = before {
3879 conditions.push(serde_json::json!({
3880 "field": "date", "operator": "isBefore", "value": before,
3881 }));
3882 }
3883 if let Some(journal) = journal {
3884 conditions.push(serde_json::json!({
3885 "field": "publicationTitle", "operator": "contains", "value": journal,
3886 }));
3887 }
3888 if let Some(tag) = tag {
3889 conditions.push(serde_json::json!({
3890 "field": "tag", "operator": "is", "value": tag,
3891 }));
3892 }
3893 let value = client.call(
3894 "search.advanced",
3895 Some(serde_json::json!({
3896 "conditions": conditions,
3897 "operator": "and",
3898 "limit": limit,
3899 "offset": offset,
3900 })),
3901 )?;
3902 return Ok((normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty));
3903 }
3904
3905 let query = query.ok_or(
3906 "INVALID_ARGS: provide a search query, or use --doi/--isbn/--issn for identifier lookup"
3907 )?;
3908 let value = if let Some(col) = collection {
3909 let key = resolve_collection(client, &col)?;
3910 let response = client.call(
3911 "collections.getItems",
3912 Some(serde_json::json!({"key": key})),
3913 )?;
3914 collection_quick_search_response(&response, &query, limit)
3915 } else {
3916 filter_search_artifacts(client.call(
3917 "search.quick",
3918 Some(serde_json::json!({"query": query, "limit": limit})),
3919 )?)
3920 };
3921 Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty))
3922}
3923
3924fn run_search_management_command(
3925 command: SearchManagementCommand,
3926 client: &mut impl RpcCaller,
3927) -> Result<(Value, JsonStyle), String> {
3928 match command {
3929 SearchManagementCommand::SavedSearches { .. } => Ok((
3930 normalize_list_envelope(client.call("search.savedSearches", None)?, "items", None, 0),
3931 JsonStyle::Pretty,
3932 )),
3933 SearchManagementCommand::CreateSaved {
3934 name, condition, dry_run, ..
3935 } => {
3936 let conditions = condition
3937 .iter()
3938 .map(|raw| parse_search_condition(raw))
3939 .collect::<Result<Vec<_>, _>>()?;
3940 let params = serde_json::json!({"name": name, "conditions": conditions});
3941 if dry_run {
3942 Ok((dry_run_value("search.createSavedSearch", params), JsonStyle::PythonCompact))
3943 } else {
3944 Ok((client.call("search.createSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3945 }
3946 }
3947 SearchManagementCommand::DeleteSaved {
3948 search_key, dry_run, ..
3949 } => {
3950 let params = serde_json::json!({"key": search_key});
3951 if dry_run {
3952 Ok((dry_run_value("search.deleteSavedSearch", params), JsonStyle::PythonCompact))
3953 } else {
3954 Ok((client.call("search.deleteSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3955 }
3956 }
3957 }
3958}
3959
3960fn filter_search_artifacts(mut value: Value) -> Value {
3961 let Some(items) = value.get_mut("items").and_then(Value::as_array_mut) else {
3962 return value;
3963 };
3964 items.retain(|item| match item.get("title").and_then(Value::as_str) {
3965 Some(title) => !is_zotron_evidence_artifact(title),
3966 None => true,
3967 });
3968 let total_items = items.len() as u64;
3969 if let Some(total) = value.get_mut("total") {
3970 *total = Value::from(total_items);
3971 }
3972 value
3973}
3974
3975fn collection_quick_search_response(response: &Value, query: &str, limit: u64) -> Value {
3976 let mut matched = collection_items(response)
3977 .into_iter()
3978 .filter(|item| !item_is_evidence_artifact(item))
3979 .filter(|item| quick_item_matches(item, query))
3980 .collect::<Vec<_>>();
3981 let total = matched.len() as u64;
3982 let limit = usize::try_from(limit).unwrap_or(usize::MAX);
3983 if matched.len() > limit {
3984 matched.truncate(limit);
3985 }
3986 serde_json::json!({"items": matched, "total": total})
3987}
3988
3989fn item_is_evidence_artifact(item: &Value) -> bool {
3990 item.get("title")
3991 .and_then(Value::as_str)
3992 .is_some_and(is_zotron_evidence_artifact)
3993}
3994
3995fn quick_item_matches(item: &Value, query: &str) -> bool {
3996 let terms = query
3997 .split_whitespace()
3998 .map(|term| term.to_lowercase())
3999 .filter(|term| !term.is_empty())
4000 .collect::<Vec<_>>();
4001 if terms.is_empty() {
4002 return true;
4003 }
4004 let mut haystack = String::new();
4005 append_search_text(item, &mut haystack);
4006 let haystack = haystack.to_lowercase();
4007 terms.iter().all(|term| haystack.contains(term))
4008}
4009
4010fn append_search_text(value: &Value, out: &mut String) {
4011 match value {
4012 Value::String(text) => {
4013 out.push(' ');
4014 out.push_str(text);
4015 }
4016 Value::Number(number) => {
4017 out.push(' ');
4018 out.push_str(&number.to_string());
4019 }
4020 Value::Bool(value) => {
4021 out.push(' ');
4022 out.push_str(if *value { "true" } else { "false" });
4023 }
4024 Value::Array(items) => {
4025 for item in items {
4026 append_search_text(item, out);
4027 }
4028 }
4029 Value::Object(map) => {
4030 for item in map.values() {
4031 append_search_text(item, out);
4032 }
4033 }
4034 Value::Null => {}
4035 }
4036}
4037
4038fn parse_search_condition(raw: &str) -> Result<Value, String> {
4039 let mut parts = raw.split_whitespace();
4040 let field = parts.next();
4041 let operator = parts.next();
4042 let value = parts.collect::<Vec<_>>().join(" ");
4043 match (field, operator, value.is_empty()) {
4044 (Some(field), Some(operator), false) => Ok(serde_json::json!({
4045 "field": field,
4046 "operator": operator,
4047 "value": value,
4048 })),
4049 _ => Err(format!(
4050 "INVALID_ARGS: --condition must be 'field operator value', got: {raw:?}"
4051 )),
4052 }
4053}
4054
4055fn normalize_list_envelope(value: Value, list_key: &str, limit: Option<u64>, offset: u64) -> Value {
4056 if let Value::Array(arr) = value {
4057 let total = arr.len() as u64;
4058 let mut obj = serde_json::Map::new();
4059 obj.insert(list_key.to_string(), Value::Array(arr));
4060 obj.insert("total".to_string(), Value::from(total));
4061 if let Some(limit) = limit {
4062 obj.insert("limit".to_string(), Value::from(limit));
4063 }
4064 obj.insert("offset".to_string(), Value::from(offset));
4065 obj.insert("hasMore".to_string(), Value::Bool(false));
4066 return Value::Object(obj);
4067 }
4068
4069 let mut obj = match value {
4070 Value::Object(obj) if obj.contains_key(list_key) => obj,
4071 other => return other,
4072 };
4073
4074 let items_len = obj
4075 .get(list_key)
4076 .and_then(Value::as_array)
4077 .map_or(0, |a| a.len()) as u64;
4078 let total = obj
4079 .get("total")
4080 .and_then(Value::as_u64)
4081 .unwrap_or(items_len);
4082
4083 obj.insert("total".to_string(), Value::from(total));
4084 if let Some(limit) = limit {
4085 obj.insert("limit".to_string(), Value::from(limit));
4086 }
4087 obj.insert("offset".to_string(), Value::from(offset));
4088 obj.insert(
4089 "hasMore".to_string(),
4090 Value::Bool(offset + items_len < total),
4091 );
4092
4093 Value::Object(obj)
4094}
4095
/// Upper bound on the number of rows `paginate_rpc` accumulates; once reached,
/// the result is truncated to this length and returned.
const RPC_PAGINATION_SAFETY_CAP: usize = 10_000;
/// Object keys, in lookup order, under which `extract_page` searches for the
/// array that holds one page of results.
const RPC_PAGE_LIST_KEYS: [&str; 4] = ["items", "tags", "results", "data"];
4098
4099fn paginate_rpc(
4100 client: &mut impl RpcCaller,
4101 method: &str,
4102 params: Value,
4103 page_size: usize,
4104) -> Result<Value, String> {
4105 let base = params
4106 .as_object()
4107 .ok_or_else(|| "params must be a JSON object".to_string())?;
4108 let mut out = Vec::new();
4109 let mut prev_page: Option<Vec<Value>> = None;
4110 let mut offset = 0usize;
4111
4112 loop {
4113 let mut page_params = base.clone();
4114 page_params.insert("offset".to_string(), Value::Number(offset.into()));
4115 page_params.insert("limit".to_string(), Value::Number(page_size.into()));
4116 let response = client.call(method, Some(Value::Object(page_params)))?;
4117
4118 let page = match extract_page(&response) {
4119 Some(page) => page,
4120 None if out.is_empty() => return Ok(response),
4121 None if response.is_object() => {
4122 return Err(format!(
4123 "paginate: {method:?} returned a non-paginated dict after {} accumulated rows; aborting",
4124 out.len()
4125 ));
4126 }
4127 None => {
4128 return Err(format!(
4129 "paginate: {method:?} returned non-list/non-dict shape after {} accumulated rows; aborting",
4130 out.len()
4131 ));
4132 }
4133 };
4134
4135 if prev_page.as_ref() == Some(&page) {
4136 return Err(format!(
4137 "paginate: {method:?} returned identical pages — method likely ignores offset; aborting after {} rows",
4138 out.len()
4139 ));
4140 }
4141
4142 let page_len = page.len();
4143 out.extend(page.clone());
4144 if page_len < page_size {
4145 return Ok(Value::Array(out));
4146 }
4147 if out.len() >= RPC_PAGINATION_SAFETY_CAP {
4148 out.truncate(RPC_PAGINATION_SAFETY_CAP);
4149 return Ok(Value::Array(out));
4150 }
4151 prev_page = Some(page);
4152 offset += page_size;
4153 }
4154}
4155
4156fn extract_page(response: &Value) -> Option<Vec<Value>> {
4157 if let Some(page) = response.as_array() {
4158 return Some(page.clone());
4159 }
4160 let object = response.as_object()?;
4161 for key in RPC_PAGE_LIST_KEYS {
4162 if let Some(page) = object.get(key).and_then(Value::as_array) {
4163 return Some(page.clone());
4164 }
4165 }
4166 None
4167}
4168
4169fn run_find_pdfs_command(
4170 client: &mut impl RpcCaller,
4171 collection: String,
4172 limit: usize,
4173) -> Result<(Value, JsonStyle), String> {
4174 let collection_key = resolve_collection(client, &collection)?;
4175 let response = client.call(
4176 "collections.getItems",
4177 Some(serde_json::json!({"key": collection_key})),
4178 )?;
4179 let items = collection_items(&response);
4180
4181 let mut missing = Vec::new();
4182 for item in &items {
4183 let Some(item_key) = item.get("key").and_then(Value::as_str) else {
4184 continue;
4185 };
4186 let attachments = client.call(
4187 "attachments.list",
4188 Some(serde_json::json!({"parentKey": item_key})),
4189 )?;
4190 if !has_pdf_attachment(&attachments) {
4191 missing.push(item.clone());
4192 }
4193 if limit > 0 && missing.len() >= limit {
4194 break;
4195 }
4196 }
4197
4198 let mut results = Vec::new();
4199 for item in &missing {
4200 let item_key = item
4201 .get("key")
4202 .and_then(Value::as_str)
4203 .ok_or_else(|| "missing item lacks key".to_string())?;
4204 let response = client.call(
4205 "attachments.findPDF",
4206 Some(serde_json::json!({"parentKey": item_key})),
4207 )?;
4208 let attachment = response.get("attachment").filter(|value| !value.is_null());
4209 results.push(serde_json::json!({
4210 "item_key": item_key,
4211 "title": item.get("title").cloned().unwrap_or(Value::Null),
4212 "found": attachment.is_some(),
4213 "attachment_key": attachment
4214 .and_then(|attachment| attachment.get("key"))
4215 .cloned()
4216 .unwrap_or(Value::Null),
4217 }));
4218 }
4219
4220 Ok((
4221 serde_json::json!({
4222 "scanned": items.len(),
4223 "attempted": missing.len(),
4224 "results": results,
4225 }),
4226 JsonStyle::Pretty,
4227 ))
4228}
4229
4230fn collection_items(response: &Value) -> Vec<Value> {
4231 if let Some(items) = response.get("items").and_then(Value::as_array) {
4232 return items.clone();
4233 }
4234 response.as_array().cloned().unwrap_or_default()
4235}
4236
4237fn has_pdf_attachment(attachments: &Value) -> bool {
4238 attachments
4239 .as_array()
4240 .is_some_and(|attachments| attachments.iter().any(is_pdf_attachment))
4241}
4242
4243fn is_pdf_attachment(attachment: &Value) -> bool {
4244 let content_type = attachment
4245 .get("contentType")
4246 .and_then(Value::as_str)
4247 .unwrap_or_default()
4248 .to_lowercase();
4249 let path = attachment
4250 .get("path")
4251 .and_then(Value::as_str)
4252 .unwrap_or_default()
4253 .to_lowercase();
4254 matches!(
4255 content_type.as_str(),
4256 "application/pdf" | "application/x-pdf"
4257 ) || path.ends_with(".pdf")
4258}
4259
4260fn call_json(
4261 client: &mut impl RpcCaller,
4262 method: &str,
4263 params: Option<Value>,
4264) -> Result<Value, String> {
4265 client.call(method, params)
4266}
4267
4268fn run_system_command(
4269 command: SystemCommand,
4270 client: &mut impl RpcCaller,
4271) -> Result<(Value, JsonStyle), String> {
4272 let value = match command {
4273 SystemCommand::Version { .. } => client.call("system.version", None)?,
4274 SystemCommand::Libraries { .. } => client.call("system.libraries", None)?,
4275 SystemCommand::LibraryStats { library, .. } => {
4276 let params = library.map(|id| serde_json::json!({"id": id}));
4277 client.call("system.libraryStats", params)?
4278 }
4279 SystemCommand::Schema { item_type, .. } => {
4280 if let Some(item_type) = item_type {
4281 let fields = client.call("system.itemFields", Some(serde_json::json!({"itemType": item_type})))?;
4282 let creators = client.call("system.creatorTypes", Some(serde_json::json!({"itemType": item_type})))?;
4283 let field_names: Vec<Value> = fields.as_array().unwrap_or(&vec![])
4284 .iter()
4285 .filter_map(|f| f.get("field").cloned())
4286 .collect();
4287 let creator_names: Vec<Value> = creators.as_array().unwrap_or(&vec![])
4288 .iter()
4289 .filter_map(|c| c.get("creatorType").cloned())
4290 .collect();
4291 serde_json::json!({
4292 "itemType": item_type,
4293 "fields": field_names,
4294 "creatorTypes": creator_names,
4295 })
4296 } else {
4297 let types = client.call("system.itemTypes", None)?;
4298 let type_names: Vec<Value> = types.as_array().unwrap_or(&vec![])
4299 .iter()
4300 .filter_map(|t| t.get("itemType").cloned())
4301 .collect();
4302 Value::Array(type_names)
4303 }
4304 }
4305 SystemCommand::CurrentCollection { .. } => client.call("system.currentCollection", None)?,
4306 SystemCommand::Methods { method, .. } => {
4307 if let Some(method) = method {
4308 client.call("system.describe", Some(serde_json::json!({"method": method})))?
4309 } else {
4310 client.call("system.listMethods", None)?
4311 }
4312 }
4313 };
4314 Ok((value, JsonStyle::Pretty))
4315}
4316
4317fn run_items_command(
4318 command: ItemsCommand,
4319 client: &mut impl RpcCaller,
4320) -> Result<(Value, JsonStyle), String> {
4321 let (value, style) = match command {
4322 ItemsCommand::Add {
4323 doi,
4324 isbn,
4325 from_url,
4326 file,
4327 item_type,
4328 fields,
4329 collection,
4330 dry_run,
4331 ..
4332 } => {
4333 if let Some(doi) = doi {
4334 run_add_identifier_command(client, "items.addByDOI", "doi", doi, collection, dry_run)?
4335 } else if let Some(isbn) = isbn {
4336 run_add_identifier_command(client, "items.addByISBN", "isbn", isbn, collection, dry_run)?
4337 } else if let Some(from_url) = from_url {
4338 run_add_identifier_command(client, "items.addByURL", "url", from_url, collection, dry_run)?
4339 } else if let Some(file) = file {
4340 let mut params = serde_json::json!({"path": zotero_path(&file)});
4341 maybe_insert_collection(client, &mut params, collection)?;
4342 run_mutation_command(client, "items.addFromFile", params, dry_run)?
4343 } else if let Some(item_type) = item_type {
4344 let parsed_fields = parse_field_options(&fields)?;
4345 let mut params = serde_json::json!({"itemType": item_type});
4346 if !parsed_fields.is_empty() {
4347 if let Some(map) = params.as_object_mut() {
4348 map.insert("fields".to_string(), Value::Object(parsed_fields));
4349 }
4350 }
4351 run_mutation_command(client, "items.create", params, dry_run)?
4352 } else {
4353 return Err("INVALID_ARGS: provide one of --doi, --isbn, --from-url, --file, or --type".into());
4354 }
4355 }
4356 ItemsCommand::Update {
4357 key,
4358 fields,
4359 dry_run,
4360 ..
4361 } => {
4362 let parsed_fields = parse_field_options(&fields)?;
4363 let mut params = serde_json::json!({"key": key});
4364 if !parsed_fields.is_empty() {
4365 if let Some(map) = params.as_object_mut() {
4366 map.insert("fields".to_string(), Value::Object(parsed_fields));
4367 }
4368 }
4369 run_mutation_command(client, "items.update", params, dry_run)?
4370 }
4371 ItemsCommand::Delete { key, dry_run, .. } => run_mutation_command(
4372 client,
4373 "items.delete",
4374 serde_json::json!({"key": key}),
4375 dry_run,
4376 )?,
4377 ItemsCommand::Trash {
4378 items, dry_run, ..
4379 } => {
4380 if items.len() == 1 {
4381 run_mutation_command(
4382 client,
4383 "items.trash",
4384 serde_json::json!({"key": items[0]}),
4385 dry_run,
4386 )?
4387 } else {
4388 run_mutation_command(
4389 client,
4390 "items.batchTrash",
4391 serde_json::json!({"keys": items}),
4392 dry_run,
4393 )?
4394 }
4395 }
4396 ItemsCommand::Restore { item, dry_run, .. } => run_mutation_command(
4397 client,
4398 "items.restore",
4399 serde_json::json!({"key": item}),
4400 dry_run,
4401 )?,
4402 ItemsCommand::MergeDuplicates { keys, dry_run, .. } => {
4403 if keys.len() < 2 {
4404 return Err("INVALID_ARGS: need at least 2 keys to merge".to_string());
4405 }
4406 run_mutation_command(
4407 client,
4408 "items.mergeDuplicates",
4409 serde_json::json!({"keys": keys}),
4410 dry_run,
4411 )?
4412 }
4413 ItemsCommand::AddRelated {
4414 key,
4415 target,
4416 dry_run,
4417 ..
4418 } => run_mutation_command(
4419 client,
4420 "items.addRelated",
4421 serde_json::json!({"key": key, "targetKey": target}),
4422 dry_run,
4423 )?,
4424 ItemsCommand::RemoveRelated {
4425 key,
4426 target,
4427 dry_run,
4428 ..
4429 } => run_mutation_command(
4430 client,
4431 "items.removeRelated",
4432 serde_json::json!({"key": key, "targetKey": target}),
4433 dry_run,
4434 )?,
4435 ItemsCommand::Get { item, .. } => (
4436 client.call("items.get", Some(serde_json::json!({"key": item})))?,
4437 JsonStyle::Pretty,
4438 ),
4439 ItemsCommand::List {
4440 limit,
4441 offset,
4442 sort,
4443 direction,
4444 trash,
4445 ..
4446 } => {
4447 if trash {
4448 let value = client.call(
4449 "items.getTrash",
4450 Some(serde_json::json!({"limit": limit, "offset": offset})),
4451 )?;
4452 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4453 } else {
4454 let mut params = serde_json::json!({
4455 "limit": limit,
4456 "offset": offset,
4457 "direction": direction,
4458 });
4459 if let (Some(sort), Some(map)) = (sort, params.as_object_mut()) {
4460 map.insert("sort".to_string(), Value::String(sort));
4461 }
4462 let value = client.call("items.list", Some(params))?;
4463 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4464 }
4465 }
4466 ItemsCommand::FindDuplicates { .. } => (
4467 client.call("items.findDuplicates", None)?,
4468 JsonStyle::Pretty,
4469 ),
4470 ItemsCommand::Recent {
4471 limit,
4472 offset,
4473 recent_type,
4474 ..
4475 } => {
4476 if recent_type != "added" && recent_type != "modified" {
4477 return Err(format!(
4478 "--type must be added or modified, got {recent_type:?}"
4479 ));
4480 }
4481 let value = client.call(
4482 "items.getRecent",
4483 Some(
4484 serde_json::json!({"limit": limit, "offset": offset, "type": recent_type}),
4485 ),
4486 )?;
4487 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4488 }
4489 ItemsCommand::Fulltext { key, .. } => (
4490 client.call("items.getFullText", Some(serde_json::json!({"key": key})))?,
4491 JsonStyle::Pretty,
4492 ),
4493 ItemsCommand::Related { key, .. } => (
4494 normalize_list_envelope(
4495 client.call("items.getRelated", Some(serde_json::json!({"key": key})))?,
4496 "items",
4497 None,
4498 0,
4499 ),
4500 JsonStyle::Pretty,
4501 ),
4502 ItemsCommand::CitationKey { key, .. } => (
4503 client.call("items.citationKey", Some(serde_json::json!({"key": key})))?,
4504 JsonStyle::Pretty,
4505 ),
4506 ItemsCommand::Path { key, .. } => (
4507 localize_attachment_path_response(
4508 client.call("attachments.getPath", Some(serde_json::json!({"key": key})))?,
4509 ),
4510 JsonStyle::Pretty,
4511 ),
4512 ItemsCommand::Attachments { key, offset, .. } => {
4513 let value = client.call(
4514 "attachments.list",
4515 Some(serde_json::json!({"parentKey": key})),
4516 )?;
4517 let total = value
4518 .get("items")
4519 .and_then(Value::as_array)
4520 .map_or(0, |a| a.len()) as u64;
4521 (normalize_list_envelope(value, "items", Some(total), offset), JsonStyle::Pretty)
4522 }
4523 ItemsCommand::FindPdfs { collection, limit, .. } => {
4524 run_find_pdfs_command(client, collection, limit)?
4525 }
4526 };
4527 Ok((value, style))
4528}
4529
4530fn run_add_identifier_command(
4531 client: &mut impl RpcCaller,
4532 method: &str,
4533 param_name: &str,
4534 param_value: String,
4535 collection: Option<String>,
4536 dry_run: bool,
4537) -> Result<(Value, JsonStyle), String> {
4538 let mut params = Value::Object(serde_json::Map::from_iter([(
4539 param_name.to_string(),
4540 Value::String(param_value),
4541 )]));
4542 maybe_insert_collection(client, &mut params, collection)?;
4543 run_mutation_command(client, method, params, dry_run)
4544}
4545
4546fn run_mutation_command(
4547 client: &mut impl RpcCaller,
4548 method: &str,
4549 params: Value,
4550 dry_run: bool,
4551) -> Result<(Value, JsonStyle), String> {
4552 let value = if dry_run {
4553 serde_json::json!({
4554 "ok": true,
4555 "dryRun": true,
4556 "wouldCall": method,
4557 "wouldCallParams": params,
4558 })
4559 } else {
4560 client.call(method, Some(params))?
4561 };
4562 Ok((value, JsonStyle::PythonCompact))
4563}
4564
4565fn parse_field_options(fields: &[String]) -> Result<serde_json::Map<String, Value>, String> {
4566 let mut parsed = serde_json::Map::new();
4567 for field in fields {
4568 let (key, value) = field
4569 .split_once('=')
4570 .ok_or_else(|| format!("INVALID_ARGS: --field must be key=value, got: {field:?}"))?;
4571 parsed.insert(key.to_string(), Value::String(value.to_string()));
4572 }
4573 Ok(parsed)
4574}
4575
4576fn maybe_insert_collection(
4577 client: &mut impl RpcCaller,
4578 params: &mut Value,
4579 collection: Option<String>,
4580) -> Result<(), String> {
4581 let Some(collection) = collection else {
4582 return Ok(());
4583 };
4584 let collection = resolve_collection(client, &collection)?;
4585 let include = match &collection {
4586 Value::Null => false,
4587 Value::Number(number) => number.as_i64() != Some(0),
4588 _ => true,
4589 };
4590 if include {
4591 params
4592 .as_object_mut()
4593 .expect("mutation params are always objects")
4594 .insert("collection".to_string(), collection);
4595 }
4596 Ok(())
4597}
4598
4599fn run_settings_command(
4600 command: SettingsCommand,
4601 client: &mut impl RpcCaller,
4602) -> Result<(Value, JsonStyle), String> {
4603 let (value, style) = match command {
4604 SettingsCommand::Get { key, .. } => (
4605 client.call("settings.get", Some(serde_json::json!({"key": key})))?,
4606 JsonStyle::Pretty,
4607 ),
4608 SettingsCommand::List { .. } => (client.call("settings.getAll", None)?, JsonStyle::Pretty),
4609 SettingsCommand::Set {
4610 pairs,
4611 file,
4612 dry_run,
4613 ..
4614 } => {
4615 if let Some(file) = file {
4616 let raw = fs::read_to_string(&file)
4618 .map_err(|err| format!("INVALID_JSON: Could not read JSON: {err}"))?;
4619 let settings: Value = serde_json::from_str(&raw)
4620 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
4621 if dry_run {
4622 (
4623 dry_run_value("settings.setAll", settings),
4624 JsonStyle::PythonCompact,
4625 )
4626 } else {
4627 (
4628 client.call("settings.setAll", Some(settings))?,
4629 JsonStyle::PythonCompact,
4630 )
4631 }
4632 } else if pairs.len() == 2 {
4633 let key = &pairs[0];
4635 let value = &pairs[1];
4636 let parsed_value = serde_json::from_str::<Value>(value)
4637 .unwrap_or(Value::String(value.clone()));
4638 let params = serde_json::json!({"key": key, "value": parsed_value});
4639 if dry_run {
4640 (
4641 dry_run_value("settings.set", params),
4642 JsonStyle::PythonCompact,
4643 )
4644 } else {
4645 (
4646 client.call("settings.set", Some(params))?,
4647 JsonStyle::PythonCompact,
4648 )
4649 }
4650 } else if pairs.len() > 2 && pairs.len() % 2 == 0 {
4651 let mut map = serde_json::Map::new();
4653 for chunk in pairs.chunks(2) {
4654 let parsed = serde_json::from_str::<Value>(&chunk[1])
4655 .unwrap_or(Value::String(chunk[1].clone()));
4656 map.insert(chunk[0].clone(), parsed);
4657 }
4658 let settings = Value::Object(map);
4659 if dry_run {
4660 (
4661 dry_run_value("settings.setAll", settings),
4662 JsonStyle::PythonCompact,
4663 )
4664 } else {
4665 (
4666 client.call("settings.setAll", Some(settings))?,
4667 JsonStyle::PythonCompact,
4668 )
4669 }
4670 } else {
4671 return Err(
4672 "INVALID_ARGS: provide key value pairs (even number of args) or --file".into(),
4673 );
4674 }
4675 }
4676 };
4677 Ok((value, style))
4678}
4679
4680fn run_tags_command(
4681 command: TagsCommand,
4682 client: &mut impl RpcCaller,
4683) -> Result<(Value, JsonStyle), String> {
4684 let (value, style) = match command {
4685 TagsCommand::List { limit, .. } => {
4686 let value = client.call("tags.list", Some(serde_json::json!({"limit": limit})))?;
4687 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
4688 }
4689 TagsCommand::Rename {
4690 old, new, dry_run, ..
4691 } => run_tag_mutation(
4692 client,
4693 "tags.rename",
4694 serde_json::json!({"oldName": old, "newName": new}),
4695 dry_run,
4696 )?,
4697 TagsCommand::Delete { tag, dry_run, .. } => run_tag_mutation(
4698 client,
4699 "tags.delete",
4700 serde_json::json!({"tag": tag}),
4701 dry_run,
4702 )?,
4703 TagsCommand::Add {
4704 keys, tags, dry_run, ..
4705 } => {
4706 if keys.len() == 1 {
4707 run_tag_mutation(
4708 client,
4709 "tags.add",
4710 serde_json::json!({"key": keys[0], "tags": tags}),
4711 dry_run,
4712 )?
4713 } else {
4714 run_tag_mutation(
4715 client,
4716 "tags.batchUpdate",
4717 serde_json::json!({"keys": keys, "add": tags}),
4718 dry_run,
4719 )?
4720 }
4721 }
4722 TagsCommand::Remove {
4723 keys, tags, dry_run, ..
4724 } => {
4725 if keys.len() == 1 {
4726 run_tag_mutation(
4727 client,
4728 "tags.remove",
4729 serde_json::json!({"key": keys[0], "tags": tags}),
4730 dry_run,
4731 )?
4732 } else {
4733 run_tag_mutation(
4734 client,
4735 "tags.batchUpdate",
4736 serde_json::json!({"keys": keys, "remove": tags}),
4737 dry_run,
4738 )?
4739 }
4740 }
4741 };
4742 Ok((value, style))
4743}
4744
4745fn run_tag_mutation(
4746 client: &mut impl RpcCaller,
4747 method: &str,
4748 params: Value,
4749 dry_run: bool,
4750) -> Result<(Value, JsonStyle), String> {
4751 if dry_run {
4752 Ok((dry_run_value(method, params), JsonStyle::PythonCompact))
4753 } else {
4754 Ok((client.call(method, Some(params))?, JsonStyle::PythonCompact))
4755 }
4756}
4757
4758fn dry_run_value(method: &str, params: Value) -> Value {
4759 serde_json::json!({
4760 "ok": true,
4761 "dryRun": true,
4762 "wouldCall": method,
4763 "wouldCallParams": params,
4764 })
4765}
4766
4767fn run_annotations_command(
4768 command: AnnotationsCommand,
4769 client: &mut impl RpcCaller,
4770) -> Result<(Value, JsonStyle), String> {
4771 let (value, style) = match command {
4772 AnnotationsCommand::List { parent, attachment, context, .. } => {
4773 let mut params = serde_json::json!({"parentKey": parent});
4774 if let Some(att) = attachment {
4775 params["attachmentKey"] = Value::String(att);
4776 }
4777 if let Some(ctx) = context {
4778 params["context"] = Value::Number(ctx.into());
4779 }
4780 let value = client.call("annotations.list", Some(params))?;
4781 let total = value
4782 .get("items")
4783 .and_then(Value::as_array)
4784 .map_or(0, |a| a.len()) as u64;
4785 (normalize_list_envelope(value, "items", Some(total), 0), JsonStyle::Pretty)
4786 }
4787 AnnotationsCommand::Create {
4788 parent,
4789 attachment,
4790 annotation_type,
4791 position,
4792 quote,
4793 page,
4794 sort_index,
4795 text,
4796 comment,
4797 color,
4798 dry_run,
4799 ..
4800 } => {
4801 let annotation_type = annotation_type.unwrap_or_else(|| "highlight".to_string());
4802 if !matches!(
4803 annotation_type.as_str(),
4804 "highlight" | "note" | "underline" | "image" | "ink"
4805 ) {
4806 return Err(format!(
4807 "INVALID_ARGS: --type must be highlight|note|underline|image|ink, got {annotation_type:?}"
4808 ));
4809 }
4810 let mut params = serde_json::Map::new();
4811 params.insert("parentKey".to_string(), Value::String(parent));
4812 if let Some(att) = attachment {
4813 params.insert("attachmentKey".to_string(), Value::String(att));
4814 }
4815 params.insert("type".to_string(), Value::String(annotation_type.clone()));
4816 params.insert("color".to_string(), Value::String(color));
4817
4818 if let Some(ref quote_text) = quote {
4819 if !matches!(annotation_type.as_str(), "highlight" | "underline") {
4820 return Err(format!(
4821 "INVALID_ARGS: --quote is only valid for highlight|underline, got {annotation_type:?}"
4822 ));
4823 }
4824 params.insert("quote".to_string(), Value::String(quote_text.clone()));
4825 if let Some(page_idx) = page {
4826 params.insert(
4827 "pageIndex".to_string(),
4828 Value::Number(page_idx.into()),
4829 );
4830 }
4831 if let Some(raw) = position {
4833 let pos = serde_json::from_str::<Value>(&raw)
4834 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))?;
4835 validate_annotation_position(annotation_type.as_str(), &pos)?;
4836 params.insert("position".to_string(), pos);
4837 }
4838 } else {
4839 let position = position
4840 .ok_or_else(|| "INVALID_ARGS: --position JSON is required (or use --quote)".to_string())
4841 .and_then(|raw| {
4842 serde_json::from_str::<Value>(&raw)
4843 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))
4844 })?;
4845 validate_annotation_position(annotation_type.as_str(), &position)?;
4846 params.insert("position".to_string(), position);
4847 }
4848
4849 if let Some(sort_index) = sort_index {
4850 params.insert(
4851 "sortIndex".to_string(),
4852 parse_annotation_sort_index(sort_index)?,
4853 );
4854 }
4855 if let Some(text) = text {
4856 params.insert("text".to_string(), Value::String(text));
4857 }
4858 if let Some(comment) = comment {
4859 params.insert("comment".to_string(), Value::String(comment));
4860 }
4861 run_mutating_command(client, "annotations.create", Value::Object(params), dry_run)?
4862 }
4863 AnnotationsCommand::CreateBatch {
4864 parent,
4865 attachment,
4866 file,
4867 dry_run,
4868 ..
4869 } => {
4870 let input = if let Some(ref path) = file {
4871 std::fs::read_to_string(path)
4872 .map_err(|e| format!("INVALID_ARGS: cannot read file {path}: {e}"))?
4873 } else {
4874 let mut buf = String::new();
4875 std::io::Read::read_to_string(&mut std::io::stdin(), &mut buf)
4876 .map_err(|e| format!("INVALID_ARGS: cannot read stdin: {e}"))?;
4877 buf
4878 };
4879 let annotations: Value = serde_json::from_str(&input)
4880 .map_err(|e| format!("INVALID_JSON: {e}"))?;
4881 if !annotations.is_array() {
4882 return Err("INVALID_ARGS: input must be a JSON array".to_string());
4883 }
4884 let mut params = serde_json::json!({
4885 "parentKey": parent,
4886 "annotations": annotations,
4887 });
4888 if let Some(att) = attachment {
4889 params["attachmentKey"] = Value::String(att);
4890 }
4891 run_mutating_command(client, "annotations.createBatch", params, dry_run)?
4892 }
4893 AnnotationsCommand::Locate {
4894 parent,
4895 attachment,
4896 quote,
4897 page,
4898 ..
4899 } => {
4900 let mut params = serde_json::json!({
4901 "parentKey": parent,
4902 "quote": quote,
4903 });
4904 if let Some(att) = attachment {
4905 params["attachmentKey"] = Value::String(att);
4906 }
4907 if let Some(page_idx) = page {
4908 params["pageIndex"] = Value::Number(page_idx.into());
4909 }
4910 let value = client.call("annotations.locate", Some(params))?;
4911 (value, JsonStyle::Pretty)
4912 }
4913 AnnotationsCommand::Delete {
4914 annotation_key,
4915 dry_run,
4916 ..
4917 } => run_mutating_command(
4918 client,
4919 "annotations.delete",
4920 serde_json::json!({"key": annotation_key}),
4921 dry_run,
4922 )?,
4923 };
4924 Ok((value, style))
4925}
4926
4927fn validate_annotation_position(annotation_type: &str, position: &Value) -> Result<(), String> {
4928 position
4929 .get("pageIndex")
4930 .and_then(Value::as_i64)
4931 .filter(|value| *value >= 0)
4932 .ok_or_else(|| {
4933 "INVALID_ARGS: --position must include a non-negative integer pageIndex".to_string()
4934 })?;
4935
4936 if annotation_type == "ink" {
4937 let has_paths = position
4938 .get("paths")
4939 .and_then(Value::as_array)
4940 .is_some_and(|paths| !paths.is_empty());
4941 if !has_paths {
4942 return Err("INVALID_ARGS: ink --position must include non-empty paths".to_string());
4943 }
4944 return Ok(());
4945 }
4946
4947 let valid_rects = position
4948 .get("rects")
4949 .and_then(Value::as_array)
4950 .is_some_and(|rects| !rects.is_empty() && rects.iter().all(is_annotation_rect));
4951 if !valid_rects {
4952 return Err(
4953 "INVALID_ARGS: --position must include non-empty rects of [x1, y1, x2, y2]".to_string(),
4954 );
4955 }
4956 Ok(())
4957}
4958
4959fn is_annotation_rect(value: &Value) -> bool {
4960 value.as_array().is_some_and(|coords| {
4961 coords.len() == 4
4962 && coords
4963 .iter()
4964 .all(|coord| coord.as_f64().is_some_and(f64::is_finite))
4965 })
4966}
4967
4968fn parse_annotation_sort_index(raw: String) -> Result<Value, String> {
4969 let parsed = serde_json::from_str::<Value>(&raw).unwrap_or_else(|_| Value::String(raw));
4970 let valid = match &parsed {
4971 Value::Number(number) => number.as_f64().is_some_and(f64::is_finite),
4972 Value::String(value) => {
4973 is_zotero_pdf_sort_index(value.trim())
4974 || (!value.trim().is_empty()
4975 && value.trim().parse::<f64>().is_ok_and(f64::is_finite))
4976 }
4977 _ => false,
4978 };
4979 if valid {
4980 Ok(parsed)
4981 } else {
4982 Err(format!(
4983 "INVALID_ARGS: --sort-index must be a finite number or numeric string, got {parsed}"
4984 ))
4985 }
4986}
4987
/// Returns `true` when `value` has the form `DDDDD|DDDDDD|DDDDD`.
///
/// That is: exactly three `|`-separated segments of 5, 6 and 5 ASCII digits.
fn is_zotero_pdf_sort_index(value: &str) -> bool {
    const SEGMENT_WIDTHS: [usize; 3] = [5, 6, 5];

    let segments: Vec<&str> = value.split('|').collect();
    segments.len() == SEGMENT_WIDTHS.len()
        && segments
            .iter()
            .zip(SEGMENT_WIDTHS.iter())
            .all(|(segment, width)| {
                // Byte length equals digit count because only ASCII digits are accepted.
                segment.len() == *width && segment.bytes().all(|byte| byte.is_ascii_digit())
            })
}
5001
5002
5003fn localize_attachment_path_response(mut value: Value) -> Value {
5004 if let Some(path) = value.get("path").and_then(Value::as_str) {
5005 let local = local_path_from_zotero_path(path);
5006 if let Some(map) = value.as_object_mut() {
5007 map.insert("path".to_string(), Value::String(local));
5008 }
5009 }
5010 value
5011}
5012
5013fn run_notes_command(
5014 command: NotesCommand,
5015 client: &mut impl RpcCaller,
5016) -> Result<(Value, JsonStyle), String> {
5017 let (value, style) = match command {
5018 NotesCommand::List {
5019 parent,
5020 limit,
5021 offset,
5022 ..
5023 } => {
5024 let value = client.call(
5025 "notes.list",
5026 Some(serde_json::json!({"parentKey": parent})),
5027 )?;
5028 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
5029 }
5030 NotesCommand::Get { note_key, .. } => {
5031 let value = client.call("notes.get", Some(serde_json::json!({"key": note_key})))?;
5032 (value, JsonStyle::Pretty)
5033 }
5034 NotesCommand::Create {
5035 parent,
5036 content,
5037 tags,
5038 dry_run,
5039 ..
5040 } => {
5041 let mut params = serde_json::Map::new();
5042 params.insert("parentKey".to_string(), Value::String(parent));
5043 params.insert("content".to_string(), Value::String(content));
5044 if !tags.is_empty() {
5045 params.insert(
5046 "tags".to_string(),
5047 Value::Array(tags.into_iter().map(Value::String).collect()),
5048 );
5049 }
5050 run_mutating_command(client, "notes.create", Value::Object(params), dry_run)?
5051 }
5052 NotesCommand::Update {
5053 note_key,
5054 content,
5055 dry_run,
5056 ..
5057 } => run_mutating_command(
5058 client,
5059 "notes.update",
5060 serde_json::json!({"key": note_key, "content": content}),
5061 dry_run,
5062 )?,
5063 NotesCommand::Delete {
5064 note_key, dry_run, ..
5065 } => {
5066 run_mutating_command(
5068 client,
5069 "items.delete",
5070 serde_json::json!({"key": note_key}),
5071 dry_run,
5072 )?
5073 }
5074 NotesCommand::Search { query, limit, .. } => {
5075 let value = client.call(
5076 "notes.search",
5077 Some(serde_json::json!({"query": query, "limit": limit})),
5078 )?;
5079 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
5080 }
5081 };
5082 Ok((value, style))
5083}
5084
5085fn run_mutating_command(
5086 client: &mut impl RpcCaller,
5087 method: &str,
5088 params: Value,
5089 dry_run: bool,
5090) -> Result<(Value, JsonStyle), String> {
5091 if dry_run {
5092 Ok((
5093 serde_json::json!({
5094 "ok": true,
5095 "dryRun": true,
5096 "wouldCall": method,
5097 "wouldCallParams": params,
5098 }),
5099 JsonStyle::PythonCompact,
5100 ))
5101 } else {
5102 client
5103 .call(method, Some(params))
5104 .map(|value| (value, JsonStyle::PythonCompact))
5105 }
5106}
5107
5108fn run_collections_command(
5109 command: CollectionsCommand,
5110 client: &mut impl RpcCaller,
5111) -> Result<(Value, JsonStyle), String> {
5112 let value = match command {
5113 CollectionsCommand::List { .. } => normalize_list_envelope(
5114 client.call("collections.list", None)?,
5115 "items",
5116 None,
5117 0,
5118 ),
5119 CollectionsCommand::Tree { .. } => client.call("collections.tree", None)?,
5120 CollectionsCommand::Get { name_or_id, .. } => {
5121 let key = resolve_collection(client, &name_or_id)?;
5122 client.call("collections.get", Some(serde_json::json!({"key": key})))?
5123 }
5124 CollectionsCommand::GetItems {
5125 name_or_id,
5126 limit,
5127 offset,
5128 ..
5129 } => {
5130 let key = resolve_collection(client, &name_or_id)?;
5131 let mut params = serde_json::json!({"key": key});
5132 if let Some(map) = params.as_object_mut() {
5133 if let Some(limit) = limit {
5134 map.insert("limit".to_string(), Value::Number(limit.into()));
5135 }
5136 if offset > 0 {
5137 map.insert("offset".to_string(), Value::Number(offset.into()));
5138 }
5139 }
5140 normalize_list_envelope(
5141 client.call("collections.getItems", Some(params))?,
5142 "items",
5143 limit,
5144 offset,
5145 )
5146 }
5147 CollectionsCommand::Stats { name_or_id, .. } => {
5148 let key = resolve_collection(client, &name_or_id)?;
5149 client.call("collections.stats", Some(serde_json::json!({"key": key})))?
5150 }
5151 CollectionsCommand::Rename {
5152 old_name,
5153 new_name,
5154 dry_run,
5155 ..
5156 } => {
5157 let key = resolve_mutable_collection(client, &old_name, "rename")?;
5158 let params = serde_json::json!({"key": key, "name": new_name});
5159 if dry_run {
5160 return Ok((
5161 dry_run_value("collections.rename", params),
5162 JsonStyle::PythonCompact,
5163 ));
5164 }
5165 return Ok((
5166 client.call("collections.rename", Some(params))?,
5167 JsonStyle::PythonCompact,
5168 ));
5169 }
5170 CollectionsCommand::Create {
5171 name,
5172 parent,
5173 dry_run,
5174 ..
5175 } => {
5176 let mut params = serde_json::json!({"name": name});
5177 if let Some(parent) = parent {
5178 let parent_key = resolve_mutable_collection(client, &parent, "use as parent")?;
5179 if let Some(map) = params.as_object_mut() {
5180 map.insert("parentKey".to_string(), parent_key);
5181 }
5182 }
5183 if dry_run {
5184 return Ok((
5185 dry_run_value("collections.create", params),
5186 JsonStyle::PythonCompact,
5187 ));
5188 }
5189 return Ok((
5190 client.call("collections.create", Some(params))?,
5191 JsonStyle::PythonCompact,
5192 ));
5193 }
5194 CollectionsCommand::Delete {
5195 name_or_id,
5196 dry_run,
5197 ..
5198 } => {
5199 let key = resolve_mutable_collection(client, &name_or_id, "delete")?;
5200 let params = serde_json::json!({"key": key});
5201 if dry_run {
5202 return Ok((
5203 dry_run_value("collections.delete", params),
5204 JsonStyle::PythonCompact,
5205 ));
5206 }
5207 return Ok((
5208 client.call("collections.delete", Some(params))?,
5209 JsonStyle::PythonCompact,
5210 ));
5211 }
5212 CollectionsCommand::AddItems {
5213 collection,
5214 item_keys,
5215 dry_run,
5216 ..
5217 } => {
5218 let key = resolve_mutable_collection(client, &collection, "add to")?;
5219 let params = serde_json::json!({"key": key, "keys": item_keys});
5220 if dry_run {
5221 return Ok((
5222 dry_run_value("collections.addItems", params),
5223 JsonStyle::PythonCompact,
5224 ));
5225 }
5226 return Ok((
5227 client.call("collections.addItems", Some(params))?,
5228 JsonStyle::PythonCompact,
5229 ));
5230 }
5231 CollectionsCommand::RemoveItems {
5232 collection,
5233 item_keys,
5234 dry_run,
5235 ..
5236 } => {
5237 let key = resolve_mutable_collection(client, &collection, "operate on")?;
5238 let params = serde_json::json!({"key": key, "keys": item_keys});
5239 if dry_run {
5240 return Ok((
5241 dry_run_value("collections.removeItems", params),
5242 JsonStyle::PythonCompact,
5243 ));
5244 }
5245 return Ok((
5246 client.call("collections.removeItems", Some(params))?,
5247 JsonStyle::PythonCompact,
5248 ));
5249 }
5250 };
5251 Ok((value, JsonStyle::Pretty))
5252}
5253
5254fn resolve_export_keys(
5255 client: &mut impl RpcCaller,
5256 mut keys: Vec<String>,
5257 collection: Option<String>,
5258) -> Result<Vec<String>, String> {
5259 if let Some(name) = collection {
5260 let col_key = resolve_collection(client, &name)?;
5261 let response = client.call(
5262 "collections.getItems",
5263 Some(serde_json::json!({"key": col_key})),
5264 )?;
5265 let items = collection_items(&response);
5266 for item in items {
5267 if let Some(key) = item.get("key").and_then(Value::as_str) {
5268 if !keys.contains(&key.to_string()) {
5269 keys.push(key.to_string());
5270 }
5271 }
5272 }
5273 }
5274 if keys.is_empty() {
5275 return Err("No item keys provided. Pass positional keys and/or --collection.".to_string());
5276 }
5277 Ok(keys)
5278}
5279
5280fn run_export(args: ExportArgs, client: &mut impl RpcCaller) -> Result<String, String> {
5281 let keys = resolve_export_keys(client, args.keys, args.collection)?;
5282 match args.format.as_str() {
5283 "bibtex" => run_export_content_command(client, "export.bibtex", keys),
5284 "ris" => run_export_content_command(client, "export.ris", keys),
5285 "csl-json" => {
5286 let response =
5287 client.call("export.cslJson", Some(serde_json::json!({"keys": keys})))?;
5288 if let Some(content) = response.get("content") {
5289 format_json(content, JsonStyle::Pretty)
5290 } else {
5291 format_json(&response, JsonStyle::PythonCompact)
5292 }
5293 }
5294 "bibliography" => {
5295 let response = client.call(
5296 "export.bibliography",
5297 Some(serde_json::json!({"keys": keys, "style": args.style})),
5298 )?;
5299 if let Some(object) = response.as_object() {
5300 let field = if args.html { "html" } else { "text" };
5301 if object.contains_key("html") || object.contains_key("text") {
5302 return raw_value_output(
5303 object.get(field).unwrap_or(&Value::String(String::new())),
5304 );
5305 }
5306 }
5307 format_json(&response, JsonStyle::PythonCompact)
5308 }
5309 other => Err(format!(
5310 "INVALID_ARGS: unknown format {other:?}, expected bibtex/ris/csl-json/bibliography"
5311 )),
5312 }
5313}
5314
5315fn run_export_content_command(
5316 client: &mut impl RpcCaller,
5317 method: &str,
5318 keys: Vec<String>,
5319) -> Result<String, String> {
5320 let response = client.call(method, Some(serde_json::json!({"keys": keys})))?;
5321 if let Some(content) = response.get("content") {
5322 raw_value_output(content)
5323 } else {
5324 format_json(&response, JsonStyle::PythonCompact)
5325 }
5326}
5327
5328fn raw_value_output(value: &Value) -> Result<String, String> {
5329 let mut out = match value {
5330 Value::Null => String::new(),
5331 Value::String(content) => content.clone(),
5332 other => to_python_repr(other),
5333 };
5334 out.push('\n');
5335 Ok(out)
5336}
5337
5338fn to_python_repr(value: &Value) -> String {
5339 match value {
5340 Value::Null => "None".to_string(),
5341 Value::Bool(value) => {
5342 if *value {
5343 "True".to_string()
5344 } else {
5345 "False".to_string()
5346 }
5347 }
5348 Value::Number(value) => value.to_string(),
5349 Value::String(value) => format!("'{}'", value.replace('\\', "\\\\").replace('\'', "\\'")),
5350 Value::Array(values) => {
5351 let inner = values
5352 .iter()
5353 .map(to_python_repr)
5354 .collect::<Vec<_>>()
5355 .join(", ");
5356 format!("[{inner}]")
5357 }
5358 Value::Object(entries) => {
5359 let inner = entries
5360 .iter()
5361 .map(|(key, value)| {
5362 format!("'{}': {}", key.replace('\'', "\\'"), to_python_repr(value))
5363 })
5364 .collect::<Vec<_>>()
5365 .join(", ");
5366 format!("{{{inner}}}")
5367 }
5368 }
5369}
5370
5371fn resolve_collection(client: &mut impl RpcCaller, name_or_id: &str) -> Result<Value, String> {
5372 let trimmed = name_or_id.trim();
5373 if let Ok(id) = trimmed.parse::<i64>() {
5374 return Ok(Value::Number(id.into()));
5375 }
5376
5377 let collections = client.call("collections.list", None)?;
5378 let items = collections
5379 .get("items")
5380 .and_then(Value::as_array)
5381 .or_else(|| collections.as_array())
5382 .ok_or_else(|| "collections.list returned non-array result".to_string())?;
5383
5384 if let Some(collection) = items
5385 .iter()
5386 .find(|collection| collection.get("key").and_then(Value::as_str) == Some(trimmed))
5387 {
5388 return collection_key(collection);
5389 }
5390
5391 let exact = items
5392 .iter()
5393 .filter(|collection| collection.get("name").and_then(Value::as_str) == Some(trimmed))
5394 .collect::<Vec<_>>();
5395 if exact.len() == 1 {
5396 return collection_key(exact[0]);
5397 }
5398
5399 let needle = normalize_collection_name(trimmed);
5400 let fuzzy = items
5401 .iter()
5402 .filter(|collection| {
5403 collection
5404 .get("name")
5405 .and_then(Value::as_str)
5406 .map(normalize_collection_name)
5407 .is_some_and(|name| name.contains(&needle))
5408 })
5409 .collect::<Vec<_>>();
5410
5411 match fuzzy.len() {
5412 1 => collection_key(fuzzy[0]),
5413 0 => Err(format!(
5414 "COLLECTION_NOT_FOUND: No collection named {trimmed:?}"
5415 )),
5416 _ => Err(format!(
5417 "COLLECTION_AMBIGUOUS: Multiple collections match {trimmed:?}"
5418 )),
5419 }
5420}
5421
5422fn collection_key(collection: &Value) -> Result<Value, String> {
5423 collection
5424 .get("key")
5425 .cloned()
5426 .ok_or_else(|| "collection result is missing key".to_string())
5427}
5428
5429fn resolve_mutable_collection(
5430 client: &mut impl RpcCaller,
5431 name_or_id: &str,
5432 operation: &str,
5433) -> Result<Value, String> {
5434 let key = resolve_collection(client, name_or_id)?;
5435 if key.as_i64() == Some(0) {
5436 return Err(format!(
5437 "COLLECTION_NOT_FOUND: {name_or_id:?} resolved to library root (cannot {operation})"
5438 ));
5439 }
5440 Ok(key)
5441}
5442
/// Normalizes a collection name for comparison.
///
/// Leading and trailing whitespace is dropped, every inner run of whitespace becomes a
/// single space, and the result is lowercased.
fn normalize_collection_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
5449
5450fn format_json(value: &Value, style: JsonStyle) -> Result<String, String> {
5451 let mut out = match style {
5452 JsonStyle::PythonCompact => to_python_compact_json(value),
5453 JsonStyle::Pretty => serde_json::to_string_pretty(value).map_err(|err| err.to_string())?,
5454 };
5455 out.push('\n');
5456 Ok(out)
5457}
5458
5459fn to_python_compact_json(value: &Value) -> String {
5460 match value {
5461 Value::Null => "null".to_string(),
5462 Value::Bool(value) => value.to_string(),
5463 Value::Number(value) => value.to_string(),
5464 Value::String(value) => {
5465 serde_json::to_string(value).expect("string serialization cannot fail")
5466 }
5467 Value::Array(values) => {
5468 let inner = values
5469 .iter()
5470 .map(to_python_compact_json)
5471 .collect::<Vec<_>>()
5472 .join(", ");
5473 format!("[{inner}]")
5474 }
5475 Value::Object(entries) => {
5476 let inner = entries
5477 .iter()
5478 .map(|(key, value)| {
5479 let key = serde_json::to_string(key).expect("string serialization cannot fail");
5480 format!("{key}: {}", to_python_compact_json(value))
5481 })
5482 .collect::<Vec<_>>()
5483 .join(", ");
5484 format!("{{{inner}}}")
5485 }
5486 }
5487}