1use std::{
4 env, fs,
5 io::{self, Read},
6 path::{Path, PathBuf},
7 process::Command as ProcessCommand,
8 thread,
9 time::{Duration, Instant, SystemTime, UNIX_EPOCH},
10};
11
12use clap::{error::ErrorKind, Parser, Subcommand};
13use serde_json::Value;
14use zotron_rpc::{StdProviderCommandRunner, UreqProviderHttpTransport, ZoteroRpc};
15use zotron_types::{
16 bm25_score_chunks, build_embedding_provider_request, build_ocr_provider_request,
17 builtin_ocr_provider_specs, cosine_similarity, execute_embedding_provider_request,
18 is_zotron_evidence_artifact, machine_artifact_exists_for_item,
19 machine_artifact_exists_in_sidecar, machine_artifact_store_root,
20 ocr_provider_spec as raw_ocr_provider_spec, parse_embedding_provider_response,
21 parse_ocr_provider_response, read_machine_artifact_sidecar, rrf_merge,
22 write_machine_artifact_sidecar, ArtifactStorePlatform, EmbeddingChunkInput,
23 EmbeddingRequestInput, EmbeddingVector, MachineArtifactKind, OcrRequestInput,
24 ProviderCommandRunner, ProviderHttpInvocation, ProviderHttpTransport, StructureChunk,
25 DEFAULT_RPC_URL,
26};
27
/// Minimal JSON-RPC calling interface used by the CLI command handlers.
///
/// Handlers in this file accept `&mut impl RpcCaller`, so any implementor
/// (for example `ZoteroRpc`) can be used to serve requests.
pub trait RpcCaller {
    /// Invokes `method` with optional JSON `params`.
    ///
    /// Returns the JSON result on success, or an error message string.
    fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String>;
}
31
/// Serializable, CLI-facing description of an OCR provider.
///
/// Instances are built by `cli_ocr_provider_spec` from a
/// `zotron_types::OcrProviderSpec`.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliOcrProviderSpec {
    /// Provider key; `cli_ocr_provider_spec` sets this to the same value as `provider`.
    pub id: &'static str,
    /// Provider key copied from the underlying spec's `provider_key`.
    pub provider: &'static str,
    /// String form of the underlying spec's request style (`request_style.as_str()`).
    pub request_style: &'static str,
    /// Auth scheme identifier copied from the underlying spec.
    pub auth: &'static str,
    /// Copied from the underlying spec's `auth_header`.
    /// NOTE(review): presumably the HTTP header name carrying the credential — confirm.
    pub auth_header: &'static str,
    /// Copied from the underlying spec's `supports_pdf_direct` flag.
    pub supports_pdf_direct: bool,
    /// Copied from the underlying spec's `key_field`.
    pub key_field: &'static str,
}
42
/// Serializable, CLI-facing description of an embedding provider.
///
/// Instances are built by `embedding_provider_spec` from the spec returned by
/// `zotron_types::embedding_provider_spec`.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliEmbeddingProviderSpec {
    /// Copied from the underlying spec's `id`.
    pub id: &'static str,
    /// Copied from the underlying spec's `provider_key`.
    pub provider: &'static str,
    /// Request style string; `embedding_provider_spec` reports `"dashscope"`
    /// when the provider key is `"alibaba"`, otherwise the spec's own style.
    pub request_style: &'static str,
    /// Default endpoint URL; the empty string when the underlying spec has none.
    pub default_url: String,
    /// Copied from the underlying spec's `default_model`.
    pub default_model: &'static str,
    /// Auth scheme identifier copied from the underlying spec.
    pub auth: &'static str,
    /// Copied from the underlying spec's `key_field`.
    pub key_field: &'static str,
}
53
54pub fn ocr_provider_specs() -> Vec<CliOcrProviderSpec> {
55 builtin_ocr_provider_specs()
56 .into_iter()
57 .map(cli_ocr_provider_spec)
58 .collect()
59}
60
61pub fn ocr_provider_spec(provider: &str) -> Result<CliOcrProviderSpec, String> {
62 zotron_types::ocr_provider_spec(provider).map(cli_ocr_provider_spec)
63}
64
65pub fn embedding_provider_spec(provider: &str) -> Result<CliEmbeddingProviderSpec, String> {
66 let spec = zotron_types::embedding_provider_spec(provider)?;
67 Ok(CliEmbeddingProviderSpec {
68 id: spec.id,
69 provider: spec.provider_key,
70 request_style: if spec.provider_key == "alibaba" {
71 "dashscope"
72 } else {
73 spec.request_style.as_str()
74 },
75 default_url: spec.default_url.unwrap_or("").to_string(),
76 default_model: spec.default_model,
77 auth: spec.auth,
78 key_field: spec.key_field,
79 })
80}
81
82pub fn chunks_from_blocks(blocks: &[Value], max_chars: usize) -> Result<Vec<Value>, String> {
83 let typed = blocks
84 .iter()
85 .map(json_block_to_pdf_block)
86 .collect::<Result<Vec<_>, _>>()?;
87 let chunks = zotron_types::chunks_from_blocks(&typed, max_chars);
88 chunks
89 .into_iter()
90 .map(|chunk| chunk_to_cli_value(&chunk, &typed))
91 .collect()
92}
93
94fn cli_ocr_provider_spec(spec: zotron_types::OcrProviderSpec) -> CliOcrProviderSpec {
95 CliOcrProviderSpec {
96 id: spec.provider_key,
97 provider: spec.provider_key,
98 request_style: spec.request_style.as_str(),
99 auth: spec.auth,
100 auth_header: spec.auth_header,
101 supports_pdf_direct: spec.supports_pdf_direct,
102 key_field: spec.key_field,
103 }
104}
105
106fn json_block_to_pdf_block(value: &Value) -> Result<zotron_types::PdfEvidenceBlock, String> {
107 let block_key = value
108 .get("block_key")
109 .and_then(Value::as_str)
110 .ok_or_else(|| "block missing block_key".to_string())?
111 .to_string();
112 let item_key = value
113 .get("item_key")
114 .and_then(Value::as_str)
115 .ok_or_else(|| "block missing item_key".to_string())?
116 .to_string();
117 let attachment_key = value
118 .get("attachment_key")
119 .and_then(Value::as_str)
120 .ok_or_else(|| "block missing attachment_key".to_string())?
121 .to_string();
122 let page_idx = value
123 .get("page_idx")
124 .or_else(|| value.get("page"))
125 .and_then(Value::as_u64)
126 .unwrap_or(1);
127 let block_type = value
128 .get("type")
129 .or_else(|| value.get("block_type"))
130 .and_then(Value::as_str)
131 .unwrap_or("paragraph")
132 .to_string();
133 let section_path = value
134 .get("section_path")
135 .and_then(Value::as_array)
136 .map(|items| {
137 items
138 .iter()
139 .filter_map(Value::as_str)
140 .map(ToString::to_string)
141 .collect::<Vec<_>>()
142 })
143 .unwrap_or_default();
144 let text = value
145 .get("text")
146 .and_then(Value::as_str)
147 .unwrap_or("")
148 .to_string();
149 let bbox = value.get("bbox").and_then(value_bbox4);
150
151 Ok(zotron_types::PdfEvidenceBlock {
152 block_key,
153 item_key,
154 attachment_key,
155 page_idx,
156 block_type,
157 bbox,
158 section_path,
159 text,
160 })
161}
162
163fn chunk_to_cli_value(
164 chunk: &zotron_types::StructureChunk,
165 blocks: &[zotron_types::PdfEvidenceBlock],
166) -> Result<Value, String> {
167 let refs = chunk
168 .block_keys
169 .iter()
170 .filter_map(|key| blocks.iter().find(|block| &block.block_key == key))
171 .map(|block| {
172 serde_json::json!({
173 "block_key": block.block_key,
174 "page_idx": block.page_idx,
175 "bbox": block.bbox.map(|bbox| bbox.iter().map(|n| {
176 if n.fract() == 0.0 {
177 Value::from(*n as i64)
178 } else {
179 Value::from(*n)
180 }
181 }).collect::<Vec<_>>()),
182 })
183 })
184 .collect::<Vec<_>>();
185 Ok(serde_json::json!({
186 "chunk_key": chunk.chunk_key,
187 "item_key": chunk.item_key,
188 "attachment_key": chunk.attachment_key,
189 "block_keys": chunk.block_keys,
190 "section_path": chunk.section_path,
191 "text": chunk.text,
192 "page_start": chunk.page_start,
193 "page_end": chunk.page_end,
194 "evidence_refs": refs,
195 }))
196}
197
198fn value_bbox4(value: &Value) -> Option<[f64; 4]> {
199 let arr = value.as_array()?;
200 if arr.len() != 4 {
201 return None;
202 }
203 Some([
204 arr[0].as_f64()?,
205 arr[1].as_f64()?,
206 arr[2].as_f64()?,
207 arr[3].as_f64()?,
208 ])
209}
210
211impl RpcCaller for ZoteroRpc {
212 fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String> {
213 self.call(method, params).map_err(|err| err.to_string())
214 }
215}
216
// Top-level command-line parser for the `zotron` binary.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Parser)]
#[command(name = "zotron", about = "Rust client + CLI for the Zotron XPI")]
struct Cli {
    // The selected top-level subcommand.
    #[command(subcommand)]
    command: Command,
}
223
// Subcommands of `zotron ocr`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum OcrCommand {
    // Lists the built-in OCR provider specs; takes no arguments.
    Providers,
    // Runs one OCR provider request. `run_ocr_run_command` requires exactly
    // one of `--input` / `--file`.
    #[command(name = "run")]
    Run {
        #[arg(long)]
        provider: String,
        #[arg(long)]
        input: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long = "item-key")]
        item_key: Option<String>,
        #[arg(long = "attachment-key")]
        attachment_key: Option<String>,
        #[arg(long = "mime-type")]
        mime_type: Option<String>,
        // Overrides the provider request's own URL when given.
        #[arg(long)]
        endpoint: Option<String>,
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    // Takes a required `--collection` plus the RPC `--url`.
    Status {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Processes the PDF attachment of `--parent`; when `--attachment` is
    // omitted, `run_ocr_process_command` resolves the first PDF attachment.
    #[command(name = "process")]
    Process {
        #[arg(long, default_value = "mineru")]
        provider: String,
        #[arg(long)]
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long = "source-url")]
        source_url: Option<String>,
        #[arg(long = "result-dir")]
        result_dir: Option<String>,
        #[arg(long = "result-zip")]
        result_zip: Option<String>,
        #[arg(long = "provider-endpoint")]
        provider_endpoint: Option<String>,
        #[arg(long = "api-key-env", default_value = "ZOTRON_MINERU_API_KEY")]
        api_key_env: String,
        #[arg(long = "poll-interval-seconds", default_value_t = 5)]
        poll_interval_seconds: u64,
        #[arg(long = "timeout-seconds", default_value_t = 900)]
        timeout_seconds: u64,
        #[arg(long = "chunk-chars", default_value_t = 1200)]
        chunk_chars: usize,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
298
// Top-level subcommands of the `zotron` CLI.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum Command {
    // Takes only the RPC `--url`.
    Ping {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Raw RPC call: positional method name, then a positional JSON params
    // string that defaults to "{}".
    Rpc {
        method: String,
        #[arg(default_value = "{}")]
        params_json: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        paginate: bool,
        #[arg(long, default_value_t = 100)]
        page_size: usize,
    },
    // Positional JSON file plus optional `--pdf` / `--collection`.
    Push {
        json_file: String,
        #[arg(long)]
        pdf: Option<String>,
        #[arg(long)]
        collection: Option<String>,
        #[arg(long = "on-duplicate", default_value = "skip")]
        on_duplicate: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long = "dry-run")]
        dry_run: bool,
    },
    // The variants below delegate to a nested subcommand enum or an args struct.
    System {
        #[command(subcommand)]
        command: SystemCommand,
    },
    Search(SearchArgs),
    Items {
        #[command(subcommand)]
        command: ItemsCommand,
    },
    Collections {
        #[command(subcommand)]
        command: CollectionsCommand,
    },
    Notes {
        #[command(subcommand)]
        command: NotesCommand,
    },
    Settings {
        #[command(subcommand)]
        command: SettingsCommand,
    },
    Tags {
        #[command(subcommand)]
        command: TagsCommand,
    },
    Export(ExportArgs),
    Annotations {
        #[command(subcommand)]
        command: AnnotationsCommand,
    },
    Ocr {
        #[command(subcommand)]
        command: OcrCommand,
    },
    Rag {
        #[command(subcommand)]
        command: RagCommand,
    },
}
387
/// Option bundle whose fields mirror those of `RagCommand::Search`, minus `url`.
/// NOTE(review): presumably handed to the RAG search handler, which is not
/// visible in this part of the file — confirm.
struct RagSearchOptions {
    /// Search query text.
    query: String,
    /// Optional collection; same name and type as `--collection` on `rag search`.
    collection: Option<String>,
    /// Same name and type as the repeatable `--key` values on `rag search`.
    keys: Vec<String>,
    /// Same name and type as the `--zotero` flag on `rag search`.
    zotero: bool,
    /// Same name and type as `--top-spans-per-item` on `rag search`.
    top_spans_per_item: u64,
    /// Same name and type as the `--include-fulltext-spans` flag on `rag search`.
    include_fulltext_spans: bool,
    /// Same name and type as `--limit` (alias `--top-k`) on `rag search`.
    top_k: u64,
    /// Same name and type as `--output` on `rag search`, which accepts "json" or "jsonl".
    output: String,
}
398
// Subcommands of `zotron rag`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum RagCommand {
    // Takes no arguments.
    #[command(name = "providers")]
    Providers,
    // Requires `--provider` and `--input`; the remaining options are optional.
    #[command(name = "embed")]
    Embed {
        #[arg(long)]
        provider: String,
        #[arg(long)]
        input: String,
        #[arg(long)]
        endpoint: Option<String>,
        #[arg(long)]
        model: Option<String>,
        #[arg(long = "input-type")]
        input_type: Option<String>,
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    // Takes a required `--collection` plus the RPC `--url`.
    Status {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional query with optional scoping and output options.
    #[command(name = "search")]
    Search {
        query: String,
        #[arg(long)]
        collection: Option<String>,
        // Repeatable `--key`; `--keys` is accepted as an alias.
        #[arg(long = "key", alias = "keys")]
        keys: Vec<String>,
        #[arg(long)]
        zotero: bool,
        #[arg(long = "top-spans-per-item", default_value_t = 3)]
        top_spans_per_item: u64,
        #[arg(long = "include-fulltext-spans")]
        include_fulltext_spans: bool,
        // Exposed as `--limit`; `--top-k` is accepted as an alias.
        #[arg(long = "limit", alias = "top-k", default_value_t = 50)]
        top_k: u64,
        #[arg(long, default_value = "json", value_parser = ["json", "jsonl"])]
        output: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
455
// Subcommands of `zotron system`. Every variant carries the RPC `--url`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum SystemCommand {
    Version {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Libraries {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Optional `--library` takes an integer value.
    #[command(name = "library-stats")]
    LibraryStats {
        #[arg(long)]
        library: Option<i64>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Optional `--type` is stored in `item_type`.
    Schema {
        #[arg(long = "type")]
        item_type: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "current-collection")]
    CurrentCollection {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Optional positional method name.
    Methods {
        method: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
497
// Arguments of `zotron search`: an optional positional query, optional
// filters, paging, and an optional management subcommand.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, clap::Args)]
struct SearchArgs {
    // Optional positional query text.
    query: Option<String>,
    #[arg(long)]
    fulltext: bool,
    #[arg(long)]
    author: Option<String>,
    #[arg(long)]
    after: Option<String>,
    #[arg(long)]
    before: Option<String>,
    #[arg(long)]
    journal: Option<String>,
    #[arg(long)]
    tag: Option<String>,
    #[arg(long)]
    doi: Option<String>,
    #[arg(long)]
    isbn: Option<String>,
    #[arg(long)]
    issn: Option<String>,
    #[arg(long)]
    collection: Option<String>,
    #[arg(long, default_value_t = 50)]
    limit: u64,
    #[arg(long, default_value_t = 0)]
    offset: u64,
    // `command_url` uses this URL only when no management subcommand is given.
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
    // Optional nested subcommand; each variant carries its own `--url`.
    #[command(subcommand)]
    management: Option<SearchManagementCommand>,
}
541
// Optional subcommands nested under `zotron search`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum SearchManagementCommand {
    #[command(name = "saved-searches")]
    SavedSearches {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional name plus at least one `--condition` (repeatable, required).
    #[command(name = "create-saved")]
    CreateSaved {
        name: String,
        #[arg(long = "condition", required = true)]
        condition: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional saved-search key.
    #[command(name = "delete-saved")]
    DeleteSaved {
        search_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
571
// Subcommands of `zotron items`. Every variant carries the RPC `--url`;
// several also accept a `--dry-run` flag.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum ItemsCommand {
    // All item sources (`--doi`, `--isbn`, `--from-url`, `--file`) are optional
    // at the parser level; `--field` may be repeated.
    Add {
        #[arg(long)]
        doi: Option<String>,
        #[arg(long)]
        isbn: Option<String>,
        #[arg(long = "from-url")]
        from_url: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long = "type")]
        item_type: Option<String>,
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        collection: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional item key plus repeatable `--field`.
    Update {
        key: String,
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Zero or more positional item values.
    Trash {
        items: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Restore {
        item: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Zero or more positional item keys.
    #[command(name = "merge-duplicates")]
    MergeDuplicates {
        keys: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional key plus a required `--target`.
    #[command(name = "add-related")]
    AddRelated {
        key: String,
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional key plus a required `--target`.
    #[command(name = "remove-related")]
    RemoveRelated {
        key: String,
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        item: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Paged listing; `--direction` defaults to "asc".
    List {
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long)]
        sort: Option<String>,
        #[arg(long, default_value = "asc")]
        direction: String,
        #[arg(long)]
        trash: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "find-duplicates")]
    FindDuplicates {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // `--type` is stored in `recent_type` and defaults to "added".
    Recent {
        #[arg(long, default_value_t = 20)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long = "type", default_value = "added")]
        recent_type: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Fulltext {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Related {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "citation-key")]
    CitationKey {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Path {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Attachments {
        key: String,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Requires `--collection`; `--limit` defaults to 0.
    // NOTE(review): 0 presumably means "no limit" — confirm in the handler.
    #[command(name = "find-pdfs")]
    FindPdfs {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value_t = 0)]
        limit: usize,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
747
// Subcommands of `zotron settings`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum SettingsCommand {
    // Positional settings key.
    Get {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Also reachable through the visible alias `get-all`.
    #[command(visible_alias = "get-all")]
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Zero or more positional `pairs`, and/or an optional `--file`.
    Set {
        pairs: Vec<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
775
// Subcommands of `zotron tags`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum TagsCommand {
    List {
        #[arg(long, default_value_t = 200)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Two positionals: the old tag name, then the new one.
    Rename {
        old: String,
        new: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        tag: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional item keys plus at least one `--tag` (repeatable, required).
    Add {
        keys: Vec<String>,
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional item keys plus at least one `--tag` (repeatable, required).
    Remove {
        keys: Vec<String>,
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
823
// Arguments of `zotron export`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, clap::Args)]
struct ExportArgs {
    // Zero or more positional item keys.
    keys: Vec<String>,
    // Export format name; defaults to "bibtex".
    #[arg(long, default_value = "bibtex")]
    format: String,
    #[arg(long)]
    collection: Option<String>,
    // Style URL; defaults to the APA style on zotero.org.
    #[arg(long, default_value = "http://www.zotero.org/styles/apa")]
    style: String,
    #[arg(long)]
    html: bool,
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
}
843
// Subcommands of `zotron annotations`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum AnnotationsCommand {
    // Positional parent key plus optional `--attachment`.
    List {
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional parent key; all annotation details are optional at the
    // parser level. `--color` defaults to "#ffd400".
    Create {
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long = "type")]
        annotation_type: Option<String>,
        #[arg(long)]
        position: Option<String>,
        #[arg(long)]
        quote: Option<String>,
        #[arg(long)]
        page: Option<u32>,
        #[arg(long = "sort-index")]
        sort_index: Option<String>,
        #[arg(long)]
        text: Option<String>,
        #[arg(long)]
        comment: Option<String>,
        #[arg(long, default_value = "#ffd400")]
        color: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional annotation key.
    Delete {
        annotation_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
900
// Subcommands of `zotron notes`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum NotesCommand {
    // Requires `--parent`; paged with `--limit` / `--offset`.
    List {
        #[arg(long)]
        parent: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional note key.
    Get {
        note_key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Requires `--parent` and `--content`; `--tag` may be repeated.
    Create {
        #[arg(long)]
        parent: String,
        #[arg(long)]
        content: String,
        #[arg(long = "tag")]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional note key plus required `--content`.
    Update {
        note_key: String,
        #[arg(long)]
        content: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        note_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional query text.
    Search {
        query: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
960
// Subcommands of `zotron collections`. Every variant carries the RPC `--url`;
// the mutating variants also accept `--dry-run`.
// NOTE: plain `//` comments are used on purpose: clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Debug, Subcommand)]
enum CollectionsCommand {
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Tree {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Positional collection name or id.
    Get {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Named `get-items`, with the visible alias `items`.
    #[command(name = "get-items", visible_alias = "items")]
    GetItems {
        name_or_id: String,
        #[arg(long)]
        limit: Option<u64>,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Stats {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Two positionals: the old name, then the new name.
    Rename {
        old_name: String,
        new_name: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // Positional name plus optional `--parent`.
    Create {
        name: String,
        #[arg(long)]
        parent: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    Delete {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // Positional collection followed by zero or more positional item keys.
    #[command(name = "add-items")]
    AddItems {
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    // Positional collection followed by zero or more positional item keys.
    #[command(name = "remove-items")]
    RemoveItems {
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
}
1044
/// Output style selector passed to `format_json`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum JsonStyle {
    /// Used by `run_ocr_command` for regular command results.
    /// NOTE(review): presumably compact output matching Python's `json.dumps`
    /// formatting — confirm against `format_json`.
    PythonCompact,
    /// Used by `run_ocr_command` for the providers listing.
    /// NOTE(review): presumably indented, human-readable output — confirm.
    Pretty,
}
1052
/// Result of a successful `parse_cli` call.
enum ParseOutcome<T> {
    /// Arguments parsed into a command of type `T`.
    Command(T),
    /// clap asked to display help or version text; the payload is that
    /// rendered text, which callers return as normal output.
    Display(String),
}
1057
1058fn parse_cli<T>(
1059 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1060) -> Result<ParseOutcome<T>, String>
1061where
1062 T: Parser,
1063{
1064 match T::try_parse_from(args) {
1065 Ok(cli) => Ok(ParseOutcome::Command(cli)),
1066 Err(err)
1067 if matches!(
1068 err.kind(),
1069 ErrorKind::DisplayHelp | ErrorKind::DisplayVersion
1070 ) =>
1071 {
1072 Ok(ParseOutcome::Display(err.to_string()))
1073 }
1074 Err(err) => Err(err.to_string()),
1075 }
1076}
1077
1078pub fn format_error_json(message: &str) -> String {
1079 let message = message.trim_end();
1080 let (code, message) = split_error_code(message).unwrap_or(("RUNTIME_ERROR", message));
1081 serde_json::json!({"error": {"code": code, "message": message}}).to_string()
1082}
1083
/// Splits `"CODE: message"` into `("CODE", "message")`.
///
/// The code is everything before the first `:` and must be non-empty and
/// consist only of ASCII uppercase letters, ASCII digits, and `_`. Leading
/// whitespace after the colon is dropped. Returns `None` when there is no
/// colon or the prefix is not a valid code.
fn split_error_code(message: &str) -> Option<(&str, &str)> {
    let colon = message.find(':')?;
    let code = &message[..colon];
    let rest = &message[colon + 1..];

    let is_code_char = |ch: char| matches!(ch, 'A'..='Z' | '0'..='9' | '_');
    if code.is_empty() || !code.chars().all(is_code_char) {
        return None;
    }
    Some((code, rest.trim_start()))
}
1096
1097pub fn run(
1098 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1099) -> Result<String, String> {
1100 if std::env::var("CLAUDECODE").as_deref() == Ok("1") {
1102 eprintln!(r#"<claude-code-hint v="1" type="plugin" value="zotron@dianzuan/zotron" />"#);
1103 }
1104
1105 let cli = match parse_cli::<Cli>(args)? {
1106 ParseOutcome::Command(cli) => cli,
1107 ParseOutcome::Display(output) => return Ok(output),
1108 };
1109 let url = command_url(&cli.command);
1110 let mut client = ZoteroRpc::new(url);
1111 run_command(cli.command, &mut client)
1112}
1113
1114pub fn run_with_client(
1115 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1116 client: &mut impl RpcCaller,
1117) -> Result<String, String> {
1118 let cli = match parse_cli::<Cli>(args)? {
1119 ParseOutcome::Command(cli) => cli,
1120 ParseOutcome::Display(output) => return Ok(output),
1121 };
1122 run_command(cli.command, client)
1123}
1124
1125fn rag_command_url(command: &RagCommand) -> String {
1126 match command {
1127 RagCommand::Providers => DEFAULT_RPC_URL.to_string(),
1128 RagCommand::Embed { .. } => DEFAULT_RPC_URL.to_string(),
1129 RagCommand::Status { url, .. } => url.clone(),
1130 RagCommand::Search { url, .. } => url.clone(),
1131 }
1132}
1133
/// Returns the RPC URL that the parsed command should connect to.
///
/// Most variants carry their own `url` field, which is cloned. Subcommands
/// without one (`ocr providers`, `ocr run`, and the `rag` ones handled by
/// `rag_command_url`) fall back to `DEFAULT_RPC_URL`.
fn command_url(command: &Command) -> String {
    match command {
        Command::Ping { url }
        | Command::Rpc { url, .. }
        | Command::Push { url, .. }
        => url.clone(),
        Command::Ocr { command } => match command {
            // These two variants have no `url` field.
            OcrCommand::Providers => DEFAULT_RPC_URL.to_string(),
            OcrCommand::Run { .. } => DEFAULT_RPC_URL.to_string(),
            OcrCommand::Status { url, .. } => url.clone(),
            OcrCommand::Process { url, .. } => url.clone(),
        },
        Command::Rag { command } => rag_command_url(command),
        Command::System { command } => match command {
            SystemCommand::Version { url }
            | SystemCommand::Libraries { url }
            | SystemCommand::LibraryStats { url, .. }
            | SystemCommand::Schema { url, .. }
            | SystemCommand::CurrentCollection { url }
            | SystemCommand::Methods { url, .. } => url.clone(),
        },
        // A management subcommand's own URL takes precedence over the
        // top-level `search --url`.
        Command::Search(ref args) => match &args.management {
            Some(SearchManagementCommand::SavedSearches { url })
            | Some(SearchManagementCommand::CreateSaved { url, .. })
            | Some(SearchManagementCommand::DeleteSaved { url, .. }) => url.clone(),
            None => args.url.clone(),
        },
        Command::Items { command } => match command {
            ItemsCommand::Add { url, .. }
            | ItemsCommand::Update { url, .. }
            | ItemsCommand::Delete { url, .. }
            | ItemsCommand::Trash { url, .. }
            | ItemsCommand::Restore { url, .. }
            | ItemsCommand::MergeDuplicates { url, .. }
            | ItemsCommand::AddRelated { url, .. }
            | ItemsCommand::RemoveRelated { url, .. }
            | ItemsCommand::Get { url, .. }
            | ItemsCommand::List { url, .. }
            | ItemsCommand::FindDuplicates { url }
            | ItemsCommand::Recent { url, .. }
            | ItemsCommand::Fulltext { url, .. }
            | ItemsCommand::Related { url, .. }
            | ItemsCommand::CitationKey { url, .. }
            | ItemsCommand::Path { url, .. }
            | ItemsCommand::Attachments { url, .. }
            | ItemsCommand::FindPdfs { url, .. } => url.clone(),
        },
        Command::Collections { command } => match command {
            CollectionsCommand::List { url }
            | CollectionsCommand::Tree { url }
            | CollectionsCommand::Get { url, .. }
            | CollectionsCommand::GetItems { url, .. }
            | CollectionsCommand::Stats { url, .. }
            | CollectionsCommand::Rename { url, .. }
            | CollectionsCommand::Create { url, .. }
            | CollectionsCommand::Delete { url, .. }
            | CollectionsCommand::AddItems { url, .. }
            | CollectionsCommand::RemoveItems { url, .. } => url.clone(),
        },
        Command::Notes { command } => match command {
            NotesCommand::List { url, .. }
            | NotesCommand::Get { url, .. }
            | NotesCommand::Create { url, .. }
            | NotesCommand::Update { url, .. }
            | NotesCommand::Delete { url, .. }
            | NotesCommand::Search { url, .. } => url.clone(),
        },
        Command::Settings { command } => match command {
            SettingsCommand::Get { url, .. }
            | SettingsCommand::List { url }
            | SettingsCommand::Set { url, .. } => url.clone(),
        },
        Command::Tags { command } => match command {
            TagsCommand::List { url, .. }
            | TagsCommand::Rename { url, .. }
            | TagsCommand::Delete { url, .. }
            | TagsCommand::Add { url, .. }
            | TagsCommand::Remove { url, .. } => url.clone(),
        },
        Command::Export(ref args) => args.url.clone(),
        Command::Annotations { command } => match command {
            AnnotationsCommand::List { url, .. }
            | AnnotationsCommand::Create { url, .. }
            | AnnotationsCommand::Delete { url, .. } => url.clone(),
        },
    }
}
1221
1222fn run_ocr_command(command: OcrCommand, client: &mut impl RpcCaller) -> Result<String, String> {
1223 if let OcrCommand::Providers = &command {
1224 return format_json(
1225 &serde_json::json!({ "providers": ocr_provider_specs() }),
1226 JsonStyle::Pretty,
1227 );
1228 }
1229 let value = match command {
1230 OcrCommand::Providers => unreachable!(),
1231 OcrCommand::Run {
1232 provider,
1233 input,
1234 file,
1235 item_key,
1236 attachment_key,
1237 mime_type,
1238 endpoint,
1239 api_key_env,
1240 } => run_ocr_run_command(OcrRunOptions {
1241 provider,
1242 input,
1243 file,
1244 item_key,
1245 attachment_key,
1246 mime_type,
1247 endpoint,
1248 api_key_env,
1249 })?,
1250 OcrCommand::Status { collection, .. } => run_ocr_status_command(client, collection)?,
1251 OcrCommand::Process {
1252 provider,
1253 parent,
1254 attachment,
1255 source_url,
1256 result_dir,
1257 result_zip,
1258 provider_endpoint,
1259 api_key_env,
1260 poll_interval_seconds,
1261 timeout_seconds,
1262 chunk_chars,
1263 ..
1264 } => run_ocr_process_command(
1265 client,
1266 OcrProcessOptions {
1267 provider,
1268 parent,
1269 attachment,
1270 source_url,
1271 result_dir,
1272 result_zip,
1273 provider_endpoint,
1274 api_key_env,
1275 poll_interval_seconds,
1276 timeout_seconds,
1277 chunk_chars,
1278 },
1279 )?,
1280 };
1281 format_json(&value, JsonStyle::PythonCompact)
1282}
1283
/// Options for `run_ocr_process_command`, filled by `run_ocr_command` from
/// the fields of `OcrCommand::Process` (everything except `url`).
struct OcrProcessOptions {
    /// OCR provider name, resolved with `raw_ocr_provider_spec`.
    provider: String,
    /// Parent item key; used to find a PDF attachment when `attachment` is `None`.
    parent: String,
    /// Attachment key; when `None`, the first PDF attachment of `parent` is resolved.
    attachment: Option<String>,
    /// Value of `--source-url`; cannot be combined with `result_dir` / `result_zip`.
    source_url: Option<String>,
    /// Value of `--result-dir`; mutually exclusive with `result_zip`.
    result_dir: Option<String>,
    /// Value of `--result-zip`; mutually exclusive with `result_dir`.
    result_zip: Option<String>,
    /// Value of `--provider-endpoint`.
    provider_endpoint: Option<String>,
    /// Value of `--api-key-env` (CLI default: "ZOTRON_MINERU_API_KEY").
    api_key_env: String,
    /// Value of `--poll-interval-seconds` (CLI default: 5).
    poll_interval_seconds: u64,
    /// Value of `--timeout-seconds` (CLI default: 900).
    timeout_seconds: u64,
    /// Value of `--chunk-chars` (CLI default: 1200); passed on when persisting sidecars.
    chunk_chars: usize,
}
1297
/// Options for `run_ocr_run_command`, filled by `run_ocr_command` from the
/// fields of `OcrCommand::Run`.
struct OcrRunOptions {
    /// OCR provider name used to build the provider request.
    provider: String,
    /// Value of `--input`, read via `read_json_input`; mutually exclusive with `file`.
    input: Option<String>,
    /// Value of `--file`, passed to `ocr_input_from_file`; mutually exclusive with `input`.
    file: Option<String>,
    /// Passed to `ocr_input_from_file`; only used with `file`.
    item_key: Option<String>,
    /// Passed to `ocr_input_from_file`; only used with `file`.
    attachment_key: Option<String>,
    /// Passed to `ocr_input_from_file`; only used with `file`.
    mime_type: Option<String>,
    /// Overrides the request URL supplied by the provider request when set.
    endpoint: Option<String>,
    /// Value of `--api-key-env`, passed to `provider_http_transport_with_auth`.
    api_key_env: Option<String>,
}
1308
/// Runs a single OCR provider request and returns the parsed blocks as JSON.
///
/// The input comes from exactly one of `--input` (JSON) or `--file`. The
/// provider request is then executed either over HTTP (when the request has
/// no command) or through the command runner. On success the result is
/// `{"provider": ..., "blocks": ...}`.
///
/// # Errors
/// - `INVALID_ARGS: ...` when both or neither of `--input` / `--file` are given.
/// - Any error from building the request, creating the transport, executing
///   it, or parsing the response — unless the payload is recognized by
///   `ocr_async_task_result`, in which case that task value is returned as `Ok`.
fn run_ocr_run_command(options: OcrRunOptions) -> Result<Value, String> {
    // Exactly one input source must be supplied.
    let input: OcrRequestInput = match (options.input, options.file) {
        (Some(input), None) => read_json_input(&input)?,
        (None, Some(file)) => ocr_input_from_file(
            file,
            options.item_key,
            options.attachment_key,
            options.mime_type,
        )?,
        (Some(_), Some(_)) => {
            return Err("INVALID_ARGS: use either --input or --file, not both".to_string())
        }
        (None, None) => return Err("INVALID_ARGS: provide --input JSON or --file".to_string()),
    };
    let request = build_ocr_provider_request(&options.provider, &input)?;
    // An empty command means the provider is reached over HTTP; otherwise the
    // request's command is executed by the command runner.
    let payload = if request.command.is_empty() {
        let method = request
            .method
            .ok_or_else(|| format!("OCR provider {} missing HTTP method", request.provider))?;
        let auth_scheme = raw_ocr_provider_spec(&options.provider)?.auth;
        let mut transport =
            provider_http_transport_with_auth(options.api_key_env.as_deref(), auth_scheme)?;
        transport.post_json(&ProviderHttpInvocation {
            provider: request.provider.to_string(),
            style: request.style.to_string(),
            method: method.to_string(),
            // An explicit --endpoint wins over the URL from the provider request.
            url: options
                .endpoint
                .or_else(|| request.url.map(ToString::to_string)),
            auth_header_name: request.auth_header.map(ToString::to_string),
            // NOTE(review): left unset here; presumably the transport built by
            // `provider_http_transport_with_auth` supplies the credential — confirm.
            auth_header_value: None,
            body: request.body,
        })?
    } else {
        let mut command_runner = StdProviderCommandRunner;
        command_runner.run_json(&request.command)?
    };
    let blocks = match parse_ocr_provider_response(
        request.provider,
        &payload,
        &input.item_key,
        &input.attachment_key,
    ) {
        Ok(blocks) => blocks,
        Err(err) => {
            // A payload that fails block parsing may still be an async task
            // result; if so, return it instead of the parse error.
            if let Some(task) = ocr_async_task_result(request.provider, &payload) {
                return Ok(task);
            }
            return Err(err);
        }
    };

    Ok(serde_json::json!({
        "provider": request.provider,
        "blocks": blocks,
    }))
}
1366
1367fn run_ocr_process_command(
1368 client: &mut impl RpcCaller,
1369 mut options: OcrProcessOptions,
1370) -> Result<Value, String> {
1371 let spec = raw_ocr_provider_spec(&options.provider)?;
1372
1373 let attachment = match options.attachment.take() {
1374 Some(key) => key,
1375 None => resolve_first_pdf_attachment_key(client, &options.parent)?,
1376 };
1377 options.attachment = Some(attachment.clone());
1378
1379 let attachment_path = resolve_attachment_path(client, &attachment)?;
1380 let storage_dir = attachment_path
1381 .parent()
1382 .ok_or_else(|| {
1383 format!(
1384 "ATTACHMENT_PATH_INVALID: attachment path has no parent directory: {}",
1385 attachment_path.display()
1386 )
1387 })?
1388 .to_path_buf();
1389
1390 match spec.provider_key {
1391 "mineru" | "mineru-cli" => {
1392 if options.result_dir.is_some() && options.result_zip.is_some() {
1393 return Err("INVALID_ARGS: use either --result-dir or --result-zip, not both".to_string());
1394 }
1395 if options.source_url.is_some()
1396 && (options.result_dir.is_some() || options.result_zip.is_some())
1397 {
1398 return Err(
1399 "INVALID_ARGS: --source-url cannot be combined with --result-dir/--result-zip"
1400 .to_string(),
1401 );
1402 }
1403 let file_name = attachment_path
1404 .file_name()
1405 .and_then(|name| name.to_str())
1406 .unwrap_or("document.pdf")
1407 .to_string();
1408 let source = load_mineru_result_source(&options, &attachment_path, &file_name)?;
1409 let artifacts = persist_mineru_result_sidecars(
1410 &storage_dir, &options.parent, &attachment,
1411 &options.provider, &source, options.chunk_chars,
1412 )?;
1413 Ok(serde_json::json!({
1414 "provider": spec.provider_key,
1415 "status": "indexed",
1416 "item_key": options.parent,
1417 "attachment_key": attachment,
1418 "attachment_path": attachment_path,
1419 "storage_dir": storage_dir,
1420 "task_id": source.task_id,
1421 "state": source.state,
1422 "blocks": artifacts.block_count,
1423 "chunks": artifacts.chunk_count,
1424 "artifacts": artifacts.artifacts,
1425 }))
1426 }
1427 _ => {
1428 run_ocr_process_sync(
1429 client, &options, spec.provider_key,
1430 &attachment, &attachment_path, &storage_dir,
1431 )
1432 }
1433 }
1434}
1435
1436fn run_ocr_process_sync(
1437 client: &mut impl RpcCaller,
1438 options: &OcrProcessOptions,
1439 provider: &str,
1440 attachment_key: &str,
1441 attachment_path: &Path,
1442 storage_dir: &Path,
1443) -> Result<Value, String> {
1444 let api_url = if let Some(endpoint) = &options.provider_endpoint {
1445 endpoint.clone()
1446 } else {
1447 let settings = client.call("settings.getAll", None)?;
1448 settings.get("ocr.apiUrl")
1449 .and_then(Value::as_str)
1450 .unwrap_or("")
1451 .to_string()
1452 };
1453 if api_url.is_empty() {
1454 return Err(format!("MISSING_CONFIG: ocr.apiUrl not configured for provider {provider}"));
1455 }
1456
1457 let api_key = {
1458 let from_env = if !options.api_key_env.is_empty() {
1459 env::var(&options.api_key_env).ok().filter(|v| !v.is_empty())
1460 } else {
1461 None
1462 };
1463 from_env.unwrap_or_else(|| {
1464 client.call("settings.getRaw", Some(serde_json::json!({"key": "ocr.apiKey"})))
1465 .ok()
1466 .and_then(|raw| raw.get("ocr.apiKey").and_then(Value::as_str).map(String::from))
1467 .unwrap_or_default()
1468 })
1469 };
1470
1471 let pdf_bytes = fs::read(attachment_path)
1472 .map_err(|e| format!("READ_PDF_FAILED: {}: {e}", attachment_path.display()))?;
1473
1474 const MAX_PDF_SIZE: usize = 100 * 1024 * 1024; if pdf_bytes.len() > MAX_PDF_SIZE {
1476 return Err(format!(
1477 "PDF_TOO_LARGE: {} is {} MB, max {} MB",
1478 attachment_path.display(),
1479 pdf_bytes.len() / (1024 * 1024),
1480 MAX_PDF_SIZE / (1024 * 1024),
1481 ));
1482 }
1483
1484 let base64_pdf = format!("data:application/pdf;base64,{}", base64_encode(&pdf_bytes));
1485
1486 let input = OcrRequestInput {
1487 content_base64: base64_pdf,
1488 file_name: attachment_path
1489 .file_name()
1490 .and_then(|n| n.to_str())
1491 .unwrap_or("document.pdf")
1492 .to_string(),
1493 mime_type: "application/pdf".to_string(),
1494 item_key: options.parent.clone(),
1495 attachment_key: attachment_key.to_string(),
1496 source_url: None,
1497 local_path: Some(attachment_path.to_string_lossy().to_string()),
1498 output_dir: None,
1499 };
1500 let request = build_ocr_provider_request(provider, &input)?;
1501
1502 let payload = if request.command.is_empty() {
1503 let method = request
1504 .method
1505 .ok_or_else(|| format!("OCR provider {provider} missing HTTP method"))?;
1506 let spec = raw_ocr_provider_spec(provider)?;
1507
1508 let mut transport = if !api_key.is_empty() {
1509 match spec.auth {
1510 "bearer" => UreqProviderHttpTransport::with_bearer_token(&api_key),
1511 "token" => UreqProviderHttpTransport::with_api_key(format!("token {api_key}")),
1512 _ => UreqProviderHttpTransport::new(),
1513 }
1514 } else {
1515 UreqProviderHttpTransport::new()
1516 };
1517
1518 transport.post_json(&ProviderHttpInvocation {
1519 provider: request.provider.to_string(),
1520 style: request.style.to_string(),
1521 method: method.to_string(),
1522 url: Some(api_url),
1523 auth_header_name: request.auth_header.map(ToString::to_string),
1524 auth_header_value: None,
1525 body: request.body,
1526 })?
1527 } else {
1528 let mut runner = StdProviderCommandRunner;
1529 runner.run_json(&request.command)?
1530 };
1531
1532 let blocks = parse_ocr_provider_response(provider, &payload, &options.parent, attachment_key)?;
1533 let chunks = zotron_types::chunks_from_blocks(&blocks, options.chunk_chars);
1534
1535 let artifacts = vec![
1536 write_sidecar_json(
1537 storage_dir, &options.parent, attachment_key,
1538 MachineArtifactKind::OcrRaw, &payload,
1539 )?,
1540 write_sidecar_jsonl(
1541 storage_dir, &options.parent, attachment_key,
1542 MachineArtifactKind::Blocks, &blocks,
1543 )?,
1544 write_sidecar_jsonl(
1545 storage_dir, &options.parent, attachment_key,
1546 MachineArtifactKind::Chunks, &chunks,
1547 )?,
1548 ];
1549
1550 let embedding_count = embed_sidecar_chunks(client, storage_dir, &options.parent, attachment_key, &chunks);
1551
1552 Ok(serde_json::json!({
1553 "provider": provider,
1554 "status": "indexed",
1555 "item_key": options.parent,
1556 "attachment_key": attachment_key,
1557 "embeddings": embedding_count,
1558 "attachment_path": attachment_path,
1559 "storage_dir": storage_dir,
1560 "blocks": blocks.len(),
1561 "chunks": chunks.len(),
1562 "artifacts": artifacts,
1563 }))
1564}
1565
1566fn embed_sidecar_chunks(
1567 client: &mut impl RpcCaller,
1568 storage_dir: &Path,
1569 item_key: &str,
1570 _attachment_key: &str,
1571 chunks: &[zotron_types::StructureChunk],
1572) -> usize {
1573 let Ok((provider, model, api_url, api_key)) = fetch_embedding_settings(client) else {
1574 return 0;
1575 };
1576 if provider.is_empty() || (api_key.is_empty() && provider != "ollama") {
1577 return 0;
1578 }
1579 let emb_chunks: Vec<EmbeddingChunkInput> = chunks
1580 .iter()
1581 .map(|c| EmbeddingChunkInput {
1582 chunk_key: c.chunk_key.clone(),
1583 text: c.text.clone(),
1584 })
1585 .collect();
1586 if emb_chunks.is_empty() {
1587 return 0;
1588 }
1589 let batch_size = 20;
1591 let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
1592 for batch in emb_chunks.chunks(batch_size) {
1593 let input = EmbeddingRequestInput {
1594 item_key: item_key.to_string(),
1595 chunks: batch.to_vec(),
1596 model: if model.is_empty() { None } else { Some(model.clone()) },
1597 url: if api_url.is_empty() { None } else { Some(api_url.clone()) },
1598 input_type: Some("document".to_string()),
1599 };
1600 let Ok(request) = build_embedding_provider_request(&provider, &input) else {
1601 break;
1602 };
1603 let Some(url) = request.url.as_deref() else { break };
1604 let mut http = ureq::post(url).set("Content-Type", "application/json");
1605 if let Some(auth) = request.auth_header {
1606 if !api_key.is_empty() {
1607 http = http.set(auth, &format!("Bearer {api_key}"));
1608 }
1609 }
1610 let Ok(resp) = http.send_json(&request.body) else { break };
1611 let Ok(payload): Result<Value, _> = resp.into_json() else { break };
1612 let Ok(vectors) = parse_embedding_provider_response(&provider, &payload, item_key, batch)
1613 else {
1614 break;
1615 };
1616 all_vectors.extend(vectors);
1617 }
1618 let count = all_vectors.len();
1619 if count > 0 {
1620 let filename = embedding_vector_filename(&provider, &model);
1621 let vectors_dir = storage_dir.join(".zotron").join("embeddings");
1622 fs::create_dir_all(&vectors_dir).map_err(|e| {
1623 eprintln!("warning: cannot create embeddings dir {}: {e}", vectors_dir.display());
1624 e
1625 }).ok();
1626 let vectors_path = vectors_dir.join(&filename);
1627 let mut out = String::new();
1628 for v in &all_vectors {
1629 if let Ok(line) = serde_json::to_string(v) {
1630 out.push_str(&line);
1631 out.push('\n');
1632 }
1633 }
1634 if let Err(e) = fs::write(&vectors_path, &out) {
1635 eprintln!("warning: failed to persist embeddings to {}: {e}", vectors_path.display());
1636 }
1637 }
1638 count
1639}
1640
1641struct MineruResultSource {
1642 task_id: Option<String>,
1643 state: String,
1644 result_dir: PathBuf,
1645 raw_zip_bytes: Option<Vec<u8>>,
1646 task_status: Option<Value>,
1647 payload: Value,
1648 content_list_file: Option<PathBuf>,
1649 markdown: Option<String>,
1650}
1651
1652struct PersistedOcrArtifacts {
1653 block_count: usize,
1654 chunk_count: usize,
1655 artifacts: Vec<Value>,
1656}
1657
1658fn resolve_attachment_path(
1659 client: &mut impl RpcCaller,
1660 attachment_key: &str,
1661) -> Result<PathBuf, String> {
1662 let payload = client.call(
1663 "attachments.getPath",
1664 Some(serde_json::json!({"key": attachment_key})),
1665 )?;
1666 let raw_path = payload
1667 .get("path")
1668 .and_then(Value::as_str)
1669 .filter(|path| !path.trim().is_empty())
1670 .ok_or_else(|| {
1671 format!("ATTACHMENT_PATH_NOT_FOUND: attachment {attachment_key} has no local PDF path")
1672 })?;
1673 Ok(PathBuf::from(local_path_from_zotero_path(raw_path)))
1674}
1675
1676fn resolve_first_pdf_attachment_key(
1678 client: &mut impl RpcCaller,
1679 parent_key: &str,
1680) -> Result<String, String> {
1681 let response = client.call(
1682 "attachments.list",
1683 Some(serde_json::json!({"parentKey": parent_key})),
1684 )?;
1685 let attachments = response
1687 .get("items")
1688 .and_then(Value::as_array)
1689 .or_else(|| response.as_array())
1690 .ok_or_else(|| {
1691 format!("NO_PDF_ATTACHMENT: no attachments found for item {parent_key}")
1692 })?;
1693 for attachment in attachments {
1694 if is_pdf_attachment(attachment) {
1695 if let Some(key) = attachment.get("key").and_then(Value::as_str) {
1696 return Ok(key.to_string());
1697 }
1698 }
1699 }
1700 Err(format!(
1701 "NO_PDF_ATTACHMENT: no PDF attachment found for item {parent_key}"
1702 ))
1703}
1704
1705fn load_mineru_result_source(
1706 options: &OcrProcessOptions,
1707 attachment_path: &Path,
1708 file_name: &str,
1709) -> Result<MineruResultSource, String> {
1710 if let Some(result_dir) = options.result_dir.as_deref() {
1711 return mineru_result_source_from_dir(PathBuf::from(result_dir), None, None, None);
1712 }
1713 if let Some(result_zip) = options.result_zip.as_deref() {
1714 let zip_path = PathBuf::from(result_zip);
1715 let zip_bytes = fs::read(&zip_path)
1716 .map_err(|err| format!("read MinerU result zip {}: {err}", zip_path.display()))?;
1717 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1718 return mineru_result_source_from_dir(result_dir, Some(zip_bytes), None, None);
1719 }
1720
1721 let Some(source_url) = options
1722 .source_url
1723 .as_deref()
1724 .filter(|value| !value.trim().is_empty())
1725 else {
1726 return submit_mineru_local_file(options, attachment_path, file_name);
1727 };
1728 let input = OcrRequestInput {
1729 item_key: options.parent.clone(),
1730 attachment_key: options.attachment.clone().expect("attachment resolved"),
1731 file_name: file_name.to_string(),
1732 mime_type: "application/pdf".to_string(),
1733 content_base64: format!("url:{source_url}"),
1734 source_url: Some(source_url.to_string()),
1735 local_path: None,
1736 output_dir: None,
1737 };
1738 let task = submit_mineru_task(
1739 &options.provider,
1740 &input,
1741 options.provider_endpoint.clone(),
1742 &options.api_key_env,
1743 )?;
1744 let task_id = task
1745 .get("data")
1746 .and_then(|data| data.get("task_id"))
1747 .and_then(Value::as_str)
1748 .ok_or_else(|| "MinerU submit response missing data.task_id".to_string())?
1749 .to_string();
1750 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1751 let status = poll_mineru_task(
1752 options.provider_endpoint.as_deref(),
1753 &task_id,
1754 &auth_header,
1755 options.poll_interval_seconds,
1756 options.timeout_seconds,
1757 )?;
1758 let zip_url = status
1759 .pointer("/data/full_zip_url")
1760 .or_else(|| status.pointer("/data/result/full_zip_url"))
1761 .and_then(Value::as_str)
1762 .ok_or_else(|| "MinerU completed task missing data.full_zip_url".to_string())?;
1763 let zip_bytes = download_bytes(zip_url)?;
1764 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1765 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(task_id))
1766}
1767
1768fn submit_mineru_local_file(
1769 options: &OcrProcessOptions,
1770 attachment_path: &Path,
1771 file_name: &str,
1772) -> Result<MineruResultSource, String> {
1773 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1774 let upload_request = create_mineru_file_upload(
1775 options.provider_endpoint.as_deref(),
1776 file_name,
1777 options.attachment.as_deref().expect("attachment resolved"),
1778 &auth_header,
1779 )?;
1780 let upload_url = upload_request
1781 .pointer("/data/file_urls/0")
1782 .or_else(|| upload_request.pointer("/data/fileUrls/0"))
1783 .and_then(Value::as_str)
1784 .ok_or_else(|| "MinerU upload URL response missing data.file_urls[0]".to_string())?;
1785 let batch_id = upload_request
1786 .pointer("/data/batch_id")
1787 .or_else(|| upload_request.pointer("/data/batchId"))
1788 .and_then(Value::as_str)
1789 .ok_or_else(|| "MinerU upload URL response missing data.batch_id".to_string())?
1790 .to_string();
1791 let bytes = fs::read(attachment_path)
1792 .map_err(|err| format!("read attachment PDF {}: {err}", attachment_path.display()))?;
1793 put_bytes(upload_url, &bytes)?;
1794 let status = poll_mineru_batch(
1795 options.provider_endpoint.as_deref(),
1796 &batch_id,
1797 &auth_header,
1798 options.poll_interval_seconds,
1799 options.timeout_seconds,
1800 )?;
1801 let zip_url = mineru_batch_zip_url(&status)
1802 .ok_or_else(|| "MinerU completed batch missing full_zip_url".to_string())?;
1803 let zip_bytes = download_bytes(&zip_url)?;
1804 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1805 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(batch_id))
1806}
1807
1808fn create_mineru_file_upload(
1809 endpoint: Option<&str>,
1810 file_name: &str,
1811 data_id: &str,
1812 auth_header: &str,
1813) -> Result<Value, String> {
1814 let url = mineru_file_urls_url(endpoint);
1815 let body = serde_json::json!({
1816 "files": [{"name": file_name, "data_id": data_id}],
1817 "model_version": "vlm",
1818 "is_ocr": false,
1819 "enable_formula": true,
1820 "enable_table": true,
1821 "language": "ch",
1822 "page_ranges": "1-200",
1823 });
1824 ureq::post(&url)
1825 .set("Authorization", auth_header)
1826 .send_json(body)
1827 .map_err(|err| format!("POST {url} failed: {err}"))?
1828 .into_json::<Value>()
1829 .map_err(|err| format!("POST {url} returned invalid JSON: {err}"))
1830}
1831
1832fn put_bytes(url: &str, bytes: &[u8]) -> Result<(), String> {
1833 ureq::put(url)
1834 .send_bytes(bytes)
1835 .map_err(|err| format!("PUT {url} failed: {err}"))?;
1836 Ok(())
1837}
1838
1839fn submit_mineru_task(
1840 provider: &str,
1841 input: &OcrRequestInput,
1842 endpoint: Option<String>,
1843 api_key_env: &str,
1844) -> Result<Value, String> {
1845 let request = build_ocr_provider_request(provider, input)?;
1846 let method = request
1847 .method
1848 .ok_or_else(|| "MinerU provider missing HTTP method".to_string())?;
1849 let mut transport = provider_http_transport_with_auth(Some(api_key_env), "bearer")?;
1850 transport.post_json(&ProviderHttpInvocation {
1851 provider: request.provider.to_string(),
1852 style: request.style.to_string(),
1853 method: method.to_string(),
1854 url: endpoint.or_else(|| request.url.map(ToString::to_string)),
1855 auth_header_name: request.auth_header.map(ToString::to_string),
1856 auth_header_value: None,
1857 body: request.body,
1858 })
1859}
1860
1861fn poll_mineru_task(
1862 endpoint: Option<&str>,
1863 task_id: &str,
1864 auth_header: &str,
1865 poll_interval_seconds: u64,
1866 timeout_seconds: u64,
1867) -> Result<Value, String> {
1868 let url = mineru_task_status_url(endpoint, task_id);
1869 let started = Instant::now();
1870 loop {
1871 let status = get_json_with_auth(&url, auth_header)?;
1872 let state = status
1873 .pointer("/data/state")
1874 .or_else(|| status.pointer("/data/status"))
1875 .and_then(Value::as_str)
1876 .unwrap_or("unknown");
1877 match state {
1878 "done" | "finished" | "success" => return Ok(status),
1879 "failed" | "error" => return Err(format!("MinerU task {task_id} failed: {status}")),
1880 _ => {
1881 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1882 return Err(format!(
1883 "MinerU task {task_id} timed out after {timeout_seconds}s with state {state}"
1884 ));
1885 }
1886 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
1887 }
1888 }
1889 }
1890}
1891
/// Builds the status URL for a MinerU extraction task.
///
/// `endpoint` defaults to `https://mineru.net/api/v4/extract/task`; trailing
/// slashes are ignored. The endpoint may be the task collection URL
/// (`.../extract/task`), the extract URL (`.../extract`) or the API base; in
/// every case the result ends in `/extract/task/{task_id}`.
fn mineru_task_status_url(endpoint: Option<&str>, task_id: &str) -> String {
    let base = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    if base.ends_with("/extract/task") {
        format!("{base}/{task_id}")
    } else if base.ends_with("/extract") {
        // Avoid producing `/extract/extract/task/...` for an `/extract` endpoint.
        format!("{base}/task/{task_id}")
    } else {
        format!("{base}/extract/task/{task_id}")
    }
}
1902
1903fn mineru_file_urls_url(endpoint: Option<&str>) -> String {
1904 let base = mineru_api_base(endpoint);
1905 format!("{base}/file-urls/batch")
1906}
1907
1908fn mineru_batch_status_url(endpoint: Option<&str>, batch_id: &str) -> String {
1909 let base = mineru_api_base(endpoint);
1910 format!("{base}/extract-results/batch/{batch_id}")
1911}
1912
/// Derives the MinerU API base URL from a configured endpoint.
///
/// Trailing slashes are removed, then a single `/extract/task` or `/extract`
/// suffix (checked in that order) is stripped. With no endpoint, the default
/// `https://mineru.net/api/v4/extract/task` is used.
fn mineru_api_base(endpoint: Option<&str>) -> String {
    const STRIPPED_SUFFIXES: [&str; 2] = ["/extract/task", "/extract"];

    let trimmed = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    STRIPPED_SUFFIXES
        .iter()
        .find_map(|suffix| trimmed.strip_suffix(*suffix))
        .unwrap_or(trimmed)
        .to_string()
}
1925
1926fn poll_mineru_batch(
1927 endpoint: Option<&str>,
1928 batch_id: &str,
1929 auth_header: &str,
1930 poll_interval_seconds: u64,
1931 timeout_seconds: u64,
1932) -> Result<Value, String> {
1933 let url = mineru_batch_status_url(endpoint, batch_id);
1934 let started = Instant::now();
1935 loop {
1936 let status = get_json_with_auth(&url, auth_header)?;
1937 let state = mineru_batch_state(&status).unwrap_or("unknown");
1938 match state {
1939 "done" | "finished" | "success" => return Ok(status),
1940 "failed" | "error" => return Err(format!("MinerU batch {batch_id} failed: {status}")),
1941 _ => {
1942 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1943 return Err(format!(
1944 "MinerU batch {batch_id} timed out after {timeout_seconds}s with state {state}"
1945 ));
1946 }
1947 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
1948 }
1949 }
1950 }
1951}
1952
1953fn mineru_batch_state(status: &Value) -> Option<&str> {
1954 status
1955 .pointer("/data/extract_result/0/state")
1956 .or_else(|| status.pointer("/data/extractResult/0/state"))
1957 .or_else(|| status.pointer("/data/state"))
1958 .and_then(Value::as_str)
1959}
1960
1961fn mineru_batch_zip_url(status: &Value) -> Option<String> {
1962 status
1963 .pointer("/data/extract_result/0/full_zip_url")
1964 .or_else(|| status.pointer("/data/extractResult/0/full_zip_url"))
1965 .or_else(|| status.pointer("/data/full_zip_url"))
1966 .and_then(Value::as_str)
1967 .map(ToString::to_string)
1968}
1969
/// Builds an `Authorization` header value from the credential stored in the
/// environment variable `api_key_env`.
///
/// The credential is trimmed. For the `bearer` and `token` schemes the result
/// is `Bearer <credential>` / `token <credential>`. A credential that already
/// starts with the scheme prefix (matched case-insensitively, since HTTP auth
/// scheme names are case-insensitive) is not prefixed a second time; its
/// prefix is rewritten to the canonical spelling. Any other scheme returns the
/// trimmed credential unchanged.
///
/// # Errors
/// Returns an error when the variable is unset, not valid Unicode, or blank.
fn provider_auth_header_value(api_key_env: &str, auth_scheme: &str) -> Result<String, String> {
    let raw = env::var(api_key_env)
        .map_err(|_| format!("missing provider credential env var {api_key_env}"))?;
    let token = raw.trim();
    if token.is_empty() {
        return Err(format!(
            "provider credential env var {api_key_env} is empty"
        ));
    }

    let prefix = match auth_scheme {
        "bearer" => "Bearer ",
        "token" => "token ",
        _ => return Ok(token.to_string()),
    };

    // `get` returns `None` when the token is shorter than the prefix or the cut
    // would split a multi-byte character, so the slice below is always valid.
    let already_prefixed = token
        .get(..prefix.len())
        .is_some_and(|head| head.eq_ignore_ascii_case(prefix));
    Ok(if already_prefixed {
        format!("{prefix}{}", &token[prefix.len()..])
    } else {
        format!("{prefix}{token}")
    })
}
1987
1988fn get_json_with_auth(url: &str, auth_header: &str) -> Result<Value, String> {
1989 ureq::get(url)
1990 .set("Authorization", auth_header)
1991 .call()
1992 .map_err(|err| format!("GET {url} failed: {err}"))?
1993 .into_json::<Value>()
1994 .map_err(|err| format!("GET {url} returned invalid JSON: {err}"))
1995}
1996
1997fn download_bytes(url: &str) -> Result<Vec<u8>, String> {
1998 let response = ureq::get(url)
1999 .call()
2000 .map_err(|err| format!("download {url} failed: {err}"))?;
2001 let mut bytes = Vec::new();
2002 response
2003 .into_reader()
2004 .read_to_end(&mut bytes)
2005 .map_err(|err| format!("read download {url}: {err}"))?;
2006 Ok(bytes)
2007}
2008
2009fn extract_zip_bytes_to_temp(prefix: &str, zip_bytes: &[u8]) -> Result<PathBuf, String> {
2010 let dir = unique_temp_path(prefix);
2011 fs::create_dir_all(&dir).map_err(|err| format!("create temp dir {}: {err}", dir.display()))?;
2012 let zip_path = dir.with_extension("zip");
2013 fs::write(&zip_path, zip_bytes)
2014 .map_err(|err| format!("write temp zip {}: {err}", zip_path.display()))?;
2015 let output = ProcessCommand::new("unzip")
2016 .arg("-q")
2017 .arg("-o")
2018 .arg(&zip_path)
2019 .arg("-d")
2020 .arg(&dir)
2021 .output()
2022 .map_err(|err| format!("run unzip: {err}"))?;
2023 if !output.status.success() {
2024 return Err(format!(
2025 "unzip {} failed: {}",
2026 zip_path.display(),
2027 String::from_utf8_lossy(&output.stderr).trim()
2028 ));
2029 }
2030 Ok(dir)
2031}
2032
/// Returns a path inside the system temp directory that has not been handed
/// out before by this process.
///
/// The file name is `{prefix}-{pid}-{nanos}-{sequence}`. The process-wide
/// sequence number keeps names distinct even when two calls observe the same
/// timestamp, or when the clock reads before the Unix epoch and `nanos` falls
/// back to `0`. Nothing is created on disk.
fn unique_temp_path(prefix: &str) -> PathBuf {
    static SEQUENCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|duration| duration.as_nanos())
        .unwrap_or(0);
    // Only uniqueness matters here, so relaxed ordering is sufficient.
    let sequence = SEQUENCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    env::temp_dir().join(format!(
        "{prefix}-{}-{nanos}-{sequence}",
        std::process::id()
    ))
}
2040
2041fn mineru_result_source_from_dir(
2042 result_dir: PathBuf,
2043 raw_zip_bytes: Option<Vec<u8>>,
2044 task_status: Option<Value>,
2045 task_id: Option<String>,
2046) -> Result<MineruResultSource, String> {
2047 let (payload, content_list_file) = mineru_payload_from_result_dir(&result_dir)?;
2048 let markdown = find_first_file_by_name(&result_dir, "full.md")
2049 .map(|path| {
2050 fs::read_to_string(&path)
2051 .map_err(|err| format!("read native markdown {}: {err}", path.display()))
2052 })
2053 .transpose()?;
2054 Ok(MineruResultSource {
2055 task_id,
2056 state: "done".to_string(),
2057 result_dir,
2058 raw_zip_bytes,
2059 task_status,
2060 payload,
2061 content_list_file,
2062 markdown,
2063 })
2064}
2065
2066fn mineru_payload_from_result_dir(result_dir: &Path) -> Result<(Value, Option<PathBuf>), String> {
2067 let v2 = find_first_file_with_suffix(result_dir, "_content_list_v2.json");
2068 if let Some(path) = v2 {
2069 let value = read_json_file(&path)?;
2070 return Ok((serde_json::json!({"content_list_v2": value}), Some(path)));
2071 }
2072 let content_list = find_first_file_with_suffix(result_dir, "_content_list.json");
2073 if let Some(path) = content_list {
2074 let value = read_json_file(&path)?;
2075 return Ok((serde_json::json!({"content_list": value}), Some(path)));
2076 }
2077 let layout = find_first_file_by_name(result_dir, "layout.json");
2078 if let Some(path) = layout {
2079 return Ok((read_json_file(&path)?, Some(path)));
2080 }
2081 let markdown = find_first_file_by_name(result_dir, "full.md");
2082 if let Some(path) = markdown {
2083 let text = fs::read_to_string(&path)
2084 .map_err(|err| format!("read native markdown {}: {err}", path.display()))?;
2085 return Ok((serde_json::json!({"result": text}), Some(path)));
2086 }
2087 Err(format!(
2088 "MinerU result directory {} missing content_list_v2/content_list/layout/full.md",
2089 result_dir.display()
2090 ))
2091}
2092
2093fn read_json_file(path: &Path) -> Result<Value, String> {
2094 let raw = fs::read_to_string(path).map_err(|err| format!("read {}: {err}", path.display()))?;
2095 serde_json::from_str(&raw).map_err(|err| format!("parse JSON {}: {err}", path.display()))
2096}
2097
2098fn persist_mineru_result_sidecars(
2099 storage_dir: &Path,
2100 item_key: &str,
2101 attachment_key: &str,
2102 provider: &str,
2103 source: &MineruResultSource,
2104 chunk_chars: usize,
2105) -> Result<PersistedOcrArtifacts, String> {
2106 let blocks = parse_ocr_provider_response(provider, &source.payload, item_key, attachment_key)?;
2107 let chunks = zotron_types::chunks_from_blocks(&blocks, chunk_chars);
2108 let assets = copy_mineru_assets(&source.result_dir, storage_dir)?;
2109 let raw_bundle = serde_json::json!({
2110 "provider": provider,
2111 "item_key": item_key,
2112 "attachment_key": attachment_key,
2113 "task_id": source.task_id,
2114 "state": source.state,
2115 "task_status": source.task_status,
2116 "content_list_file": source.content_list_file,
2117 "payload": source.payload,
2118 });
2119
2120 let mut artifacts = Vec::new();
2121 artifacts.push(write_sidecar_json(
2122 storage_dir,
2123 item_key,
2124 attachment_key,
2125 MachineArtifactKind::OcrRaw,
2126 &raw_bundle,
2127 )?);
2128 artifacts.push(write_sidecar_jsonl(
2129 storage_dir,
2130 item_key,
2131 attachment_key,
2132 MachineArtifactKind::Blocks,
2133 &blocks,
2134 )?);
2135 artifacts.push(write_sidecar_jsonl(
2136 storage_dir,
2137 item_key,
2138 attachment_key,
2139 MachineArtifactKind::Chunks,
2140 &chunks,
2141 )?);
2142 if let Some(markdown) = source.markdown.as_deref() {
2143 artifacts.push(write_sidecar_bytes(
2144 storage_dir,
2145 item_key,
2146 attachment_key,
2147 MachineArtifactKind::OcrNativeMarkdown,
2148 markdown.as_bytes(),
2149 )?);
2150 }
2151 artifacts.push(write_sidecar_json(
2152 storage_dir,
2153 item_key,
2154 attachment_key,
2155 MachineArtifactKind::OcrNativeAssets,
2156 &assets,
2157 )?);
2158 if let Some(bytes) = source.raw_zip_bytes.as_deref() {
2159 artifacts.push(write_extra_sidecar_bytes(
2160 storage_dir,
2161 ".zotron/ocr/latest.raw.zip",
2162 bytes,
2163 )?);
2164 }
2165
2166 Ok(PersistedOcrArtifacts {
2167 block_count: blocks.len(),
2168 chunk_count: chunks.len(),
2169 artifacts,
2170 })
2171}
2172
2173fn write_sidecar_json(
2174 storage_dir: &Path,
2175 item_key: &str,
2176 attachment_key: &str,
2177 kind: MachineArtifactKind,
2178 value: &Value,
2179) -> Result<Value, String> {
2180 let bytes = serde_json::to_vec_pretty(value).map_err(|err| err.to_string())?;
2181 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, &bytes)
2182}
2183
2184fn write_sidecar_jsonl<T: serde::Serialize>(
2185 storage_dir: &Path,
2186 item_key: &str,
2187 attachment_key: &str,
2188 kind: MachineArtifactKind,
2189 values: &[T],
2190) -> Result<Value, String> {
2191 let mut out = String::new();
2192 for value in values {
2193 out.push_str(&serde_json::to_string(value).map_err(|err| err.to_string())?);
2194 out.push('\n');
2195 }
2196 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, out.as_bytes())
2197}
2198
2199fn write_sidecar_bytes(
2200 storage_dir: &Path,
2201 item_key: &str,
2202 attachment_key: &str,
2203 kind: MachineArtifactKind,
2204 bytes: &[u8],
2205) -> Result<Value, String> {
2206 let record = write_machine_artifact_sidecar(storage_dir, item_key, attachment_key, kind, bytes)
2207 .map_err(|err| format!("write sidecar {:?}: {err}", kind))?;
2208 Ok(serde_json::json!({
2209 "kind": kind,
2210 "relative_path": record.relative_path,
2211 "absolute_path": record.absolute_path,
2212 }))
2213}
2214
2215fn write_extra_sidecar_bytes(
2216 storage_dir: &Path,
2217 relative_path: &str,
2218 bytes: &[u8],
2219) -> Result<Value, String> {
2220 let absolute_path = storage_dir.join(relative_path);
2221 if let Some(parent) = absolute_path.parent() {
2222 fs::create_dir_all(parent).map_err(|err| format!("create {}: {err}", parent.display()))?;
2223 }
2224 fs::write(&absolute_path, bytes)
2225 .map_err(|err| format!("write sidecar {}: {err}", absolute_path.display()))?;
2226 Ok(serde_json::json!({
2227 "kind": "ocr_raw_zip",
2228 "relative_path": relative_path,
2229 "absolute_path": absolute_path,
2230 }))
2231}
2232
2233fn copy_mineru_assets(result_dir: &Path, storage_dir: &Path) -> Result<Value, String> {
2234 let mut images = Vec::new();
2235 for file in collect_files(result_dir)? {
2236 if !is_image_file(&file) {
2237 continue;
2238 }
2239 let relative = file.strip_prefix(result_dir).unwrap_or(&file).to_path_buf();
2240 let destination = storage_dir.join(".zotron").join("ocr").join(&relative);
2241 if let Some(parent) = destination.parent() {
2242 fs::create_dir_all(parent)
2243 .map_err(|err| format!("create {}: {err}", parent.display()))?;
2244 }
2245 fs::copy(&file, &destination).map_err(|err| {
2246 format!(
2247 "copy MinerU asset {} to {}: {err}",
2248 file.display(),
2249 destination.display()
2250 )
2251 })?;
2252 images.push(serde_json::json!({
2253 "source_relative": relative,
2254 "sidecar_relative": PathBuf::from(".zotron").join("ocr").join(&relative),
2255 "absolute_path": destination,
2256 }));
2257 }
2258 Ok(serde_json::json!({
2259 "provider": "mineru",
2260 "images": images,
2261 }))
2262}
2263
/// Reports whether `path` has a png, jpg, jpeg, webp or gif extension,
/// compared ASCII case-insensitively. Paths without a UTF-8 extension are not
/// images.
fn is_image_file(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 5] = ["png", "jpg", "jpeg", "webp", "gif"];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => IMAGE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
2274
2275fn find_first_file_with_suffix(root: &Path, suffix: &str) -> Option<PathBuf> {
2276 collect_files(root).ok()?.into_iter().find(|path| {
2277 path.file_name()
2278 .and_then(|name| name.to_str())
2279 .is_some_and(|name| name.ends_with(suffix))
2280 })
2281}
2282
2283fn find_first_file_by_name(root: &Path, name: &str) -> Option<PathBuf> {
2284 collect_files(root).ok()?.into_iter().find(|path| {
2285 path.file_name()
2286 .and_then(|file_name| file_name.to_str())
2287 .is_some_and(|file_name| file_name == name)
2288 })
2289}
2290
2291fn collect_files(root: &Path) -> Result<Vec<PathBuf>, String> {
2292 let mut files = Vec::new();
2293 collect_files_into(root, &mut files)?;
2294 files.sort();
2295 Ok(files)
2296}
2297
/// Appends every regular file under `root` to `files`, descending into
/// subdirectories depth-first.
///
/// Entries that are neither regular files nor directories are skipped.
///
/// # Errors
/// Returns an error when a directory cannot be read or an entry's type cannot
/// be determined. Files collected before the error remain in `files`.
fn collect_files_into(root: &Path, files: &mut Vec<PathBuf>) -> Result<(), String> {
    let entries =
        fs::read_dir(root).map_err(|err| format!("read dir {}: {err}", root.display()))?;
    for entry in entries {
        let entry = match entry {
            Ok(entry) => entry,
            Err(err) => return Err(format!("read dir entry {}: {err}", root.display())),
        };
        let path = entry.path();
        let kind = match entry.file_type() {
            Ok(kind) => kind,
            Err(err) => return Err(format!("stat {}: {err}", path.display())),
        };
        if kind.is_file() {
            files.push(path);
        } else if kind.is_dir() {
            collect_files_into(&path, files)?;
        }
    }
    Ok(())
}
2313
2314fn ocr_async_task_result(provider: &str, payload: &Value) -> Option<Value> {
2315 let data = payload.get("data")?;
2316 let task_id = data.get("task_id").and_then(Value::as_str)?;
2317 Some(serde_json::json!({
2318 "provider": provider,
2319 "status": "submitted",
2320 "task_id": task_id,
2321 "state": data.get("state").and_then(Value::as_str).unwrap_or("submitted"),
2322 "result_url": data.get("full_zip_url").or_else(|| data.get("markdown_url")).cloned(),
2323 "raw": payload,
2324 }))
2325}
2326
2327fn ocr_input_from_file(
2328 file: String,
2329 item_key: Option<String>,
2330 attachment_key: Option<String>,
2331 mime_type: Option<String>,
2332) -> Result<OcrRequestInput, String> {
2333 let item_key = item_key
2334 .ok_or_else(|| "INVALID_ARGS: --item-key is required when using --file".to_string())?;
2335 let attachment_key = attachment_key.ok_or_else(|| {
2336 "INVALID_ARGS: --attachment-key is required when using --file".to_string()
2337 })?;
2338 let path = PathBuf::from(&file);
2339 let bytes = fs::read(&path).map_err(|err| format!("read {file}: {err}"))?;
2340 let file_name = path
2341 .file_name()
2342 .and_then(|name| name.to_str())
2343 .unwrap_or("document.pdf")
2344 .to_string();
2345 let mime_type = mime_type.unwrap_or_else(|| guess_mime_type(&path).to_string());
2346 Ok(OcrRequestInput {
2347 item_key,
2348 attachment_key,
2349 file_name,
2350 mime_type,
2351 content_base64: base64_encode(&bytes),
2352 source_url: None,
2353 local_path: Some(file),
2354 output_dir: None,
2355 })
2356}
2357
/// Guesses a MIME type from the extension of `path`, ignoring ASCII case.
///
/// Recognises `png`, `jpg`/`jpeg`, `webp` and `gif`. Anything else,
/// including a missing or non-UTF-8 extension, is reported as
/// `application/pdf`.
fn guess_mime_type(path: &Path) -> &'static str {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();
    match extension.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "webp" => "image/webp",
        // `is_image_file` treats gif as an image, so it must not be
        // labelled as a PDF here.
        "gif" => "image/gif",
        _ => "application/pdf",
    }
}
2372
/// Encodes `bytes` using the standard base64 alphabet (`A-Z a-z 0-9 + /`)
/// with `=` padding.
fn base64_encode(bytes: &[u8]) -> String {
    const ALPHABET: &[u8; 64] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    // Maps the low six bits of `index` to its alphabet character.
    let sextet = |index: u32| ALPHABET[(index & 0x3f) as usize] as char;

    let mut encoded = String::with_capacity(bytes.len().div_ceil(3) * 4);
    let mut groups = bytes.chunks_exact(3);
    for group in &mut groups {
        let word =
            (u32::from(group[0]) << 16) | (u32::from(group[1]) << 8) | u32::from(group[2]);
        encoded.push(sextet(word >> 18));
        encoded.push(sextet(word >> 12));
        encoded.push(sextet(word >> 6));
        encoded.push(sextet(word));
    }

    // One or two trailing bytes are zero-extended and padded with '='.
    match groups.remainder() {
        &[first] => {
            let word = u32::from(first) << 16;
            encoded.push(sextet(word >> 18));
            encoded.push(sextet(word >> 12));
            encoded.push_str("==");
        }
        &[first, second] => {
            let word = (u32::from(first) << 16) | (u32::from(second) << 8);
            encoded.push(sextet(word >> 18));
            encoded.push(sextet(word >> 12));
            encoded.push(sextet(word >> 6));
            encoded.push('=');
        }
        _ => {}
    }
    encoded
}
2395
2396fn run_ocr_status_command(
2397 client: &mut impl RpcCaller,
2398 collection: String,
2399) -> Result<Value, String> {
2400 let collection_key = find_collection_in_tree(client, &collection)?
2401 .and_then(|node| node.get("key").cloned())
2402 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
2403 let raw = paginate_rpc(
2404 client,
2405 "collections.getItems",
2406 serde_json::json!({"key": collection_key}),
2407 500,
2408 )?;
2409 let items = raw
2410 .get("items")
2411 .and_then(Value::as_array)
2412 .or_else(|| raw.as_array())
2413 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
2414 .clone();
2415
2416 let mut has_ocr = 0usize;
2417 for item in &items {
2418 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
2419 if has_ocr_artifact(client, &item_key)? || has_ocr_note(client, &item_key)? {
2420 has_ocr += 1;
2421 }
2422 }
2423
2424 Ok(serde_json::json!({
2425 "collection": collection,
2426 "total": items.len(),
2427 "has_ocr": has_ocr,
2428 "missing_ocr": items.len() - has_ocr,
2429 }))
2430}
2431
2432fn has_ocr_artifact(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2433 if let Some(item_key) = item_key.as_str() {
2434 if machine_artifact_exists_for_item(
2437 machine_artifact_store_root(),
2438 item_key,
2439 MachineArtifactKind::Chunks,
2440 ) {
2441 return Ok(true);
2442 }
2443 }
2444
2445 let attachments = client.call(
2446 "attachments.list",
2447 Some(serde_json::json!({"parentKey": item_key.clone()})),
2448 )?;
2449 Ok(attachments.as_array().is_some_and(|attachments| {
2450 attachments.iter().any(|attachment| {
2451 let has_sidecar_chunks = attachment
2452 .get("path")
2453 .and_then(Value::as_str)
2454 .map(local_path_from_zotero_path)
2455 .as_deref()
2456 .map(Path::new)
2457 .and_then(Path::parent)
2458 .is_some_and(|dir| {
2459 machine_artifact_exists_in_sidecar(dir, MachineArtifactKind::Chunks)
2460 });
2461 if has_sidecar_chunks {
2462 return true;
2463 }
2464
2465 attachment
2467 .get("title")
2468 .and_then(Value::as_str)
2469 .is_some_and(|title| title.ends_with("zotron-chunks.jsonl"))
2470 })
2471 }))
2472}
2473
2474fn local_path_from_zotero_path(path: &str) -> String {
2475 if is_wsl() && path.as_bytes().get(1) == Some(&b':') {
2476 return ProcessCommand::new("wslpath")
2477 .arg("-u")
2478 .arg(path)
2479 .output()
2480 .ok()
2481 .filter(|output| output.status.success())
2482 .and_then(|output| String::from_utf8(output.stdout).ok())
2483 .map(|converted| converted.trim().to_string())
2484 .filter(|converted| !converted.is_empty())
2485 .unwrap_or_else(|| path.to_string());
2486 }
2487 path.to_string()
2488}
2489
2490fn has_ocr_note(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2491 let notes = client.call(
2492 "notes.list",
2493 Some(serde_json::json!({"parentKey": item_key.clone()})),
2494 )?;
2495 Ok(notes.as_array().is_some_and(|notes| {
2496 notes.iter().any(|note| {
2497 note.get("tags")
2498 .and_then(Value::as_array)
2499 .is_some_and(|tags| tags.iter().any(tag_is_ocr))
2500 })
2501 }))
2502}
2503
2504fn tag_is_ocr(tag: &Value) -> bool {
2505 tag.as_str() == Some("ocr")
2506 || tag
2507 .get("tag")
2508 .and_then(Value::as_str)
2509 .is_some_and(|tag| tag == "ocr")
2510}
2511
2512fn find_collection_in_tree(
2513 client: &mut impl RpcCaller,
2514 collection: &str,
2515) -> Result<Option<Value>, String> {
2516 let tree = client.call("collections.tree", None)?;
2517 let nodes = tree
2518 .as_array()
2519 .ok_or_else(|| "collections.tree returned non-array result".to_string())?;
2520 Ok(search_collection_tree(nodes, collection).cloned())
2521}
2522
2523fn search_collection_tree<'a>(nodes: &'a [Value], collection: &str) -> Option<&'a Value> {
2524 for node in nodes {
2525 if node.get("key").and_then(Value::as_str) == Some(collection)
2526 || node.get("name").and_then(Value::as_str) == Some(collection)
2527 {
2528 return Some(node);
2529 }
2530 if let Some(children) = node.get("children").and_then(Value::as_array) {
2531 if let Some(found) = search_collection_tree(children, collection) {
2532 return Some(found);
2533 }
2534 }
2535 }
2536 None
2537}
2538
2539fn run_command(command: Command, client: &mut impl RpcCaller) -> Result<String, String> {
2540 if let Command::Export(args) = command {
2541 return run_export(args, client);
2542 }
2543
2544 let (value, style) = match command {
2545 Command::Ping { .. } => (
2546 call_json(client, "system.ping", None)?,
2547 JsonStyle::PythonCompact,
2548 ),
2549 Command::Rpc {
2550 method,
2551 params_json,
2552 paginate,
2553 page_size,
2554 ..
2555 } => {
2556 let params = serde_json::from_str::<Value>(¶ms_json)
2557 .map_err(|err| format!("INVALID_JSON: params must be a JSON object: {err}"))?;
2558 if !params.is_object() {
2559 return Err("INVALID_JSON: params must be a JSON object".to_string());
2560 }
2561 if paginate {
2562 (
2563 paginate_rpc(client, &method, params, page_size)?,
2564 JsonStyle::Pretty,
2565 )
2566 } else {
2567 (call_json(client, &method, Some(params))?, JsonStyle::Pretty)
2568 }
2569 }
2570 Command::Push {
2571 json_file,
2572 pdf,
2573 collection,
2574 on_duplicate,
2575 dry_run,
2576 ..
2577 } => return run_push_command(json_file, pdf, collection, on_duplicate, dry_run, client),
2578 Command::System { command } => run_system_command(command, client)?,
2579 Command::Search(args) => {
2580 if let Some(mgmt) = args.management {
2581 run_search_management_command(mgmt, client)?
2582 } else {
2583 run_search(args, client)?
2584 }
2585 }
2586 Command::Items { command } => run_items_command(command, client)?,
2587 Command::Collections { command } => run_collections_command(command, client)?,
2588 Command::Notes { command } => run_notes_command(command, client)?,
2589 Command::Settings { command } => run_settings_command(command, client)?,
2590 Command::Tags { command } => run_tags_command(command, client)?,
2591 Command::Annotations { command } => run_annotations_command(command, client)?,
2592 Command::Ocr { command } => {
2593 return run_ocr_command(command, client);
2594 }
2595 Command::Rag { command } => {
2596 return run_rag_command(command, client);
2597 }
2598 Command::Export(_) => unreachable!("export commands return raw output above"),
2599 };
2600
2601 format_json(&value, style)
2602}
2603
2604fn run_rag_command(command: RagCommand, client: &mut impl RpcCaller) -> Result<String, String> {
2605 match command {
2606 RagCommand::Providers => format_json(
2607 &serde_json::json!({
2608 "providers": [
2609 embedding_provider_spec("volcengine")?,
2610 embedding_provider_spec("alibaba")?,
2611 embedding_provider_spec("custom")?,
2612 ],
2613 }),
2614 JsonStyle::Pretty,
2615 ),
2616 RagCommand::Embed {
2617 provider,
2618 input,
2619 endpoint,
2620 model,
2621 input_type,
2622 api_key_env,
2623 } => {
2624 let value = run_embedding_provider_json_command(
2625 provider,
2626 input,
2627 endpoint,
2628 model,
2629 input_type,
2630 api_key_env,
2631 )?;
2632 format_json(&value, JsonStyle::PythonCompact)
2633 }
2634 RagCommand::Status { collection, .. } => {
2635 let value = rag_status_value(client, &collection)?;
2636 format_json(&value, JsonStyle::PythonCompact)
2637 }
2638 RagCommand::Search {
2639 query,
2640 collection,
2641 keys,
2642 zotero,
2643 top_spans_per_item,
2644 include_fulltext_spans,
2645 top_k,
2646 output,
2647 ..
2648 } => run_rag_search_command(
2649 client,
2650 RagSearchOptions {
2651 query,
2652 collection,
2653 keys,
2654 zotero,
2655 top_spans_per_item,
2656 include_fulltext_spans,
2657 top_k,
2658 output,
2659 },
2660 ),
2661 }
2662}
2663
2664fn run_embedding_provider_json_command(
2665 provider: String,
2666 input: String,
2667 endpoint: Option<String>,
2668 model: Option<String>,
2669 input_type: Option<String>,
2670 api_key_env: Option<String>,
2671) -> Result<Value, String> {
2672 let mut input: EmbeddingRequestInput = read_json_input(&input)?;
2673 if endpoint.is_some() {
2674 input.url = endpoint;
2675 }
2676 if model.is_some() {
2677 input.model = model;
2678 }
2679 if input_type.is_some() {
2680 input.input_type = input_type;
2681 }
2682 let mut transport = provider_http_transport(api_key_env.as_deref())?;
2683 let vectors = execute_embedding_provider_request(&provider, &input, &mut transport)?;
2684
2685 Ok(serde_json::json!({
2686 "provider": provider,
2687 "vectors": vectors,
2688 }))
2689}
2690
2691fn provider_http_transport(api_key_env: Option<&str>) -> Result<UreqProviderHttpTransport, String> {
2692 provider_http_transport_with_auth(api_key_env, "bearer")
2693}
2694
2695fn provider_http_transport_with_auth(
2696 api_key_env: Option<&str>,
2697 auth_scheme: &str,
2698) -> Result<UreqProviderHttpTransport, String> {
2699 let Some(env_name) = api_key_env else {
2700 return Ok(UreqProviderHttpTransport::new());
2701 };
2702 let token = env::var(env_name)
2703 .map_err(|_| format!("missing provider credential env var {env_name}"))?;
2704 if token.trim().is_empty() {
2705 return Err(format!("provider credential env var {env_name} is empty"));
2706 }
2707 let token = token.trim();
2708 match auth_scheme {
2709 "token" if token.starts_with("token ") => {
2710 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2711 }
2712 "token" => Ok(UreqProviderHttpTransport::with_api_key(format!(
2713 "token {token}"
2714 ))),
2715 "bearer" if token.starts_with("Bearer ") => {
2716 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2717 }
2718 "bearer" => Ok(UreqProviderHttpTransport::with_bearer_token(token)),
2719 "none" => Ok(UreqProviderHttpTransport::new()),
2720 other => Err(format!("unsupported provider auth scheme {other}")),
2721 }
2722}
2723
2724fn read_json_input<T: serde::de::DeserializeOwned>(path: &str) -> Result<T, String> {
2725 let payload = if path == "-" {
2726 let mut input = String::new();
2727 io::stdin()
2728 .read_to_string(&mut input)
2729 .map_err(|err| format!("read stdin: {err}"))?;
2730 input
2731 } else {
2732 fs::read_to_string(path).map_err(|err| format!("read {path}: {err}"))?
2733 };
2734 serde_json::from_str::<T>(&payload)
2735 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))
2736}
2737
2738fn fetch_embedding_settings(
2739 client: &mut impl RpcCaller,
2740) -> Result<(String, String, String, String), String> {
2741 let settings = client.call("settings.getAll", None)?;
2742 let provider = settings
2743 .get("embedding.provider")
2744 .and_then(Value::as_str)
2745 .unwrap_or("ollama")
2746 .to_string();
2747 let model = settings
2748 .get("embedding.model")
2749 .and_then(Value::as_str)
2750 .unwrap_or("")
2751 .to_string();
2752 let api_url = settings
2753 .get("embedding.apiUrl")
2754 .and_then(Value::as_str)
2755 .unwrap_or("")
2756 .to_string();
2757 let raw = client.call("settings.getRaw", Some(serde_json::json!({"key": "embedding.apiKey"})))?;
2758 let api_key = raw
2759 .get("embedding.apiKey")
2760 .and_then(Value::as_str)
2761 .unwrap_or("")
2762 .to_string();
2763 Ok((provider, model, api_url, api_key))
2764}
2765
2766fn fetch_retrieval_mode(client: &mut impl RpcCaller) -> String {
2767 client
2768 .call(
2769 "settings.get",
2770 Some(serde_json::json!({"key": "rag.retrievalMode"})),
2771 )
2772 .ok()
2773 .and_then(|v| {
2774 v.get("rag.retrievalMode")
2775 .and_then(Value::as_str)
2776 .map(String::from)
2777 })
2778 .unwrap_or_else(|| "hybrid".to_string())
2779}
2780
2781fn resolve_sidecar_paths(
2782 client: &mut impl RpcCaller,
2783 collection: Option<&str>,
2784 keys: &[String],
2785) -> Result<Vec<(String, String, PathBuf)>, String> {
2786 let items = if !keys.is_empty() {
2787 let mut items = Vec::new();
2788 for key in keys {
2789 let item = client.call("items.get", Some(serde_json::json!({"key": key})))?;
2790 items.push(item);
2791 }
2792 items
2793 } else if let Some(col) = collection {
2794 let col_key = resolve_collection(client, col)?;
2795 let response = client.call(
2796 "collections.getItems",
2797 Some(serde_json::json!({"key": col_key})),
2798 )?;
2799 collection_items(&response)
2800 } else {
2801 return Err("INVALID_ARGS: --collection or --key required".into());
2802 };
2803
2804 let mut results = Vec::new();
2805 for item in &items {
2806 let item_key = item.get("key").and_then(Value::as_str).unwrap_or_default();
2807 let attachments = client.call(
2808 "attachments.list",
2809 Some(serde_json::json!({"parentKey": item_key})),
2810 )?;
2811 let att_list = attachments
2812 .get("items")
2813 .and_then(Value::as_array)
2814 .or_else(|| attachments.as_array())
2815 .cloned()
2816 .unwrap_or_default();
2817 for att in &att_list {
2818 let content_type = att
2819 .get("contentType")
2820 .and_then(Value::as_str)
2821 .unwrap_or("");
2822 if content_type != "application/pdf" {
2823 continue;
2824 }
2825 let att_key = att.get("key").and_then(Value::as_str).unwrap_or_default();
2826 let path = att.get("path").and_then(Value::as_str).unwrap_or_default();
2827 if path.is_empty() {
2828 continue;
2829 }
2830 let local_path = local_path_from_zotero_path(path);
2831 let pdf_path = PathBuf::from(&local_path);
2832 if let Some(parent) = pdf_path.parent() {
2833 let sidecar_root = parent.join(".zotron");
2834 if sidecar_root.exists() {
2835 results.push((item_key.to_string(), att_key.to_string(), sidecar_root));
2836 }
2837 }
2838 }
2839 }
2840 Ok(results)
2841}
2842
2843fn load_sidecar_chunks(sidecar_root: &Path) -> Vec<StructureChunk> {
2844 let chunks_path = sidecar_root.join("chunks").join("chunks.v1.jsonl");
2845 let Ok(content) = fs::read_to_string(&chunks_path) else {
2846 return Vec::new();
2847 };
2848 content
2849 .lines()
2850 .filter(|line| !line.trim().is_empty())
2851 .filter_map(|line| serde_json::from_str::<StructureChunk>(line).ok())
2852 .collect()
2853}
2854
/// Derives the JSONL file name used for vectors of a provider/model pair.
///
/// Both parts are trimmed, lower-cased and have `/` replaced by `-`. The
/// result is `<provider>--<model>.jsonl`, or `vectors.jsonl` when both
/// parts end up empty.
fn embedding_vector_filename(provider: &str, model: &str) -> String {
    let slug = |raw: &str| raw.trim().to_lowercase().replace('/', "-");
    let provider_slug = slug(provider);
    let model_slug = slug(model);

    match (provider_slug.is_empty(), model_slug.is_empty()) {
        (true, true) => "vectors.jsonl".to_string(),
        _ => format!("{provider_slug}--{model_slug}.jsonl"),
    }
}
2863
2864fn load_sidecar_vectors(sidecar_root: &Path, provider: &str, model: &str) -> Vec<EmbeddingVector> {
2865 let embeddings_dir = sidecar_root.join("embeddings");
2866 let target = embedding_vector_filename(provider, model);
2867 let target_path = embeddings_dir.join(&target);
2868 if let Ok(content) = fs::read_to_string(&target_path) {
2869 let vecs: Vec<EmbeddingVector> = content
2870 .lines()
2871 .filter(|line| !line.trim().is_empty())
2872 .filter_map(|line| serde_json::from_str(line).ok())
2873 .collect();
2874 if !vecs.is_empty() {
2875 return vecs;
2876 }
2877 }
2878 for legacy in &["vectors.v1.jsonl", "vectors.jsonl"] {
2880 let path = embeddings_dir.join(legacy);
2881 if let Ok(content) = fs::read_to_string(&path) {
2882 let vecs: Vec<EmbeddingVector> = content
2883 .lines()
2884 .filter(|line| !line.trim().is_empty())
2885 .filter_map(|line| serde_json::from_str::<EmbeddingVector>(line).ok())
2886 .filter(|v| v.source_provider == provider || provider.is_empty())
2887 .collect();
2888 if !vecs.is_empty() {
2889 return vecs;
2890 }
2891 }
2892 }
2893 Vec::new()
2894}
2895
2896fn embed_query_text(
2897 query: &str,
2898 provider: &str,
2899 model: &str,
2900 api_url: &str,
2901 api_key: &str,
2902) -> Result<Vec<f64>, String> {
2903 let input = EmbeddingRequestInput {
2904 item_key: "query".to_string(),
2905 chunks: vec![EmbeddingChunkInput {
2906 chunk_key: "q0".to_string(),
2907 text: query.to_string(),
2908 }],
2909 model: if model.is_empty() {
2910 None
2911 } else {
2912 Some(model.to_string())
2913 },
2914 url: if api_url.is_empty() {
2915 None
2916 } else {
2917 Some(api_url.to_string())
2918 },
2919 input_type: Some("query".to_string()),
2920 };
2921 let request = build_embedding_provider_request(provider, &input)?;
2922 let url = request
2923 .url
2924 .as_deref()
2925 .ok_or("no embedding URL configured")?;
2926 let mut http = ureq::post(url).set("Content-Type", "application/json");
2927 if let Some(auth) = request.auth_header {
2928 if !api_key.is_empty() {
2929 http = http.set(auth, &format!("Bearer {api_key}"));
2930 }
2931 }
2932 let resp = http
2933 .send_json(&request.body)
2934 .map_err(|e| format!("embedding request failed: {e}"))?;
2935 let payload: Value = resp
2936 .into_json()
2937 .map_err(|e| format!("embedding response parse: {e}"))?;
2938 let vectors =
2939 parse_embedding_provider_response(provider, &payload, "query", &input.chunks)?;
2940 vectors
2941 .into_iter()
2942 .next()
2943 .map(|v| v.vector)
2944 .ok_or_else(|| "no embedding vector returned".to_string())
2945}
2946
2947fn run_rag_search_xpi_fallback(
2948 client: &mut impl RpcCaller,
2949 options: &RagSearchOptions,
2950) -> Result<String, String> {
2951 let mut params = serde_json::json!({
2952 "query": options.query,
2953 "limit": options.top_k,
2954 "top_spans_per_item": options.top_spans_per_item,
2955 "include_fulltext_spans": options.include_fulltext_spans,
2956 });
2957 if let Some(map) = params.as_object_mut() {
2958 if let Some(col) = &options.collection {
2959 map.insert("collection".into(), Value::String(col.clone()));
2960 }
2961 if !options.keys.is_empty() {
2962 map.insert(
2963 "keys".into(),
2964 Value::Array(options.keys.iter().map(|k| Value::String(k.clone())).collect()),
2965 );
2966 }
2967 }
2968 let payload = client.call("rag.searchHits", Some(params))?;
2969 let hits = payload
2970 .get("hits")
2971 .and_then(Value::as_array)
2972 .cloned()
2973 .unwrap_or_default();
2974 if options.output == "jsonl" {
2975 let mut out = String::new();
2976 for hit in &hits {
2977 out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
2978 out.push('\n');
2979 }
2980 Ok(out)
2981 } else {
2982 let total = hits.len() as u64;
2983 format_json(
2984 &normalize_list_envelope(
2985 serde_json::json!({"items": hits, "total": total}),
2986 "items",
2987 Some(options.top_k),
2988 0,
2989 ),
2990 JsonStyle::Pretty,
2991 )
2992 }
2993}
2994
2995fn run_rag_search_command(
2996 client: &mut impl RpcCaller,
2997 options: RagSearchOptions,
2998) -> Result<String, String> {
2999 if options.zotero {
3001 if options.collection.is_none() && options.keys.is_empty() {
3002 return Err(
3003 "INVALID_ARGS: --collection or --key is required".to_string(),
3004 );
3005 }
3006 return run_rag_search_xpi_fallback(client, &options);
3007 }
3008
3009 if options.collection.is_none() && options.keys.is_empty() {
3011 return Err("INVALID_ARGS: --collection or --key required".to_string());
3012 }
3013
3014 let sidecars = resolve_sidecar_paths(
3016 client,
3017 options.collection.as_deref(),
3018 &options.keys,
3019 );
3020
3021 let sidecars = match sidecars {
3024 Ok(ref s) if !s.is_empty() => s,
3025 Err(ref e) if e.contains("COLLECTION_NOT_FOUND") => return Err(e.clone()),
3026 _ => return run_rag_search_xpi_fallback(client, &options),
3027 };
3028
3029 let (emb_provider, emb_model, emb_url, emb_key) = fetch_embedding_settings(client)?;
3031 let mut all_chunks: Vec<StructureChunk> = Vec::new();
3032 let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
3033 for (_item_key, _att_key, sidecar_root) in sidecars {
3034 all_chunks.extend(load_sidecar_chunks(sidecar_root));
3035 all_vectors.extend(load_sidecar_vectors(sidecar_root, &emb_provider, &emb_model));
3036 }
3037
3038 if all_chunks.is_empty() {
3039 return run_rag_search_xpi_fallback(client, &options);
3040 }
3041
3042 let mode = fetch_retrieval_mode(client);
3044
3045 let bm25_ranked = if mode != "dense" {
3047 bm25_score_chunks(&all_chunks, &options.query, 1.2, 0.75)
3048 } else {
3049 Vec::new()
3050 };
3051
3052 let dense_ranked = if mode != "lexical" && !all_vectors.is_empty() {
3054 match embed_query_text(&options.query, &emb_provider, &emb_model, &emb_url, &emb_key) {
3055 Ok(query_vec) => {
3056 let vec_map: std::collections::HashMap<&str, &[f64]> = all_vectors
3057 .iter()
3058 .map(|v| (v.chunk_key.as_str(), v.vector.as_slice()))
3059 .collect();
3060 let mut scores: Vec<(usize, f64)> = all_chunks
3061 .iter()
3062 .enumerate()
3063 .filter_map(|(i, chunk)| {
3064 vec_map.get(chunk.chunk_key.as_str()).map(|stored| {
3065 (i, cosine_similarity(&query_vec, stored))
3066 })
3067 })
3068 .filter(|(_, s)| *s > 0.0)
3069 .collect();
3070 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
3071 scores
3072 }
3073 Err(_) => Vec::new(),
3074 }
3075 } else {
3076 Vec::new()
3077 };
3078
3079 let limit = options.top_k as usize;
3081 let ranked = if !bm25_ranked.is_empty() && !dense_ranked.is_empty() {
3082 rrf_merge(&bm25_ranked, &dense_ranked, 60.0, limit)
3083 } else if !bm25_ranked.is_empty() {
3084 bm25_ranked.into_iter().take(limit).collect()
3085 } else {
3086 dense_ranked.into_iter().take(limit).collect()
3087 };
3088
3089 let mut per_item_count: std::collections::HashMap<&str, u64> =
3091 std::collections::HashMap::new();
3092 let mut selected: Vec<(usize, f64)> = Vec::new();
3093 for (idx, score) in &ranked {
3094 let item_key = all_chunks[*idx].item_key.as_str();
3095 let count = per_item_count.entry(item_key).or_insert(0);
3096 if *count < options.top_spans_per_item {
3097 *count += 1;
3098 selected.push((*idx, *score));
3099 }
3100 }
3101
3102 let mut meta_cache: std::collections::HashMap<String, Value> =
3104 std::collections::HashMap::new();
3105 let mut hits: Vec<Value> = Vec::new();
3106 for (idx, score) in &selected {
3107 let chunk = &all_chunks[*idx];
3108 let meta = if let Some(cached) = meta_cache.get(&chunk.item_key) {
3109 cached.clone()
3110 } else {
3111 let fetched = client
3112 .call(
3113 "items.get",
3114 Some(serde_json::json!({"key": chunk.item_key})),
3115 )
3116 .unwrap_or(Value::Null);
3117 meta_cache.insert(chunk.item_key.clone(), fetched.clone());
3118 fetched
3119 };
3120 let title = meta
3121 .get("title")
3122 .and_then(Value::as_str)
3123 .unwrap_or("")
3124 .to_string();
3125 let authors = meta
3126 .get("creators")
3127 .and_then(Value::as_array)
3128 .map(|creators| {
3129 creators
3130 .iter()
3131 .filter_map(|c| {
3132 let last = c.get("lastName").and_then(Value::as_str).unwrap_or("");
3133 let first = c.get("firstName").and_then(Value::as_str).unwrap_or("");
3134 if last.is_empty() && first.is_empty() {
3135 None
3136 } else {
3137 Some(format!("{last}{first}"))
3138 }
3139 })
3140 .collect::<Vec<_>>()
3141 .join(", ")
3142 })
3143 .unwrap_or_default();
3144 let year = meta.get("date").and_then(Value::as_str).unwrap_or("");
3145 let mut hit = serde_json::json!({
3146 "item_key": chunk.item_key,
3147 "chunk_key": chunk.chunk_key,
3148 "title": title,
3149 "authors": authors,
3150 "year": year,
3151 "text": chunk.text,
3152 "page_range": chunk.page_range,
3153 "section_path": chunk.section_path,
3154 "score": score,
3155 });
3156 if options.include_fulltext_spans {
3157 hit.as_object_mut().unwrap().insert(
3158 "attachment_key".to_string(),
3159 Value::String(chunk.attachment_key.clone()),
3160 );
3161 }
3162 hits.push(hit);
3163 }
3164
3165 if options.output == "jsonl" {
3167 let mut out = String::new();
3168 for hit in &hits {
3169 out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
3170 out.push('\n');
3171 }
3172 Ok(out)
3173 } else {
3174 let total = hits.len() as u64;
3175 format_json(
3176 &normalize_list_envelope(
3177 serde_json::json!({"items": hits, "total": total}),
3178 "items",
3179 Some(options.top_k),
3180 0,
3181 ),
3182 JsonStyle::Pretty,
3183 )
3184 }
3185}
3186
3187fn rag_status_value(client: &mut impl RpcCaller, collection: &str) -> Result<Value, String> {
3188 let raw_store_path = rag_store_path(collection);
3189 if raw_store_path.exists() {
3190 return rag_status_from_store(collection, &raw_store_path);
3191 }
3192
3193 let mut store_candidates = Vec::new();
3194 let collection_match = find_collection_in_tree(client, collection)?;
3195 if let Some(collection_node) = collection_match.as_ref() {
3196 if let Some(name) = collection_node.get("name").and_then(Value::as_str) {
3197 store_candidates.push(rag_store_path(name));
3198 }
3199 if let Some(key) = collection_node.get("key").and_then(Value::as_str) {
3200 store_candidates.push(rag_store_path(key));
3201 }
3202 }
3203 for store_path in unique_paths(store_candidates) {
3204 if store_path.exists() {
3205 return rag_status_from_store(collection, &store_path);
3206 }
3207 }
3208
3209 rag_status_from_zotero_sidecars(client, collection, collection_match)
3210}
3211
/// Removes duplicate paths while keeping the first occurrence of each, in
/// the original order.
fn unique_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
    paths
        .into_iter()
        .fold(Vec::<PathBuf>::new(), |mut kept, candidate| {
            if !kept.contains(&candidate) {
                kept.push(candidate);
            }
            kept
        })
}
3221
3222fn rag_status_from_store(collection: &str, store_path: &Path) -> Result<Value, String> {
3223 let raw = fs::read_to_string(store_path)
3224 .map_err(|err| format!("read RAG store {}: {err}", store_path.display()))?;
3225 let store: Value = serde_json::from_str(&raw)
3226 .map_err(|err| format!("parse RAG store {}: {err}", store_path.display()))?;
3227 let chunks = store
3228 .get("chunks")
3229 .and_then(Value::as_array)
3230 .cloned()
3231 .unwrap_or_default();
3232 let mut item_keys = Vec::<Value>::new();
3233 for chunk in &chunks {
3234 let Some(item_key) = chunk.get("item_key") else {
3235 continue;
3236 };
3237 if !item_keys.iter().any(|seen| seen == item_key) {
3238 item_keys.push(item_key.clone());
3239 }
3240 }
3241 Ok(serde_json::json!({
3242 "status": "indexed",
3243 "collection": store.get("collection").and_then(Value::as_str).unwrap_or(collection),
3244 "collection_key": store.get("collection_key").cloned().unwrap_or(Value::Null),
3245 "model": store.get("model").cloned().unwrap_or(Value::String("unknown".to_string())),
3246 "total_chunks": chunks.len(),
3247 "total_items": item_keys.len(),
3248 "store_path": store_path.to_string_lossy(),
3249 }))
3250}
3251
3252fn rag_status_from_zotero_sidecars(
3253 client: &mut impl RpcCaller,
3254 collection: &str,
3255 collection_match: Option<Value>,
3256) -> Result<Value, String> {
3257 let collection_key = collection_match
3258 .as_ref()
3259 .and_then(|node| node.get("key").cloned())
3260 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
3261 let raw = paginate_rpc(
3262 client,
3263 "collections.getItems",
3264 serde_json::json!({"key": collection_key}),
3265 500,
3266 )?;
3267 let items = raw
3268 .get("items")
3269 .and_then(Value::as_array)
3270 .or_else(|| raw.as_array())
3271 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
3272 .clone();
3273
3274 let mut indexed_items = 0usize;
3275 let mut total_chunks = 0usize;
3276 for item in &items {
3277 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
3278 let chunk_count = sidecar_chunk_count_for_item(client, &item_key)?;
3279 if chunk_count > 0 {
3280 indexed_items += 1;
3281 total_chunks += chunk_count;
3282 }
3283 }
3284
3285 if indexed_items == 0 {
3286 return Ok(serde_json::json!({
3287 "status": "not indexed",
3288 "collection": collection,
3289 "total_items": items.len(),
3290 "indexed_items": 0,
3291 }));
3292 }
3293
3294 Ok(serde_json::json!({
3295 "status": "indexed",
3296 "collection": collection,
3297 "total_chunks": total_chunks,
3298 "total_items": indexed_items,
3299 "collection_items": items.len(),
3300 "source": "zotero-sidecar",
3301 }))
3302}
3303
3304fn sidecar_chunk_count_for_item(
3305 client: &mut impl RpcCaller,
3306 item_key: &Value,
3307) -> Result<usize, String> {
3308 let attachments = client.call(
3309 "attachments.list",
3310 Some(serde_json::json!({"parentKey": item_key.clone()})),
3311 )?;
3312 let Some(attachments) = attachments.as_array() else {
3313 return Ok(0);
3314 };
3315
3316 let mut count = 0usize;
3317 for attachment in attachments {
3318 let Some(path) = attachment.get("path").and_then(Value::as_str) else {
3319 continue;
3320 };
3321 let local = local_path_from_zotero_path(path);
3322 let Some(dir) = Path::new(&local).parent() else {
3323 continue;
3324 };
3325 let Ok(bytes) = read_machine_artifact_sidecar(dir, MachineArtifactKind::Chunks) else {
3326 continue;
3327 };
3328 let text = String::from_utf8_lossy(&bytes);
3329 count += text.lines().filter(|line| !line.trim().is_empty()).count();
3330 }
3331 Ok(count)
3332}
3333
3334fn rag_store_path(collection: &str) -> PathBuf {
3335 rag_store_root().join(format!("{collection}.json"))
3336}
3337
3338fn rag_store_root() -> PathBuf {
3339 let xdg_data_home = env::var_os("XDG_DATA_HOME")
3340 .filter(|path| !path.is_empty())
3341 .map(PathBuf::from);
3342 let appdata = env::var_os("APPDATA")
3343 .filter(|path| !path.is_empty())
3344 .map(PathBuf::from);
3345 let userprofile = env::var_os("USERPROFILE")
3346 .filter(|path| !path.is_empty())
3347 .map(PathBuf::from);
3348 let home = env::var_os("HOME")
3349 .filter(|path| !path.is_empty())
3350 .map(PathBuf::from);
3351
3352 rag_store_root_for_platform(
3353 ArtifactStorePlatform::current(),
3354 xdg_data_home.as_deref(),
3355 appdata.as_deref(),
3356 userprofile.as_deref(),
3357 home.as_deref(),
3358 )
3359}
3360
3361fn rag_store_root_for_platform(
3362 platform: ArtifactStorePlatform,
3363 xdg_data_home: Option<&Path>,
3364 appdata: Option<&Path>,
3365 userprofile: Option<&Path>,
3366 home: Option<&Path>,
3367) -> PathBuf {
3368 match platform {
3369 ArtifactStorePlatform::Windows => {
3370 if let Some(path) = appdata {
3371 return path.join("Zotron").join("rag");
3372 }
3373 if let Some(path) = userprofile {
3374 return path
3375 .join("AppData")
3376 .join("Roaming")
3377 .join("Zotron")
3378 .join("rag");
3379 }
3380 if let Some(path) = home {
3381 return path
3382 .join("AppData")
3383 .join("Roaming")
3384 .join("Zotron")
3385 .join("rag");
3386 }
3387 PathBuf::from(".zotron").join("rag")
3388 }
3389 ArtifactStorePlatform::Macos => {
3390 if let Some(path) = home {
3391 return path
3392 .join("Library")
3393 .join("Application Support")
3394 .join("Zotron")
3395 .join("rag");
3396 }
3397 if let Some(path) = xdg_data_home {
3398 return path.join("zotron").join("rag");
3399 }
3400 PathBuf::from(".zotron").join("rag")
3401 }
3402 ArtifactStorePlatform::Linux | ArtifactStorePlatform::Other => xdg_data_home
3403 .map(|path| path.join("zotron").join("rag"))
3404 .or_else(|| {
3405 home.map(|path| path.join(".local").join("share").join("zotron").join("rag"))
3406 })
3407 .unwrap_or_else(|| PathBuf::from(".zotron").join("rag")),
3408 }
3409}
3410
3411fn run_push_command(
3412 json_file: String,
3413 pdf: Option<String>,
3414 collection: Option<String>,
3415 on_duplicate: String,
3416 dry_run: bool,
3417 client: &mut impl RpcCaller,
3418) -> Result<String, String> {
3419 if !matches!(on_duplicate.as_str(), "skip" | "update" | "create") {
3420 return Err(format!(
3421 "INVALID_ARGS: --on-duplicate must be skip|update|create, got {on_duplicate:?}"
3422 ));
3423 }
3424
3425 let payload = if json_file == "-" {
3426 let mut input = String::new();
3427 io::stdin()
3428 .read_to_string(&mut input)
3429 .map_err(|err| format!("read stdin: {err}"))?;
3430 input
3431 } else {
3432 fs::read_to_string(&json_file).map_err(|err| format!("read {json_file}: {err}"))?
3433 };
3434 let item_json = serde_json::from_str::<Value>(&payload)
3435 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
3436
3437 match item_json.get("itemType").and_then(Value::as_str) {
3439 Some(s) if !s.is_empty() => {}
3440 _ => return Err("INVALID_ARGS: input JSON must include a non-empty \"itemType\" field".to_string()),
3441 }
3442
3443 if dry_run {
3444 let collection_key = collection
3445 .as_deref()
3446 .map(|name| resolve_collection(client, name))
3447 .transpose()?;
3448 return format_json(
3449 &serde_json::json!({
3450 "ok": true,
3451 "dryRun": true,
3452 "wouldPush": {
3453 "title": item_json.get("title").cloned().unwrap_or(Value::Null),
3454 "itemType": item_json.get("itemType").cloned().unwrap_or(Value::Null),
3455 "collectionKey": collection_key,
3456 "pdfPath": pdf,
3457 "onDuplicate": on_duplicate,
3458 }
3459 }),
3460 JsonStyle::PythonCompact,
3461 );
3462 }
3463
3464 let result = push_item(
3465 client,
3466 &item_json,
3467 pdf.as_deref(),
3468 collection.as_deref(),
3469 &on_duplicate,
3470 )?;
3471 format_json(&result, JsonStyle::PythonCompact)
3472}
3473
3474fn push_item(
3475 client: &mut impl RpcCaller,
3476 item_json: &Value,
3477 pdf_path: Option<&str>,
3478 collection: Option<&str>,
3479 on_duplicate: &str,
3480) -> Result<Value, String> {
3481 let pdf_size = if let Some(path) = pdf_path {
3482 validate_pdf_magic(path)?
3483 } else {
3484 0
3485 };
3486
3487 let collection_key = match collection {
3488 Some(name) => resolve_collection(client, name)?,
3489 None => resolve_current_collection(client)?,
3490 };
3491
3492 let dup_id = find_duplicate(client, item_json)?;
3493 if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "skip") {
3494 if !is_library_root(&collection_key) {
3495 client.call(
3496 "collections.addItems",
3497 Some(serde_json::json!({"key": collection_key, "keys": [dup_id]})),
3498 )?;
3499 }
3500 let mut pdf_attached = false;
3501 if let Some(path) = pdf_path {
3502 if !item_has_pdf_attachment(client, dup_id)? {
3503 attach_pdf(client, dup_id, path)?;
3504 pdf_attached = true;
3505 }
3506 }
3507 return Ok(push_result(
3508 "skipped_duplicate",
3509 Some(dup_id.to_string()),
3510 pdf_attached,
3511 if pdf_attached { pdf_size } else { 0 },
3512 Value::Null,
3513 ));
3514 }
3515
3516 let xpi_payload = to_xpi_payload(item_json, Some(&collection_key));
3517 let (item_key, status) =
3518 if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "update") {
3519 let mut params = serde_json::Map::new();
3520 params.insert("key".to_string(), Value::String(dup_id.to_string()));
3521 params.insert(
3522 "fields".to_string(),
3523 xpi_payload
3524 .get("fields")
3525 .cloned()
3526 .unwrap_or_else(|| serde_json::json!({})),
3527 );
3528 if let Some(creators) = xpi_payload.get("creators") {
3529 params.insert("creators".to_string(), creators.clone());
3530 }
3531 if let Some(tags) = xpi_payload.get("tags") {
3532 params.insert("tags".to_string(), tags.clone());
3533 }
3534 client.call("items.update", Some(Value::Object(params)))?;
3535 (dup_id.to_string(), "updated")
3536 } else {
3537 let created = client.call("items.create", Some(xpi_payload))?;
3538 let key = created
3539 .get("key")
3540 .and_then(Value::as_str)
3541 .ok_or_else(|| format!("items.create returned unexpected shape: {created:?}"))?;
3542 (key.to_string(), "created")
3543 };
3544
3545 let mut pdf_attached = false;
3546 if let Some(path) = pdf_path {
3547 if status != "updated" || !item_has_pdf_attachment(client, &item_key)? {
3548 attach_pdf(client, &item_key, path)?;
3549 pdf_attached = true;
3550 }
3551 }
3552
3553 if status == "updated" && !is_library_root(&collection_key) {
3554 client.call(
3555 "collections.addItems",
3556 Some(serde_json::json!({"key": collection_key, "keys": [item_key]})),
3557 )?;
3558 }
3559
3560 Ok(push_result(
3561 status,
3562 Some(item_key),
3563 pdf_attached,
3564 if pdf_attached { pdf_size } else { 0 },
3565 Value::Null,
3566 ))
3567}
3568
/// Checks that the file at `path` starts with the `%PDF-` magic bytes and returns
/// its size in bytes.
///
/// Only the first five bytes are read; the size comes from the file's metadata,
/// so large PDFs are not loaded into memory just to be validated.
///
/// # Errors
/// Returns an `INVALID_PDF: cannot read ...` message when the file cannot be
/// opened, inspected or read, and an `INVALID_PDF: ... does not start with %PDF-
/// magic bytes` message when the header does not match (including files shorter
/// than the magic).
fn validate_pdf_magic(path: &str) -> Result<u64, String> {
    const PDF_MAGIC: &[u8] = b"%PDF-";
    let unreadable = |err: io::Error| format!("INVALID_PDF: cannot read {path}: {err}");

    let file = fs::File::open(path).map_err(unreadable)?;
    let size = file.metadata().map_err(unreadable)?.len();

    // `take` caps the read at the magic length; a shorter file just yields fewer bytes.
    let mut header = Vec::with_capacity(PDF_MAGIC.len());
    file.take(PDF_MAGIC.len() as u64)
        .read_to_end(&mut header)
        .map_err(unreadable)?;

    if header.as_slice() != PDF_MAGIC {
        return Err(format!(
            "INVALID_PDF: {path} does not start with %PDF- magic bytes"
        ));
    }
    Ok(size)
}
3579
3580fn resolve_current_collection(client: &mut impl RpcCaller) -> Result<Value, String> {
3581 let selected = client.call("system.currentCollection", None)?;
3582 Ok(selected
3583 .get("key")
3584 .cloned()
3585 .unwrap_or_else(|| Value::Number(0.into())))
3586}
3587
3588fn find_duplicate(
3589 client: &mut impl RpcCaller,
3590 item_json: &Value,
3591) -> Result<Option<String>, String> {
3592 if let Some(doi) = item_json
3593 .get("DOI")
3594 .and_then(Value::as_str)
3595 .filter(|doi| !doi.is_empty())
3596 {
3597 let hits = client.call("search.byIdentifier", Some(serde_json::json!({"doi": doi})))?;
3598 if let Some(key) = first_hit_key(&hits) {
3599 return Ok(Some(key));
3600 }
3601 }
3602
3603 if let Some(title) = item_json
3604 .get("title")
3605 .and_then(Value::as_str)
3606 .filter(|title| title.len() >= 10)
3607 {
3608 let hits = client.call(
3609 "search.quick",
3610 Some(serde_json::json!({"query": title, "limit": 20})),
3611 )?;
3612 if let Some(items) = response_items(&hits) {
3613 for item in items {
3614 if item.get("title").and_then(Value::as_str) == Some(title) {
3615 if let Some(key) = item.get("key").and_then(Value::as_str) {
3616 return Ok(Some(key.to_string()));
3617 }
3618 }
3619 }
3620 }
3621 }
3622
3623 Ok(None)
3624}
3625
3626fn first_hit_key(response: &Value) -> Option<String> {
3627 response_items(response)?
3628 .first()?
3629 .get("key")?
3630 .as_str()
3631 .map(ToString::to_string)
3632}
3633
3634fn response_items(response: &Value) -> Option<&Vec<Value>> {
3635 response
3636 .get("items")
3637 .and_then(Value::as_array)
3638 .or_else(|| response.as_array())
3639}
3640
3641fn to_xpi_payload(item_json: &Value, collection_key: Option<&Value>) -> Value {
3642 const NON_FIELD_KEYS: &[&str] = &[
3643 "itemType",
3644 "creators",
3645 "tags",
3646 "collections",
3647 "attachments",
3648 "relations",
3649 "notes",
3650 "id",
3651 "key",
3652 "version",
3653 ];
3654
3655 let mut fields = serde_json::Map::new();
3656 if let Some(item) = item_json.as_object() {
3657 for (key, value) in item {
3658 if !NON_FIELD_KEYS.contains(&key.as_str()) && !value.is_null() && value != "" {
3659 fields.insert(key.clone(), value.clone());
3660 }
3661 }
3662 }
3663
3664 let mut payload = serde_json::Map::new();
3665 payload.insert(
3666 "itemType".to_string(),
3667 item_json
3668 .get("itemType")
3669 .cloned()
3670 .unwrap_or_else(|| Value::String("journalArticle".to_string())),
3671 );
3672 payload.insert("fields".to_string(), Value::Object(fields));
3673
3674 if let Some(creators) = item_json.get("creators").and_then(Value::as_array) {
3675 if !creators.is_empty() {
3676 payload.insert(
3677 "creators".to_string(),
3678 Value::Array(
3679 creators
3680 .iter()
3681 .map(|creator| {
3682 let mut c = serde_json::json!({
3683 "firstName": creator.get("firstName").and_then(Value::as_str).unwrap_or(""),
3684 "lastName": creator.get("lastName").and_then(Value::as_str).unwrap_or(""),
3685 "creatorType": creator.get("creatorType").and_then(Value::as_str).unwrap_or("author"),
3686 });
3687 if let Some(fm) = creator.get("fieldMode").and_then(Value::as_u64) {
3688 c["fieldMode"] = Value::from(fm);
3689 }
3690 c
3691 })
3692 .collect(),
3693 ),
3694 );
3695 }
3696 }
3697
3698 if let Some(tags) = item_json.get("tags").and_then(Value::as_array) {
3699 if !tags.is_empty() {
3700 payload.insert(
3701 "tags".to_string(),
3702 Value::Array(
3703 tags.iter()
3704 .map(|tag| tag.get("tag").cloned().unwrap_or_else(|| tag.clone()))
3705 .collect(),
3706 ),
3707 );
3708 }
3709 }
3710
3711 if let Some(collection_key) = collection_key.filter(|key| !is_library_root(key)) {
3712 payload.insert(
3713 "collections".to_string(),
3714 Value::Array(vec![collection_key.clone()]),
3715 );
3716 }
3717
3718 Value::Object(payload)
3719}
3720
3721fn item_has_pdf_attachment(client: &mut impl RpcCaller, item_key: &str) -> Result<bool, String> {
3722 let attachments = client.call(
3723 "attachments.list",
3724 Some(serde_json::json!({"parentKey": item_key})),
3725 )?;
3726 Ok(has_pdf_attachment(&attachments))
3727}
3728
3729fn attach_pdf(client: &mut impl RpcCaller, item_key: &str, path: &str) -> Result<(), String> {
3730 client.call(
3731 "attachments.add",
3732 Some(serde_json::json!({
3733 "parentKey": item_key,
3734 "path": zotero_path(path),
3735 "title": "Full Text PDF",
3736 })),
3737 )?;
3738 Ok(())
3739}
3740
3741fn zotero_path(path: &str) -> String {
3742 let path = Path::new(path)
3743 .canonicalize()
3744 .unwrap_or_else(|_| Path::new(path).to_path_buf())
3745 .to_string_lossy()
3746 .into_owned();
3747 if is_wsl() {
3748 return ProcessCommand::new("wslpath")
3749 .arg("-w")
3750 .arg(&path)
3751 .output()
3752 .ok()
3753 .filter(|output| output.status.success())
3754 .and_then(|output| String::from_utf8(output.stdout).ok())
3755 .map(|converted| converted.trim().to_string())
3756 .filter(|converted| !converted.is_empty())
3757 .unwrap_or(path);
3758 }
3759 path
3760}
3761
/// Detects whether the process runs under WSL.
///
/// True when `WSL_DISTRO_NAME` is set, or when `/proc/sys/kernel/osrelease` can
/// be read and contains "microsoft" (case-insensitively).
fn is_wsl() -> bool {
    env::var_os("WSL_DISTRO_NAME").is_some()
        || fs::read_to_string("/proc/sys/kernel/osrelease")
            .is_ok_and(|release| release.to_ascii_lowercase().contains("microsoft"))
}
3770
3771fn is_library_root(value: &Value) -> bool {
3772 value.as_i64() == Some(0) || value.as_u64() == Some(0)
3773}
3774
3775fn push_result(
3776 status: &str,
3777 zotero_item_key: Option<String>,
3778 pdf_attached: bool,
3779 pdf_size_bytes: u64,
3780 error: Value,
3781) -> Value {
3782 serde_json::json!({
3783 "status": status,
3784 "zotero_item_key": zotero_item_key,
3785 "pdf_attached": pdf_attached,
3786 "pdf_size_bytes": pdf_size_bytes,
3787 "error": error,
3788 })
3789}
3790
3791fn run_search(
3792 args: SearchArgs,
3793 client: &mut impl RpcCaller,
3794) -> Result<(Value, JsonStyle), String> {
3795 let SearchArgs {
3796 query, fulltext, author, after, before, journal, tag,
3797 doi, isbn, issn, collection, limit, offset, ..
3798 } = args;
3799
3800 let has_identifier = doi.is_some() || isbn.is_some() || issn.is_some();
3801 if has_identifier {
3802 let mut params = serde_json::Map::new();
3803 if let Some(doi) = doi { params.insert("doi".into(), Value::String(doi)); }
3804 if let Some(isbn) = isbn { params.insert("isbn".into(), Value::String(isbn)); }
3805 if let Some(issn) = issn { params.insert("issn".into(), Value::String(issn)); }
3806 let value = client.call("search.byIdentifier", Some(Value::Object(params)))?;
3807 return Ok((normalize_list_envelope(value, "items", None, 0), JsonStyle::Pretty));
3808 }
3809
3810 if fulltext {
3811 let query = query.ok_or("INVALID_ARGS: --fulltext requires a search query")?;
3812 let mut params = serde_json::json!({"query": query, "limit": limit});
3813 if let (Some(col), Some(map)) = (collection, params.as_object_mut()) {
3814 map.insert("collection".into(), resolve_collection(client, &col)?);
3815 }
3816 let value = client.call("search.fulltext", Some(params))?;
3817 return Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty));
3818 }
3819
3820 let has_filters = author.is_some() || after.is_some() || before.is_some()
3821 || journal.is_some() || tag.is_some();
3822 if has_filters {
3823 let mut conditions: Vec<Value> = Vec::new();
3824 if let Some(query) = &query {
3825 conditions.push(serde_json::json!({
3826 "field": "quicksearch-titleCreatorYear",
3827 "operator": "contains",
3828 "value": query,
3829 }));
3830 }
3831 if let Some(author) = author {
3832 conditions.push(serde_json::json!({
3833 "field": "creator", "operator": "contains", "value": author,
3834 }));
3835 }
3836 if let Some(after) = after {
3837 conditions.push(serde_json::json!({
3838 "field": "date", "operator": "isAfter", "value": after,
3839 }));
3840 }
3841 if let Some(before) = before {
3842 conditions.push(serde_json::json!({
3843 "field": "date", "operator": "isBefore", "value": before,
3844 }));
3845 }
3846 if let Some(journal) = journal {
3847 conditions.push(serde_json::json!({
3848 "field": "publicationTitle", "operator": "contains", "value": journal,
3849 }));
3850 }
3851 if let Some(tag) = tag {
3852 conditions.push(serde_json::json!({
3853 "field": "tag", "operator": "is", "value": tag,
3854 }));
3855 }
3856 let value = client.call(
3857 "search.advanced",
3858 Some(serde_json::json!({
3859 "conditions": conditions,
3860 "operator": "and",
3861 "limit": limit,
3862 "offset": offset,
3863 })),
3864 )?;
3865 return Ok((normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty));
3866 }
3867
3868 let query = query.ok_or(
3869 "INVALID_ARGS: provide a search query, or use --doi/--isbn/--issn for identifier lookup"
3870 )?;
3871 let value = if let Some(col) = collection {
3872 let key = resolve_collection(client, &col)?;
3873 let response = client.call(
3874 "collections.getItems",
3875 Some(serde_json::json!({"key": key})),
3876 )?;
3877 collection_quick_search_response(&response, &query, limit)
3878 } else {
3879 filter_search_artifacts(client.call(
3880 "search.quick",
3881 Some(serde_json::json!({"query": query, "limit": limit})),
3882 )?)
3883 };
3884 Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty))
3885}
3886
3887fn run_search_management_command(
3888 command: SearchManagementCommand,
3889 client: &mut impl RpcCaller,
3890) -> Result<(Value, JsonStyle), String> {
3891 match command {
3892 SearchManagementCommand::SavedSearches { .. } => Ok((
3893 normalize_list_envelope(client.call("search.savedSearches", None)?, "items", None, 0),
3894 JsonStyle::Pretty,
3895 )),
3896 SearchManagementCommand::CreateSaved {
3897 name, condition, dry_run, ..
3898 } => {
3899 let conditions = condition
3900 .iter()
3901 .map(|raw| parse_search_condition(raw))
3902 .collect::<Result<Vec<_>, _>>()?;
3903 let params = serde_json::json!({"name": name, "conditions": conditions});
3904 if dry_run {
3905 Ok((dry_run_value("search.createSavedSearch", params), JsonStyle::PythonCompact))
3906 } else {
3907 Ok((client.call("search.createSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3908 }
3909 }
3910 SearchManagementCommand::DeleteSaved {
3911 search_key, dry_run, ..
3912 } => {
3913 let params = serde_json::json!({"key": search_key});
3914 if dry_run {
3915 Ok((dry_run_value("search.deleteSavedSearch", params), JsonStyle::PythonCompact))
3916 } else {
3917 Ok((client.call("search.deleteSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3918 }
3919 }
3920 }
3921}
3922
3923fn filter_search_artifacts(mut value: Value) -> Value {
3924 let Some(items) = value.get_mut("items").and_then(Value::as_array_mut) else {
3925 return value;
3926 };
3927 items.retain(|item| match item.get("title").and_then(Value::as_str) {
3928 Some(title) => !is_zotron_evidence_artifact(title),
3929 None => true,
3930 });
3931 let total_items = items.len() as u64;
3932 if let Some(total) = value.get_mut("total") {
3933 *total = Value::from(total_items);
3934 }
3935 value
3936}
3937
3938fn collection_quick_search_response(response: &Value, query: &str, limit: u64) -> Value {
3939 let mut matched = collection_items(response)
3940 .into_iter()
3941 .filter(|item| !item_is_evidence_artifact(item))
3942 .filter(|item| quick_item_matches(item, query))
3943 .collect::<Vec<_>>();
3944 let total = matched.len() as u64;
3945 let limit = usize::try_from(limit).unwrap_or(usize::MAX);
3946 if matched.len() > limit {
3947 matched.truncate(limit);
3948 }
3949 serde_json::json!({"items": matched, "total": total})
3950}
3951
3952fn item_is_evidence_artifact(item: &Value) -> bool {
3953 item.get("title")
3954 .and_then(Value::as_str)
3955 .is_some_and(is_zotron_evidence_artifact)
3956}
3957
3958fn quick_item_matches(item: &Value, query: &str) -> bool {
3959 let terms = query
3960 .split_whitespace()
3961 .map(|term| term.to_lowercase())
3962 .filter(|term| !term.is_empty())
3963 .collect::<Vec<_>>();
3964 if terms.is_empty() {
3965 return true;
3966 }
3967 let mut haystack = String::new();
3968 append_search_text(item, &mut haystack);
3969 let haystack = haystack.to_lowercase();
3970 terms.iter().all(|term| haystack.contains(term))
3971}
3972
3973fn append_search_text(value: &Value, out: &mut String) {
3974 match value {
3975 Value::String(text) => {
3976 out.push(' ');
3977 out.push_str(text);
3978 }
3979 Value::Number(number) => {
3980 out.push(' ');
3981 out.push_str(&number.to_string());
3982 }
3983 Value::Bool(value) => {
3984 out.push(' ');
3985 out.push_str(if *value { "true" } else { "false" });
3986 }
3987 Value::Array(items) => {
3988 for item in items {
3989 append_search_text(item, out);
3990 }
3991 }
3992 Value::Object(map) => {
3993 for item in map.values() {
3994 append_search_text(item, out);
3995 }
3996 }
3997 Value::Null => {}
3998 }
3999}
4000
4001fn parse_search_condition(raw: &str) -> Result<Value, String> {
4002 let mut parts = raw.split_whitespace();
4003 let field = parts.next();
4004 let operator = parts.next();
4005 let value = parts.collect::<Vec<_>>().join(" ");
4006 match (field, operator, value.is_empty()) {
4007 (Some(field), Some(operator), false) => Ok(serde_json::json!({
4008 "field": field,
4009 "operator": operator,
4010 "value": value,
4011 })),
4012 _ => Err(format!(
4013 "INVALID_ARGS: --condition must be 'field operator value', got: {raw:?}"
4014 )),
4015 }
4016}
4017
4018fn normalize_list_envelope(value: Value, list_key: &str, limit: Option<u64>, offset: u64) -> Value {
4019 if let Value::Array(arr) = value {
4020 let total = arr.len() as u64;
4021 let mut obj = serde_json::Map::new();
4022 obj.insert(list_key.to_string(), Value::Array(arr));
4023 obj.insert("total".to_string(), Value::from(total));
4024 if let Some(limit) = limit {
4025 obj.insert("limit".to_string(), Value::from(limit));
4026 }
4027 obj.insert("offset".to_string(), Value::from(offset));
4028 obj.insert("hasMore".to_string(), Value::Bool(false));
4029 return Value::Object(obj);
4030 }
4031
4032 let mut obj = match value {
4033 Value::Object(obj) if obj.contains_key(list_key) => obj,
4034 other => return other,
4035 };
4036
4037 let items_len = obj
4038 .get(list_key)
4039 .and_then(Value::as_array)
4040 .map_or(0, |a| a.len()) as u64;
4041 let total = obj
4042 .get("total")
4043 .and_then(Value::as_u64)
4044 .unwrap_or(items_len);
4045
4046 obj.insert("total".to_string(), Value::from(total));
4047 if let Some(limit) = limit {
4048 obj.insert("limit".to_string(), Value::from(limit));
4049 }
4050 obj.insert("offset".to_string(), Value::from(offset));
4051 obj.insert(
4052 "hasMore".to_string(),
4053 Value::Bool(offset + items_len < total),
4054 );
4055
4056 Value::Object(obj)
4057}
4058
/// Upper bound on the number of rows `paginate_rpc` accumulates; once reached,
/// the result is truncated to this many rows and returned.
const RPC_PAGINATION_SAFETY_CAP: usize = 10_000;
/// Object keys that `extract_page` probes, in this order, to find the array
/// holding a page of results.
const RPC_PAGE_LIST_KEYS: [&str; 4] = ["items", "tags", "results", "data"];
4061
4062fn paginate_rpc(
4063 client: &mut impl RpcCaller,
4064 method: &str,
4065 params: Value,
4066 page_size: usize,
4067) -> Result<Value, String> {
4068 let base = params
4069 .as_object()
4070 .ok_or_else(|| "params must be a JSON object".to_string())?;
4071 let mut out = Vec::new();
4072 let mut prev_page: Option<Vec<Value>> = None;
4073 let mut offset = 0usize;
4074
4075 loop {
4076 let mut page_params = base.clone();
4077 page_params.insert("offset".to_string(), Value::Number(offset.into()));
4078 page_params.insert("limit".to_string(), Value::Number(page_size.into()));
4079 let response = client.call(method, Some(Value::Object(page_params)))?;
4080
4081 let page = match extract_page(&response) {
4082 Some(page) => page,
4083 None if out.is_empty() => return Ok(response),
4084 None if response.is_object() => {
4085 return Err(format!(
4086 "paginate: {method:?} returned a non-paginated dict after {} accumulated rows; aborting",
4087 out.len()
4088 ));
4089 }
4090 None => {
4091 return Err(format!(
4092 "paginate: {method:?} returned non-list/non-dict shape after {} accumulated rows; aborting",
4093 out.len()
4094 ));
4095 }
4096 };
4097
4098 if prev_page.as_ref() == Some(&page) {
4099 return Err(format!(
4100 "paginate: {method:?} returned identical pages — method likely ignores offset; aborting after {} rows",
4101 out.len()
4102 ));
4103 }
4104
4105 let page_len = page.len();
4106 out.extend(page.clone());
4107 if page_len < page_size {
4108 return Ok(Value::Array(out));
4109 }
4110 if out.len() >= RPC_PAGINATION_SAFETY_CAP {
4111 out.truncate(RPC_PAGINATION_SAFETY_CAP);
4112 return Ok(Value::Array(out));
4113 }
4114 prev_page = Some(page);
4115 offset += page_size;
4116 }
4117}
4118
4119fn extract_page(response: &Value) -> Option<Vec<Value>> {
4120 if let Some(page) = response.as_array() {
4121 return Some(page.clone());
4122 }
4123 let object = response.as_object()?;
4124 for key in RPC_PAGE_LIST_KEYS {
4125 if let Some(page) = object.get(key).and_then(Value::as_array) {
4126 return Some(page.clone());
4127 }
4128 }
4129 None
4130}
4131
4132fn run_find_pdfs_command(
4133 client: &mut impl RpcCaller,
4134 collection: String,
4135 limit: usize,
4136) -> Result<(Value, JsonStyle), String> {
4137 let collection_key = resolve_collection(client, &collection)?;
4138 let response = client.call(
4139 "collections.getItems",
4140 Some(serde_json::json!({"key": collection_key})),
4141 )?;
4142 let items = collection_items(&response);
4143
4144 let mut missing = Vec::new();
4145 for item in &items {
4146 let Some(item_key) = item.get("key").and_then(Value::as_str) else {
4147 continue;
4148 };
4149 let attachments = client.call(
4150 "attachments.list",
4151 Some(serde_json::json!({"parentKey": item_key})),
4152 )?;
4153 if !has_pdf_attachment(&attachments) {
4154 missing.push(item.clone());
4155 }
4156 if limit > 0 && missing.len() >= limit {
4157 break;
4158 }
4159 }
4160
4161 let mut results = Vec::new();
4162 for item in &missing {
4163 let item_key = item
4164 .get("key")
4165 .and_then(Value::as_str)
4166 .ok_or_else(|| "missing item lacks key".to_string())?;
4167 let response = client.call(
4168 "attachments.findPDF",
4169 Some(serde_json::json!({"parentKey": item_key})),
4170 )?;
4171 let attachment = response.get("attachment").filter(|value| !value.is_null());
4172 results.push(serde_json::json!({
4173 "item_key": item_key,
4174 "title": item.get("title").cloned().unwrap_or(Value::Null),
4175 "found": attachment.is_some(),
4176 "attachment_key": attachment
4177 .and_then(|attachment| attachment.get("key"))
4178 .cloned()
4179 .unwrap_or(Value::Null),
4180 }));
4181 }
4182
4183 Ok((
4184 serde_json::json!({
4185 "scanned": items.len(),
4186 "attempted": missing.len(),
4187 "results": results,
4188 }),
4189 JsonStyle::Pretty,
4190 ))
4191}
4192
4193fn collection_items(response: &Value) -> Vec<Value> {
4194 if let Some(items) = response.get("items").and_then(Value::as_array) {
4195 return items.clone();
4196 }
4197 response.as_array().cloned().unwrap_or_default()
4198}
4199
4200fn has_pdf_attachment(attachments: &Value) -> bool {
4201 attachments
4202 .as_array()
4203 .is_some_and(|attachments| attachments.iter().any(is_pdf_attachment))
4204}
4205
4206fn is_pdf_attachment(attachment: &Value) -> bool {
4207 let content_type = attachment
4208 .get("contentType")
4209 .and_then(Value::as_str)
4210 .unwrap_or_default()
4211 .to_lowercase();
4212 let path = attachment
4213 .get("path")
4214 .and_then(Value::as_str)
4215 .unwrap_or_default()
4216 .to_lowercase();
4217 matches!(
4218 content_type.as_str(),
4219 "application/pdf" | "application/x-pdf"
4220 ) || path.ends_with(".pdf")
4221}
4222
4223fn call_json(
4224 client: &mut impl RpcCaller,
4225 method: &str,
4226 params: Option<Value>,
4227) -> Result<Value, String> {
4228 client.call(method, params)
4229}
4230
4231fn run_system_command(
4232 command: SystemCommand,
4233 client: &mut impl RpcCaller,
4234) -> Result<(Value, JsonStyle), String> {
4235 let value = match command {
4236 SystemCommand::Version { .. } => client.call("system.version", None)?,
4237 SystemCommand::Libraries { .. } => client.call("system.libraries", None)?,
4238 SystemCommand::LibraryStats { library, .. } => {
4239 let params = library.map(|id| serde_json::json!({"id": id}));
4240 client.call("system.libraryStats", params)?
4241 }
4242 SystemCommand::Schema { item_type, .. } => {
4243 if let Some(item_type) = item_type {
4244 let fields = client.call("system.itemFields", Some(serde_json::json!({"itemType": item_type})))?;
4245 let creators = client.call("system.creatorTypes", Some(serde_json::json!({"itemType": item_type})))?;
4246 let field_names: Vec<Value> = fields.as_array().unwrap_or(&vec![])
4247 .iter()
4248 .filter_map(|f| f.get("field").cloned())
4249 .collect();
4250 let creator_names: Vec<Value> = creators.as_array().unwrap_or(&vec![])
4251 .iter()
4252 .filter_map(|c| c.get("creatorType").cloned())
4253 .collect();
4254 serde_json::json!({
4255 "itemType": item_type,
4256 "fields": field_names,
4257 "creatorTypes": creator_names,
4258 })
4259 } else {
4260 let types = client.call("system.itemTypes", None)?;
4261 let type_names: Vec<Value> = types.as_array().unwrap_or(&vec![])
4262 .iter()
4263 .filter_map(|t| t.get("itemType").cloned())
4264 .collect();
4265 Value::Array(type_names)
4266 }
4267 }
4268 SystemCommand::CurrentCollection { .. } => client.call("system.currentCollection", None)?,
4269 SystemCommand::Methods { method, .. } => {
4270 if let Some(method) = method {
4271 client.call("system.describe", Some(serde_json::json!({"method": method})))?
4272 } else {
4273 client.call("system.listMethods", None)?
4274 }
4275 }
4276 };
4277 Ok((value, JsonStyle::Pretty))
4278}
4279
4280fn run_items_command(
4281 command: ItemsCommand,
4282 client: &mut impl RpcCaller,
4283) -> Result<(Value, JsonStyle), String> {
4284 let (value, style) = match command {
4285 ItemsCommand::Add {
4286 doi,
4287 isbn,
4288 from_url,
4289 file,
4290 item_type,
4291 fields,
4292 collection,
4293 dry_run,
4294 ..
4295 } => {
4296 if let Some(doi) = doi {
4297 run_add_identifier_command(client, "items.addByDOI", "doi", doi, collection, dry_run)?
4298 } else if let Some(isbn) = isbn {
4299 run_add_identifier_command(client, "items.addByISBN", "isbn", isbn, collection, dry_run)?
4300 } else if let Some(from_url) = from_url {
4301 run_add_identifier_command(client, "items.addByURL", "url", from_url, collection, dry_run)?
4302 } else if let Some(file) = file {
4303 let mut params = serde_json::json!({"path": zotero_path(&file)});
4304 maybe_insert_collection(client, &mut params, collection)?;
4305 run_mutation_command(client, "items.addFromFile", params, dry_run)?
4306 } else if let Some(item_type) = item_type {
4307 let parsed_fields = parse_field_options(&fields)?;
4308 let mut params = serde_json::json!({"itemType": item_type});
4309 if !parsed_fields.is_empty() {
4310 if let Some(map) = params.as_object_mut() {
4311 map.insert("fields".to_string(), Value::Object(parsed_fields));
4312 }
4313 }
4314 run_mutation_command(client, "items.create", params, dry_run)?
4315 } else {
4316 return Err("INVALID_ARGS: provide one of --doi, --isbn, --from-url, --file, or --type".into());
4317 }
4318 }
4319 ItemsCommand::Update {
4320 key,
4321 fields,
4322 dry_run,
4323 ..
4324 } => {
4325 let parsed_fields = parse_field_options(&fields)?;
4326 let mut params = serde_json::json!({"key": key});
4327 if !parsed_fields.is_empty() {
4328 if let Some(map) = params.as_object_mut() {
4329 map.insert("fields".to_string(), Value::Object(parsed_fields));
4330 }
4331 }
4332 run_mutation_command(client, "items.update", params, dry_run)?
4333 }
4334 ItemsCommand::Delete { key, dry_run, .. } => run_mutation_command(
4335 client,
4336 "items.delete",
4337 serde_json::json!({"key": key}),
4338 dry_run,
4339 )?,
4340 ItemsCommand::Trash {
4341 items, dry_run, ..
4342 } => {
4343 if items.len() == 1 {
4344 run_mutation_command(
4345 client,
4346 "items.trash",
4347 serde_json::json!({"key": items[0]}),
4348 dry_run,
4349 )?
4350 } else {
4351 run_mutation_command(
4352 client,
4353 "items.batchTrash",
4354 serde_json::json!({"keys": items}),
4355 dry_run,
4356 )?
4357 }
4358 }
4359 ItemsCommand::Restore { item, dry_run, .. } => run_mutation_command(
4360 client,
4361 "items.restore",
4362 serde_json::json!({"key": item}),
4363 dry_run,
4364 )?,
4365 ItemsCommand::MergeDuplicates { keys, dry_run, .. } => {
4366 if keys.len() < 2 {
4367 return Err("INVALID_ARGS: need at least 2 keys to merge".to_string());
4368 }
4369 run_mutation_command(
4370 client,
4371 "items.mergeDuplicates",
4372 serde_json::json!({"keys": keys}),
4373 dry_run,
4374 )?
4375 }
4376 ItemsCommand::AddRelated {
4377 key,
4378 target,
4379 dry_run,
4380 ..
4381 } => run_mutation_command(
4382 client,
4383 "items.addRelated",
4384 serde_json::json!({"key": key, "targetKey": target}),
4385 dry_run,
4386 )?,
4387 ItemsCommand::RemoveRelated {
4388 key,
4389 target,
4390 dry_run,
4391 ..
4392 } => run_mutation_command(
4393 client,
4394 "items.removeRelated",
4395 serde_json::json!({"key": key, "targetKey": target}),
4396 dry_run,
4397 )?,
4398 ItemsCommand::Get { item, .. } => (
4399 client.call("items.get", Some(serde_json::json!({"key": item})))?,
4400 JsonStyle::Pretty,
4401 ),
4402 ItemsCommand::List {
4403 limit,
4404 offset,
4405 sort,
4406 direction,
4407 trash,
4408 ..
4409 } => {
4410 if trash {
4411 let value = client.call(
4412 "items.getTrash",
4413 Some(serde_json::json!({"limit": limit, "offset": offset})),
4414 )?;
4415 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4416 } else {
4417 let mut params = serde_json::json!({
4418 "limit": limit,
4419 "offset": offset,
4420 "direction": direction,
4421 });
4422 if let (Some(sort), Some(map)) = (sort, params.as_object_mut()) {
4423 map.insert("sort".to_string(), Value::String(sort));
4424 }
4425 let value = client.call("items.list", Some(params))?;
4426 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4427 }
4428 }
4429 ItemsCommand::FindDuplicates { .. } => (
4430 client.call("items.findDuplicates", None)?,
4431 JsonStyle::Pretty,
4432 ),
4433 ItemsCommand::Recent {
4434 limit,
4435 offset,
4436 recent_type,
4437 ..
4438 } => {
4439 if recent_type != "added" && recent_type != "modified" {
4440 return Err(format!(
4441 "--type must be added or modified, got {recent_type:?}"
4442 ));
4443 }
4444 let value = client.call(
4445 "items.getRecent",
4446 Some(
4447 serde_json::json!({"limit": limit, "offset": offset, "type": recent_type}),
4448 ),
4449 )?;
4450 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4451 }
4452 ItemsCommand::Fulltext { key, .. } => (
4453 client.call("items.getFullText", Some(serde_json::json!({"key": key})))?,
4454 JsonStyle::Pretty,
4455 ),
4456 ItemsCommand::Related { key, .. } => (
4457 normalize_list_envelope(
4458 client.call("items.getRelated", Some(serde_json::json!({"key": key})))?,
4459 "items",
4460 None,
4461 0,
4462 ),
4463 JsonStyle::Pretty,
4464 ),
4465 ItemsCommand::CitationKey { key, .. } => (
4466 client.call("items.citationKey", Some(serde_json::json!({"key": key})))?,
4467 JsonStyle::Pretty,
4468 ),
4469 ItemsCommand::Path { key, .. } => (
4470 localize_attachment_path_response(
4471 client.call("attachments.getPath", Some(serde_json::json!({"key": key})))?,
4472 ),
4473 JsonStyle::Pretty,
4474 ),
4475 ItemsCommand::Attachments { key, offset, .. } => {
4476 let value = client.call(
4477 "attachments.list",
4478 Some(serde_json::json!({"parentKey": key})),
4479 )?;
4480 let total = value
4481 .get("items")
4482 .and_then(Value::as_array)
4483 .map_or(0, |a| a.len()) as u64;
4484 (normalize_list_envelope(value, "items", Some(total), offset), JsonStyle::Pretty)
4485 }
4486 ItemsCommand::FindPdfs { collection, limit, .. } => {
4487 run_find_pdfs_command(client, collection, limit)?
4488 }
4489 };
4490 Ok((value, style))
4491}
4492
4493fn run_add_identifier_command(
4494 client: &mut impl RpcCaller,
4495 method: &str,
4496 param_name: &str,
4497 param_value: String,
4498 collection: Option<String>,
4499 dry_run: bool,
4500) -> Result<(Value, JsonStyle), String> {
4501 let mut params = Value::Object(serde_json::Map::from_iter([(
4502 param_name.to_string(),
4503 Value::String(param_value),
4504 )]));
4505 maybe_insert_collection(client, &mut params, collection)?;
4506 run_mutation_command(client, method, params, dry_run)
4507}
4508
4509fn run_mutation_command(
4510 client: &mut impl RpcCaller,
4511 method: &str,
4512 params: Value,
4513 dry_run: bool,
4514) -> Result<(Value, JsonStyle), String> {
4515 let value = if dry_run {
4516 serde_json::json!({
4517 "ok": true,
4518 "dryRun": true,
4519 "wouldCall": method,
4520 "wouldCallParams": params,
4521 })
4522 } else {
4523 client.call(method, Some(params))?
4524 };
4525 Ok((value, JsonStyle::PythonCompact))
4526}
4527
4528fn parse_field_options(fields: &[String]) -> Result<serde_json::Map<String, Value>, String> {
4529 let mut parsed = serde_json::Map::new();
4530 for field in fields {
4531 let (key, value) = field
4532 .split_once('=')
4533 .ok_or_else(|| format!("INVALID_ARGS: --field must be key=value, got: {field:?}"))?;
4534 parsed.insert(key.to_string(), Value::String(value.to_string()));
4535 }
4536 Ok(parsed)
4537}
4538
4539fn maybe_insert_collection(
4540 client: &mut impl RpcCaller,
4541 params: &mut Value,
4542 collection: Option<String>,
4543) -> Result<(), String> {
4544 let Some(collection) = collection else {
4545 return Ok(());
4546 };
4547 let collection = resolve_collection(client, &collection)?;
4548 let include = match &collection {
4549 Value::Null => false,
4550 Value::Number(number) => number.as_i64() != Some(0),
4551 _ => true,
4552 };
4553 if include {
4554 params
4555 .as_object_mut()
4556 .expect("mutation params are always objects")
4557 .insert("collection".to_string(), collection);
4558 }
4559 Ok(())
4560}
4561
4562fn run_settings_command(
4563 command: SettingsCommand,
4564 client: &mut impl RpcCaller,
4565) -> Result<(Value, JsonStyle), String> {
4566 let (value, style) = match command {
4567 SettingsCommand::Get { key, .. } => (
4568 client.call("settings.get", Some(serde_json::json!({"key": key})))?,
4569 JsonStyle::Pretty,
4570 ),
4571 SettingsCommand::List { .. } => (client.call("settings.getAll", None)?, JsonStyle::Pretty),
4572 SettingsCommand::Set {
4573 pairs,
4574 file,
4575 dry_run,
4576 ..
4577 } => {
4578 if let Some(file) = file {
4579 let raw = fs::read_to_string(&file)
4581 .map_err(|err| format!("INVALID_JSON: Could not read JSON: {err}"))?;
4582 let settings: Value = serde_json::from_str(&raw)
4583 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
4584 if dry_run {
4585 (
4586 dry_run_value("settings.setAll", settings),
4587 JsonStyle::PythonCompact,
4588 )
4589 } else {
4590 (
4591 client.call("settings.setAll", Some(settings))?,
4592 JsonStyle::PythonCompact,
4593 )
4594 }
4595 } else if pairs.len() == 2 {
4596 let key = &pairs[0];
4598 let value = &pairs[1];
4599 let parsed_value = serde_json::from_str::<Value>(value)
4600 .unwrap_or(Value::String(value.clone()));
4601 let params = serde_json::json!({"key": key, "value": parsed_value});
4602 if dry_run {
4603 (
4604 dry_run_value("settings.set", params),
4605 JsonStyle::PythonCompact,
4606 )
4607 } else {
4608 (
4609 client.call("settings.set", Some(params))?,
4610 JsonStyle::PythonCompact,
4611 )
4612 }
4613 } else if pairs.len() > 2 && pairs.len() % 2 == 0 {
4614 let mut map = serde_json::Map::new();
4616 for chunk in pairs.chunks(2) {
4617 let parsed = serde_json::from_str::<Value>(&chunk[1])
4618 .unwrap_or(Value::String(chunk[1].clone()));
4619 map.insert(chunk[0].clone(), parsed);
4620 }
4621 let settings = Value::Object(map);
4622 if dry_run {
4623 (
4624 dry_run_value("settings.setAll", settings),
4625 JsonStyle::PythonCompact,
4626 )
4627 } else {
4628 (
4629 client.call("settings.setAll", Some(settings))?,
4630 JsonStyle::PythonCompact,
4631 )
4632 }
4633 } else {
4634 return Err(
4635 "INVALID_ARGS: provide key value pairs (even number of args) or --file".into(),
4636 );
4637 }
4638 }
4639 };
4640 Ok((value, style))
4641}
4642
4643fn run_tags_command(
4644 command: TagsCommand,
4645 client: &mut impl RpcCaller,
4646) -> Result<(Value, JsonStyle), String> {
4647 let (value, style) = match command {
4648 TagsCommand::List { limit, .. } => {
4649 let value = client.call("tags.list", Some(serde_json::json!({"limit": limit})))?;
4650 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
4651 }
4652 TagsCommand::Rename {
4653 old, new, dry_run, ..
4654 } => run_tag_mutation(
4655 client,
4656 "tags.rename",
4657 serde_json::json!({"oldName": old, "newName": new}),
4658 dry_run,
4659 )?,
4660 TagsCommand::Delete { tag, dry_run, .. } => run_tag_mutation(
4661 client,
4662 "tags.delete",
4663 serde_json::json!({"tag": tag}),
4664 dry_run,
4665 )?,
4666 TagsCommand::Add {
4667 keys, tags, dry_run, ..
4668 } => {
4669 if keys.len() == 1 {
4670 run_tag_mutation(
4671 client,
4672 "tags.add",
4673 serde_json::json!({"key": keys[0], "tags": tags}),
4674 dry_run,
4675 )?
4676 } else {
4677 run_tag_mutation(
4678 client,
4679 "tags.batchUpdate",
4680 serde_json::json!({"keys": keys, "add": tags}),
4681 dry_run,
4682 )?
4683 }
4684 }
4685 TagsCommand::Remove {
4686 keys, tags, dry_run, ..
4687 } => {
4688 if keys.len() == 1 {
4689 run_tag_mutation(
4690 client,
4691 "tags.remove",
4692 serde_json::json!({"key": keys[0], "tags": tags}),
4693 dry_run,
4694 )?
4695 } else {
4696 run_tag_mutation(
4697 client,
4698 "tags.batchUpdate",
4699 serde_json::json!({"keys": keys, "remove": tags}),
4700 dry_run,
4701 )?
4702 }
4703 }
4704 };
4705 Ok((value, style))
4706}
4707
4708fn run_tag_mutation(
4709 client: &mut impl RpcCaller,
4710 method: &str,
4711 params: Value,
4712 dry_run: bool,
4713) -> Result<(Value, JsonStyle), String> {
4714 if dry_run {
4715 Ok((dry_run_value(method, params), JsonStyle::PythonCompact))
4716 } else {
4717 Ok((client.call(method, Some(params))?, JsonStyle::PythonCompact))
4718 }
4719}
4720
4721fn dry_run_value(method: &str, params: Value) -> Value {
4722 serde_json::json!({
4723 "ok": true,
4724 "dryRun": true,
4725 "wouldCall": method,
4726 "wouldCallParams": params,
4727 })
4728}
4729
4730fn run_annotations_command(
4731 command: AnnotationsCommand,
4732 client: &mut impl RpcCaller,
4733) -> Result<(Value, JsonStyle), String> {
4734 let (value, style) = match command {
4735 AnnotationsCommand::List { parent, attachment, .. } => {
4736 let mut params = serde_json::json!({"parentKey": parent});
4737 if let Some(att) = attachment {
4738 params["attachmentKey"] = Value::String(att);
4739 }
4740 let value = client.call("annotations.list", Some(params))?;
4741 let total = value
4742 .get("items")
4743 .and_then(Value::as_array)
4744 .map_or(0, |a| a.len()) as u64;
4745 (normalize_list_envelope(value, "items", Some(total), 0), JsonStyle::Pretty)
4746 }
4747 AnnotationsCommand::Create {
4748 parent,
4749 attachment,
4750 annotation_type,
4751 position,
4752 quote,
4753 page,
4754 sort_index,
4755 text,
4756 comment,
4757 color,
4758 dry_run,
4759 ..
4760 } => {
4761 let annotation_type = annotation_type.unwrap_or_else(|| "highlight".to_string());
4762 if !matches!(
4763 annotation_type.as_str(),
4764 "highlight" | "note" | "underline" | "image" | "ink"
4765 ) {
4766 return Err(format!(
4767 "INVALID_ARGS: --type must be highlight|note|underline|image|ink, got {annotation_type:?}"
4768 ));
4769 }
4770 let mut params = serde_json::Map::new();
4771 params.insert("parentKey".to_string(), Value::String(parent));
4772 if let Some(att) = attachment {
4773 params.insert("attachmentKey".to_string(), Value::String(att));
4774 }
4775 params.insert("type".to_string(), Value::String(annotation_type.clone()));
4776 params.insert("color".to_string(), Value::String(color));
4777
4778 if let Some(ref quote_text) = quote {
4779 if !matches!(annotation_type.as_str(), "highlight" | "underline") {
4780 return Err(format!(
4781 "INVALID_ARGS: --quote is only valid for highlight|underline, got {annotation_type:?}"
4782 ));
4783 }
4784 params.insert("quote".to_string(), Value::String(quote_text.clone()));
4785 if let Some(page_idx) = page {
4786 params.insert(
4787 "pageIndex".to_string(),
4788 Value::Number(page_idx.into()),
4789 );
4790 }
4791 if let Some(raw) = position {
4793 let pos = serde_json::from_str::<Value>(&raw)
4794 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))?;
4795 validate_annotation_position(annotation_type.as_str(), &pos)?;
4796 params.insert("position".to_string(), pos);
4797 }
4798 } else {
4799 let position = position
4800 .ok_or_else(|| "INVALID_ARGS: --position JSON is required (or use --quote)".to_string())
4801 .and_then(|raw| {
4802 serde_json::from_str::<Value>(&raw)
4803 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))
4804 })?;
4805 validate_annotation_position(annotation_type.as_str(), &position)?;
4806 params.insert("position".to_string(), position);
4807 }
4808
4809 if let Some(sort_index) = sort_index {
4810 params.insert(
4811 "sortIndex".to_string(),
4812 parse_annotation_sort_index(sort_index)?,
4813 );
4814 }
4815 if let Some(text) = text {
4816 params.insert("text".to_string(), Value::String(text));
4817 }
4818 if let Some(comment) = comment {
4819 params.insert("comment".to_string(), Value::String(comment));
4820 }
4821 run_mutating_command(client, "annotations.create", Value::Object(params), dry_run)?
4822 }
4823 AnnotationsCommand::Delete {
4824 annotation_key,
4825 dry_run,
4826 ..
4827 } => run_mutating_command(
4828 client,
4829 "annotations.delete",
4830 serde_json::json!({"key": annotation_key}),
4831 dry_run,
4832 )?,
4833 };
4834 Ok((value, style))
4835}
4836
4837fn validate_annotation_position(annotation_type: &str, position: &Value) -> Result<(), String> {
4838 position
4839 .get("pageIndex")
4840 .and_then(Value::as_i64)
4841 .filter(|value| *value >= 0)
4842 .ok_or_else(|| {
4843 "INVALID_ARGS: --position must include a non-negative integer pageIndex".to_string()
4844 })?;
4845
4846 if annotation_type == "ink" {
4847 let has_paths = position
4848 .get("paths")
4849 .and_then(Value::as_array)
4850 .is_some_and(|paths| !paths.is_empty());
4851 if !has_paths {
4852 return Err("INVALID_ARGS: ink --position must include non-empty paths".to_string());
4853 }
4854 return Ok(());
4855 }
4856
4857 let valid_rects = position
4858 .get("rects")
4859 .and_then(Value::as_array)
4860 .is_some_and(|rects| !rects.is_empty() && rects.iter().all(is_annotation_rect));
4861 if !valid_rects {
4862 return Err(
4863 "INVALID_ARGS: --position must include non-empty rects of [x1, y1, x2, y2]".to_string(),
4864 );
4865 }
4866 Ok(())
4867}
4868
4869fn is_annotation_rect(value: &Value) -> bool {
4870 value.as_array().is_some_and(|coords| {
4871 coords.len() == 4
4872 && coords
4873 .iter()
4874 .all(|coord| coord.as_f64().is_some_and(f64::is_finite))
4875 })
4876}
4877
4878fn parse_annotation_sort_index(raw: String) -> Result<Value, String> {
4879 let parsed = serde_json::from_str::<Value>(&raw).unwrap_or_else(|_| Value::String(raw));
4880 let valid = match &parsed {
4881 Value::Number(number) => number.as_f64().is_some_and(f64::is_finite),
4882 Value::String(value) => {
4883 is_zotero_pdf_sort_index(value.trim())
4884 || (!value.trim().is_empty()
4885 && value.trim().parse::<f64>().is_ok_and(f64::is_finite))
4886 }
4887 _ => false,
4888 };
4889 if valid {
4890 Ok(parsed)
4891 } else {
4892 Err(format!(
4893 "INVALID_ARGS: --sort-index must be a finite number or numeric string, got {parsed}"
4894 ))
4895 }
4896}
4897
/// Returns `true` when `value` is exactly three `|`-separated fields of
/// 5, 6 and 5 ASCII digits, in that order (e.g. `00001|000123|00045`).
fn is_zotero_pdf_sort_index(value: &str) -> bool {
    const FIELD_WIDTHS: [usize; 3] = [5, 6, 5];

    let mut fields = value.split('|');
    let fields_ok = FIELD_WIDTHS.iter().all(|&width| {
        fields
            .next()
            .is_some_and(|field| field.len() == width && field.bytes().all(|b| b.is_ascii_digit()))
    });
    // A fourth field means the value has too many separators.
    fields_ok && fields.next().is_none()
}
4911
4912
4913fn localize_attachment_path_response(mut value: Value) -> Value {
4914 if let Some(path) = value.get("path").and_then(Value::as_str) {
4915 let local = local_path_from_zotero_path(path);
4916 if let Some(map) = value.as_object_mut() {
4917 map.insert("path".to_string(), Value::String(local));
4918 }
4919 }
4920 value
4921}
4922
4923fn run_notes_command(
4924 command: NotesCommand,
4925 client: &mut impl RpcCaller,
4926) -> Result<(Value, JsonStyle), String> {
4927 let (value, style) = match command {
4928 NotesCommand::List {
4929 parent,
4930 limit,
4931 offset,
4932 ..
4933 } => {
4934 let value = client.call(
4935 "notes.list",
4936 Some(serde_json::json!({"parentKey": parent})),
4937 )?;
4938 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4939 }
4940 NotesCommand::Get { note_key, .. } => {
4941 let value = client.call("notes.get", Some(serde_json::json!({"key": note_key})))?;
4942 (value, JsonStyle::Pretty)
4943 }
4944 NotesCommand::Create {
4945 parent,
4946 content,
4947 tags,
4948 dry_run,
4949 ..
4950 } => {
4951 let mut params = serde_json::Map::new();
4952 params.insert("parentKey".to_string(), Value::String(parent));
4953 params.insert("content".to_string(), Value::String(content));
4954 if !tags.is_empty() {
4955 params.insert(
4956 "tags".to_string(),
4957 Value::Array(tags.into_iter().map(Value::String).collect()),
4958 );
4959 }
4960 run_mutating_command(client, "notes.create", Value::Object(params), dry_run)?
4961 }
4962 NotesCommand::Update {
4963 note_key,
4964 content,
4965 dry_run,
4966 ..
4967 } => run_mutating_command(
4968 client,
4969 "notes.update",
4970 serde_json::json!({"key": note_key, "content": content}),
4971 dry_run,
4972 )?,
4973 NotesCommand::Delete {
4974 note_key, dry_run, ..
4975 } => {
4976 run_mutating_command(
4978 client,
4979 "items.delete",
4980 serde_json::json!({"key": note_key}),
4981 dry_run,
4982 )?
4983 }
4984 NotesCommand::Search { query, limit, .. } => {
4985 let value = client.call(
4986 "notes.search",
4987 Some(serde_json::json!({"query": query, "limit": limit})),
4988 )?;
4989 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
4990 }
4991 };
4992 Ok((value, style))
4993}
4994
4995fn run_mutating_command(
4996 client: &mut impl RpcCaller,
4997 method: &str,
4998 params: Value,
4999 dry_run: bool,
5000) -> Result<(Value, JsonStyle), String> {
5001 if dry_run {
5002 Ok((
5003 serde_json::json!({
5004 "ok": true,
5005 "dryRun": true,
5006 "wouldCall": method,
5007 "wouldCallParams": params,
5008 }),
5009 JsonStyle::PythonCompact,
5010 ))
5011 } else {
5012 client
5013 .call(method, Some(params))
5014 .map(|value| (value, JsonStyle::PythonCompact))
5015 }
5016}
5017
5018fn run_collections_command(
5019 command: CollectionsCommand,
5020 client: &mut impl RpcCaller,
5021) -> Result<(Value, JsonStyle), String> {
5022 let value = match command {
5023 CollectionsCommand::List { .. } => normalize_list_envelope(
5024 client.call("collections.list", None)?,
5025 "items",
5026 None,
5027 0,
5028 ),
5029 CollectionsCommand::Tree { .. } => client.call("collections.tree", None)?,
5030 CollectionsCommand::Get { name_or_id, .. } => {
5031 let key = resolve_collection(client, &name_or_id)?;
5032 client.call("collections.get", Some(serde_json::json!({"key": key})))?
5033 }
5034 CollectionsCommand::GetItems {
5035 name_or_id,
5036 limit,
5037 offset,
5038 ..
5039 } => {
5040 let key = resolve_collection(client, &name_or_id)?;
5041 let mut params = serde_json::json!({"key": key});
5042 if let Some(map) = params.as_object_mut() {
5043 if let Some(limit) = limit {
5044 map.insert("limit".to_string(), Value::Number(limit.into()));
5045 }
5046 if offset > 0 {
5047 map.insert("offset".to_string(), Value::Number(offset.into()));
5048 }
5049 }
5050 normalize_list_envelope(
5051 client.call("collections.getItems", Some(params))?,
5052 "items",
5053 limit,
5054 offset,
5055 )
5056 }
5057 CollectionsCommand::Stats { name_or_id, .. } => {
5058 let key = resolve_collection(client, &name_or_id)?;
5059 client.call("collections.stats", Some(serde_json::json!({"key": key})))?
5060 }
5061 CollectionsCommand::Rename {
5062 old_name,
5063 new_name,
5064 dry_run,
5065 ..
5066 } => {
5067 let key = resolve_mutable_collection(client, &old_name, "rename")?;
5068 let params = serde_json::json!({"key": key, "name": new_name});
5069 if dry_run {
5070 return Ok((
5071 dry_run_value("collections.rename", params),
5072 JsonStyle::PythonCompact,
5073 ));
5074 }
5075 return Ok((
5076 client.call("collections.rename", Some(params))?,
5077 JsonStyle::PythonCompact,
5078 ));
5079 }
5080 CollectionsCommand::Create {
5081 name,
5082 parent,
5083 dry_run,
5084 ..
5085 } => {
5086 let mut params = serde_json::json!({"name": name});
5087 if let Some(parent) = parent {
5088 let parent_key = resolve_mutable_collection(client, &parent, "use as parent")?;
5089 if let Some(map) = params.as_object_mut() {
5090 map.insert("parentKey".to_string(), parent_key);
5091 }
5092 }
5093 if dry_run {
5094 return Ok((
5095 dry_run_value("collections.create", params),
5096 JsonStyle::PythonCompact,
5097 ));
5098 }
5099 return Ok((
5100 client.call("collections.create", Some(params))?,
5101 JsonStyle::PythonCompact,
5102 ));
5103 }
5104 CollectionsCommand::Delete {
5105 name_or_id,
5106 dry_run,
5107 ..
5108 } => {
5109 let key = resolve_mutable_collection(client, &name_or_id, "delete")?;
5110 let params = serde_json::json!({"key": key});
5111 if dry_run {
5112 return Ok((
5113 dry_run_value("collections.delete", params),
5114 JsonStyle::PythonCompact,
5115 ));
5116 }
5117 return Ok((
5118 client.call("collections.delete", Some(params))?,
5119 JsonStyle::PythonCompact,
5120 ));
5121 }
5122 CollectionsCommand::AddItems {
5123 collection,
5124 item_keys,
5125 dry_run,
5126 ..
5127 } => {
5128 let key = resolve_mutable_collection(client, &collection, "add to")?;
5129 let params = serde_json::json!({"key": key, "keys": item_keys});
5130 if dry_run {
5131 return Ok((
5132 dry_run_value("collections.addItems", params),
5133 JsonStyle::PythonCompact,
5134 ));
5135 }
5136 return Ok((
5137 client.call("collections.addItems", Some(params))?,
5138 JsonStyle::PythonCompact,
5139 ));
5140 }
5141 CollectionsCommand::RemoveItems {
5142 collection,
5143 item_keys,
5144 dry_run,
5145 ..
5146 } => {
5147 let key = resolve_mutable_collection(client, &collection, "operate on")?;
5148 let params = serde_json::json!({"key": key, "keys": item_keys});
5149 if dry_run {
5150 return Ok((
5151 dry_run_value("collections.removeItems", params),
5152 JsonStyle::PythonCompact,
5153 ));
5154 }
5155 return Ok((
5156 client.call("collections.removeItems", Some(params))?,
5157 JsonStyle::PythonCompact,
5158 ));
5159 }
5160 };
5161 Ok((value, JsonStyle::Pretty))
5162}
5163
5164fn resolve_export_keys(
5165 client: &mut impl RpcCaller,
5166 mut keys: Vec<String>,
5167 collection: Option<String>,
5168) -> Result<Vec<String>, String> {
5169 if let Some(name) = collection {
5170 let col_key = resolve_collection(client, &name)?;
5171 let response = client.call(
5172 "collections.getItems",
5173 Some(serde_json::json!({"key": col_key})),
5174 )?;
5175 let items = collection_items(&response);
5176 for item in items {
5177 if let Some(key) = item.get("key").and_then(Value::as_str) {
5178 if !keys.contains(&key.to_string()) {
5179 keys.push(key.to_string());
5180 }
5181 }
5182 }
5183 }
5184 if keys.is_empty() {
5185 return Err("No item keys provided. Pass positional keys and/or --collection.".to_string());
5186 }
5187 Ok(keys)
5188}
5189
5190fn run_export(args: ExportArgs, client: &mut impl RpcCaller) -> Result<String, String> {
5191 let keys = resolve_export_keys(client, args.keys, args.collection)?;
5192 match args.format.as_str() {
5193 "bibtex" => run_export_content_command(client, "export.bibtex", keys),
5194 "ris" => run_export_content_command(client, "export.ris", keys),
5195 "csl-json" => {
5196 let response =
5197 client.call("export.cslJson", Some(serde_json::json!({"keys": keys})))?;
5198 if let Some(content) = response.get("content") {
5199 format_json(content, JsonStyle::Pretty)
5200 } else {
5201 format_json(&response, JsonStyle::PythonCompact)
5202 }
5203 }
5204 "bibliography" => {
5205 let response = client.call(
5206 "export.bibliography",
5207 Some(serde_json::json!({"keys": keys, "style": args.style})),
5208 )?;
5209 if let Some(object) = response.as_object() {
5210 let field = if args.html { "html" } else { "text" };
5211 if object.contains_key("html") || object.contains_key("text") {
5212 return raw_value_output(
5213 object.get(field).unwrap_or(&Value::String(String::new())),
5214 );
5215 }
5216 }
5217 format_json(&response, JsonStyle::PythonCompact)
5218 }
5219 other => Err(format!(
5220 "INVALID_ARGS: unknown format {other:?}, expected bibtex/ris/csl-json/bibliography"
5221 )),
5222 }
5223}
5224
5225fn run_export_content_command(
5226 client: &mut impl RpcCaller,
5227 method: &str,
5228 keys: Vec<String>,
5229) -> Result<String, String> {
5230 let response = client.call(method, Some(serde_json::json!({"keys": keys})))?;
5231 if let Some(content) = response.get("content") {
5232 raw_value_output(content)
5233 } else {
5234 format_json(&response, JsonStyle::PythonCompact)
5235 }
5236}
5237
5238fn raw_value_output(value: &Value) -> Result<String, String> {
5239 let mut out = match value {
5240 Value::Null => String::new(),
5241 Value::String(content) => content.clone(),
5242 other => to_python_repr(other),
5243 };
5244 out.push('\n');
5245 Ok(out)
5246}
5247
5248fn to_python_repr(value: &Value) -> String {
5249 match value {
5250 Value::Null => "None".to_string(),
5251 Value::Bool(value) => {
5252 if *value {
5253 "True".to_string()
5254 } else {
5255 "False".to_string()
5256 }
5257 }
5258 Value::Number(value) => value.to_string(),
5259 Value::String(value) => format!("'{}'", value.replace('\\', "\\\\").replace('\'', "\\'")),
5260 Value::Array(values) => {
5261 let inner = values
5262 .iter()
5263 .map(to_python_repr)
5264 .collect::<Vec<_>>()
5265 .join(", ");
5266 format!("[{inner}]")
5267 }
5268 Value::Object(entries) => {
5269 let inner = entries
5270 .iter()
5271 .map(|(key, value)| {
5272 format!("'{}': {}", key.replace('\'', "\\'"), to_python_repr(value))
5273 })
5274 .collect::<Vec<_>>()
5275 .join(", ");
5276 format!("{{{inner}}}")
5277 }
5278 }
5279}
5280
5281fn resolve_collection(client: &mut impl RpcCaller, name_or_id: &str) -> Result<Value, String> {
5282 let trimmed = name_or_id.trim();
5283 if let Ok(id) = trimmed.parse::<i64>() {
5284 return Ok(Value::Number(id.into()));
5285 }
5286
5287 let collections = client.call("collections.list", None)?;
5288 let items = collections
5289 .get("items")
5290 .and_then(Value::as_array)
5291 .or_else(|| collections.as_array())
5292 .ok_or_else(|| "collections.list returned non-array result".to_string())?;
5293
5294 if let Some(collection) = items
5295 .iter()
5296 .find(|collection| collection.get("key").and_then(Value::as_str) == Some(trimmed))
5297 {
5298 return collection_key(collection);
5299 }
5300
5301 let exact = items
5302 .iter()
5303 .filter(|collection| collection.get("name").and_then(Value::as_str) == Some(trimmed))
5304 .collect::<Vec<_>>();
5305 if exact.len() == 1 {
5306 return collection_key(exact[0]);
5307 }
5308
5309 let needle = normalize_collection_name(trimmed);
5310 let fuzzy = items
5311 .iter()
5312 .filter(|collection| {
5313 collection
5314 .get("name")
5315 .and_then(Value::as_str)
5316 .map(normalize_collection_name)
5317 .is_some_and(|name| name.contains(&needle))
5318 })
5319 .collect::<Vec<_>>();
5320
5321 match fuzzy.len() {
5322 1 => collection_key(fuzzy[0]),
5323 0 => Err(format!(
5324 "COLLECTION_NOT_FOUND: No collection named {trimmed:?}"
5325 )),
5326 _ => Err(format!(
5327 "COLLECTION_AMBIGUOUS: Multiple collections match {trimmed:?}"
5328 )),
5329 }
5330}
5331
5332fn collection_key(collection: &Value) -> Result<Value, String> {
5333 collection
5334 .get("key")
5335 .cloned()
5336 .ok_or_else(|| "collection result is missing key".to_string())
5337}
5338
5339fn resolve_mutable_collection(
5340 client: &mut impl RpcCaller,
5341 name_or_id: &str,
5342 operation: &str,
5343) -> Result<Value, String> {
5344 let key = resolve_collection(client, name_or_id)?;
5345 if key.as_i64() == Some(0) {
5346 return Err(format!(
5347 "COLLECTION_NOT_FOUND: {name_or_id:?} resolved to library root (cannot {operation})"
5348 ));
5349 }
5350 Ok(key)
5351}
5352
/// Normalizes a collection name for comparison: surrounding whitespace is
/// dropped, inner whitespace runs collapse to a single space, and the result
/// is lowercased.
fn normalize_collection_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    // Lowercase the whole string at once so context-sensitive mappings see
    // the same neighbours as in the joined text.
    collapsed.to_lowercase()
}
5359
5360fn format_json(value: &Value, style: JsonStyle) -> Result<String, String> {
5361 let mut out = match style {
5362 JsonStyle::PythonCompact => to_python_compact_json(value),
5363 JsonStyle::Pretty => serde_json::to_string_pretty(value).map_err(|err| err.to_string())?,
5364 };
5365 out.push('\n');
5366 Ok(out)
5367}
5368
5369fn to_python_compact_json(value: &Value) -> String {
5370 match value {
5371 Value::Null => "null".to_string(),
5372 Value::Bool(value) => value.to_string(),
5373 Value::Number(value) => value.to_string(),
5374 Value::String(value) => {
5375 serde_json::to_string(value).expect("string serialization cannot fail")
5376 }
5377 Value::Array(values) => {
5378 let inner = values
5379 .iter()
5380 .map(to_python_compact_json)
5381 .collect::<Vec<_>>()
5382 .join(", ");
5383 format!("[{inner}]")
5384 }
5385 Value::Object(entries) => {
5386 let inner = entries
5387 .iter()
5388 .map(|(key, value)| {
5389 let key = serde_json::to_string(key).expect("string serialization cannot fail");
5390 format!("{key}: {}", to_python_compact_json(value))
5391 })
5392 .collect::<Vec<_>>()
5393 .join(", ");
5394 format!("{{{inner}}}")
5395 }
5396 }
5397}