1use std::{
4 env, fs,
5 io::{self, Read},
6 path::{Path, PathBuf},
7 process::Command as ProcessCommand,
8 thread,
9 time::{Duration, Instant, SystemTime, UNIX_EPOCH},
10};
11
12use clap::{error::ErrorKind, Parser, Subcommand};
13use serde_json::Value;
14use zotron_rpc::{StdProviderCommandRunner, UreqProviderHttpTransport, ZoteroRpc};
15use zotron_types::{
16 bm25_score_chunks, build_embedding_provider_request, build_ocr_provider_request,
17 builtin_ocr_provider_specs, cosine_similarity, execute_embedding_provider_request,
18 is_zotron_evidence_artifact, machine_artifact_exists_for_item,
19 machine_artifact_exists_in_sidecar, machine_artifact_store_root,
20 ocr_provider_spec as raw_ocr_provider_spec, parse_embedding_provider_response,
21 parse_ocr_provider_response, read_machine_artifact_sidecar, rrf_merge,
22 write_machine_artifact_sidecar, ArtifactStorePlatform, EmbeddingChunkInput,
23 EmbeddingRequestInput, EmbeddingVector, MachineArtifactKind, OcrRequestInput,
24 ProviderCommandRunner, ProviderHttpInvocation, ProviderHttpTransport, StructureChunk,
25 DEFAULT_RPC_URL,
26};
27
/// Minimal JSON-RPC calling interface used by the command handlers, so they
/// can be driven by any client (this file implements it for `ZoteroRpc`).
pub trait RpcCaller {
    /// Invokes `method` with optional JSON `params`.
    ///
    /// Returns the JSON result on success, or the failure rendered as a
    /// string.
    fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String>;
}
31
/// Serializable description of an OCR provider as exposed by the CLI.
///
/// Built from a `zotron_types::OcrProviderSpec` by `cli_ocr_provider_spec`.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliOcrProviderSpec {
    /// Provider identifier; populated from the upstream `provider_key`.
    pub id: &'static str,
    /// Provider key; populated from the same upstream `provider_key` as `id`.
    pub provider: &'static str,
    /// String form of the upstream request style.
    pub request_style: &'static str,
    /// Authentication scheme label, copied from the upstream spec.
    pub auth: &'static str,
    /// Authentication header name, copied from the upstream spec.
    pub auth_header: &'static str,
    /// Upstream `supports_pdf_direct` flag, copied verbatim.
    pub supports_pdf_direct: bool,
    /// Upstream `key_field` value, copied verbatim.
    pub key_field: &'static str,
}
42
/// Serializable description of an embedding provider as exposed by the CLI.
///
/// Built by `embedding_provider_spec` from the upstream `zotron_types` spec.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliEmbeddingProviderSpec {
    /// Provider identifier, copied from the upstream `id`.
    pub id: &'static str,
    /// Provider key, copied from the upstream `provider_key`.
    pub provider: &'static str,
    /// Request style label; `"dashscope"` when the provider key is
    /// `"alibaba"`, otherwise the upstream style's string form.
    pub request_style: &'static str,
    /// Default endpoint URL; the empty string when the upstream spec has none.
    pub default_url: String,
    /// Default model name, copied from the upstream spec.
    pub default_model: &'static str,
    /// Authentication scheme label, copied from the upstream spec.
    pub auth: &'static str,
    /// Upstream `key_field` value, copied verbatim.
    pub key_field: &'static str,
}
53
54pub fn ocr_provider_specs() -> Vec<CliOcrProviderSpec> {
55 builtin_ocr_provider_specs()
56 .into_iter()
57 .map(cli_ocr_provider_spec)
58 .collect()
59}
60
61pub fn ocr_provider_spec(provider: &str) -> Result<CliOcrProviderSpec, String> {
62 zotron_types::ocr_provider_spec(provider).map(cli_ocr_provider_spec)
63}
64
65pub fn embedding_provider_spec(provider: &str) -> Result<CliEmbeddingProviderSpec, String> {
66 let spec = zotron_types::embedding_provider_spec(provider)?;
67 Ok(CliEmbeddingProviderSpec {
68 id: spec.id,
69 provider: spec.provider_key,
70 request_style: if spec.provider_key == "alibaba" {
71 "dashscope"
72 } else {
73 spec.request_style.as_str()
74 },
75 default_url: spec.default_url.unwrap_or("").to_string(),
76 default_model: spec.default_model,
77 auth: spec.auth,
78 key_field: spec.key_field,
79 })
80}
81
82pub fn chunks_from_blocks(blocks: &[Value], max_chars: usize) -> Result<Vec<Value>, String> {
83 let typed = blocks
84 .iter()
85 .map(json_block_to_pdf_block)
86 .collect::<Result<Vec<_>, _>>()?;
87 let chunks = zotron_types::chunks_from_blocks(&typed, max_chars);
88 chunks
89 .into_iter()
90 .map(|chunk| chunk_to_cli_value(&chunk, &typed))
91 .collect()
92}
93
94fn cli_ocr_provider_spec(spec: zotron_types::OcrProviderSpec) -> CliOcrProviderSpec {
95 CliOcrProviderSpec {
96 id: spec.provider_key,
97 provider: spec.provider_key,
98 request_style: spec.request_style.as_str(),
99 auth: spec.auth,
100 auth_header: spec.auth_header,
101 supports_pdf_direct: spec.supports_pdf_direct,
102 key_field: spec.key_field,
103 }
104}
105
106fn json_block_to_pdf_block(value: &Value) -> Result<zotron_types::PdfEvidenceBlock, String> {
107 let block_key = value
108 .get("block_key")
109 .and_then(Value::as_str)
110 .ok_or_else(|| "block missing block_key".to_string())?
111 .to_string();
112 let item_key = value
113 .get("item_key")
114 .and_then(Value::as_str)
115 .ok_or_else(|| "block missing item_key".to_string())?
116 .to_string();
117 let attachment_key = value
118 .get("attachment_key")
119 .and_then(Value::as_str)
120 .ok_or_else(|| "block missing attachment_key".to_string())?
121 .to_string();
122 let page_idx = value
123 .get("page_idx")
124 .or_else(|| value.get("page"))
125 .and_then(Value::as_u64)
126 .unwrap_or(1);
127 let block_type = value
128 .get("type")
129 .or_else(|| value.get("block_type"))
130 .and_then(Value::as_str)
131 .unwrap_or("paragraph")
132 .to_string();
133 let section_path = value
134 .get("section_path")
135 .and_then(Value::as_array)
136 .map(|items| {
137 items
138 .iter()
139 .filter_map(Value::as_str)
140 .map(ToString::to_string)
141 .collect::<Vec<_>>()
142 })
143 .unwrap_or_default();
144 let text = value
145 .get("text")
146 .and_then(Value::as_str)
147 .unwrap_or("")
148 .to_string();
149 let bbox = value.get("bbox").and_then(value_bbox4);
150
151 Ok(zotron_types::PdfEvidenceBlock {
152 block_key,
153 item_key,
154 attachment_key,
155 page_idx,
156 block_type,
157 bbox,
158 section_path,
159 text,
160 })
161}
162
163fn chunk_to_cli_value(
164 chunk: &zotron_types::StructureChunk,
165 blocks: &[zotron_types::PdfEvidenceBlock],
166) -> Result<Value, String> {
167 let refs = chunk
168 .block_keys
169 .iter()
170 .filter_map(|key| blocks.iter().find(|block| &block.block_key == key))
171 .map(|block| {
172 serde_json::json!({
173 "block_key": block.block_key,
174 "page_idx": block.page_idx,
175 "bbox": block.bbox.map(|bbox| bbox.iter().map(|n| {
176 if n.fract() == 0.0 {
177 Value::from(*n as i64)
178 } else {
179 Value::from(*n)
180 }
181 }).collect::<Vec<_>>()),
182 })
183 })
184 .collect::<Vec<_>>();
185 Ok(serde_json::json!({
186 "chunk_key": chunk.chunk_key,
187 "item_key": chunk.item_key,
188 "attachment_key": chunk.attachment_key,
189 "block_keys": chunk.block_keys,
190 "section_path": chunk.section_path,
191 "text": chunk.text,
192 "page_start": chunk.page_start,
193 "page_end": chunk.page_end,
194 "evidence_refs": refs,
195 }))
196}
197
198fn value_bbox4(value: &Value) -> Option<[f64; 4]> {
199 let arr = value.as_array()?;
200 if arr.len() != 4 {
201 return None;
202 }
203 Some([
204 arr[0].as_f64()?,
205 arr[1].as_f64()?,
206 arr[2].as_f64()?,
207 arr[3].as_f64()?,
208 ])
209}
210
211impl RpcCaller for ZoteroRpc {
212 fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String> {
213 self.call(method, params).map_err(|err| err.to_string())
214 }
215}
216
// Top-level clap parser for the `zotron` binary.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI's output.
#[derive(Debug, Parser)]
#[command(name = "zotron", about = "Rust client + CLI for the Zotron XPI")]
struct Cli {
    // The selected top-level subcommand.
    #[command(subcommand)]
    command: Command,
}
223
// Subcommands of `zotron ocr` (wired in through `Command::Ocr`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum OcrCommand {
    // Prints the built-in OCR provider specs; handled without calling the
    // RPC client.
    Providers,
    // Sends a single OCR request to a provider. Exactly one of `--input` and
    // `--file` must be supplied (checked in `run_ocr_run_command`).
    #[command(name = "run")]
    Run {
        // Required provider name.
        #[arg(long)]
        provider: String,
        // Source of a JSON-encoded OCR request input.
        #[arg(long)]
        input: Option<String>,
        // File to build the OCR request input from.
        #[arg(long)]
        file: Option<String>,
        // Forwarded to `ocr_input_from_file` together with `--file`.
        #[arg(long = "item-key")]
        item_key: Option<String>,
        // Forwarded to `ocr_input_from_file` together with `--file`.
        #[arg(long = "attachment-key")]
        attachment_key: Option<String>,
        // Forwarded to `ocr_input_from_file` together with `--file`.
        #[arg(long = "mime-type")]
        mime_type: Option<String>,
        // Overrides the URL from the provider request when set.
        #[arg(long)]
        endpoint: Option<String>,
        // Passed to the HTTP transport's auth setup.
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    // OCR status for a collection, queried over RPC.
    Status {
        // Required collection selector.
        #[arg(long)]
        collection: String,
        // RPC endpoint; defaults to `DEFAULT_RPC_URL`.
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // OCR processing for one parent item; options are handed to
    // `run_ocr_process_command` as `OcrProcessOptions`.
    #[command(name = "process")]
    Process {
        // Provider name; defaults to "mineru".
        #[arg(long, default_value = "mineru")]
        provider: String,
        // Required parent item.
        #[arg(long)]
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long = "source-url")]
        source_url: Option<String>,
        #[arg(long = "result-dir")]
        result_dir: Option<String>,
        #[arg(long = "result-zip")]
        result_zip: Option<String>,
        #[arg(long = "provider-endpoint")]
        provider_endpoint: Option<String>,
        // Defaults to "ZOTRON_MINERU_API_KEY".
        #[arg(long = "api-key-env", default_value = "ZOTRON_MINERU_API_KEY")]
        api_key_env: String,
        // Defaults to 5.
        #[arg(long = "poll-interval-seconds", default_value_t = 5)]
        poll_interval_seconds: u64,
        // Defaults to 900.
        #[arg(long = "timeout-seconds", default_value_t = 900)]
        timeout_seconds: u64,
        // Defaults to 1200.
        #[arg(long = "chunk-chars", default_value_t = 1200)]
        chunk_chars: usize,
        // RPC endpoint; defaults to `DEFAULT_RPC_URL`.
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
298
// Top-level subcommands of the `zotron` CLI.
// Plain `//` comments only: clap's derive would turn `///` into help text.
// Every RPC-backed variant carries a `--url` flag defaulting to
// `DEFAULT_RPC_URL`; `command_url` extracts it.
#[derive(Debug, Subcommand)]
enum Command {
    Ping {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Raw RPC call: positional method name plus optional positional JSON
    // params (default "{}").
    Rpc {
        method: String,
        #[arg(default_value = "{}")]
        params_json: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        paginate: bool,
        // Defaults to 100.
        #[arg(long, default_value_t = 100)]
        page_size: usize,
    },
    // Takes a positional JSON file path plus optional flags.
    Push {
        json_file: String,
        #[arg(long)]
        pdf: Option<String>,
        #[arg(long)]
        collection: Option<String>,
        // Defaults to "skip".
        #[arg(long = "on-duplicate", default_value = "skip")]
        on_duplicate: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long = "dry-run")]
        dry_run: bool,
    },
    System {
        #[command(subcommand)]
        command: SystemCommand,
    },
    Search(SearchArgs),
    Items {
        #[command(subcommand)]
        command: ItemsCommand,
    },
    Collections {
        #[command(subcommand)]
        command: CollectionsCommand,
    },
    Notes {
        #[command(subcommand)]
        command: NotesCommand,
    },
    Attachments {
        #[command(subcommand)]
        command: AttachmentsCommand,
    },
    Settings {
        #[command(subcommand)]
        command: SettingsCommand,
    },
    Tags {
        #[command(subcommand)]
        command: TagsCommand,
    },
    Export(ExportArgs),
    Annotations {
        #[command(subcommand)]
        command: AnnotationsCommand,
    },
    Ocr {
        #[command(subcommand)]
        command: OcrCommand,
    },
    Rag {
        #[command(subcommand)]
        command: RagCommand,
    },
    #[command(name = "find-pdfs")]
    FindPdfs {
        // Required collection selector.
        #[arg(long)]
        collection: String,
        // Defaults to 0.
        #[arg(long, default_value_t = 0)]
        limit: usize,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
402
/// Options bundle for a RAG search; its fields mirror the arguments of
/// `RagCommand::Search` except for `url`.
struct RagSearchOptions {
    /// Search query text.
    query: String,
    /// Optional collection filter.
    collection: Option<String>,
    /// Item keys given via repeated `--key`.
    keys: Vec<String>,
    /// Value of the `--zotero` flag.
    zotero: bool,
    /// Value of `--top-spans-per-item`.
    top_spans_per_item: u64,
    /// Value of the `--include-fulltext-spans` flag.
    include_fulltext_spans: bool,
    /// Value of `--limit` (alias `--top-k`).
    top_k: u64,
    /// Output format; the CLI restricts it to `json` or `jsonl`.
    output: String,
}
413
// Subcommands of `zotron rag` (wired in through `Command::Rag`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum RagCommand {
    // Has no `--url`; `rag_command_url` reports `DEFAULT_RPC_URL` for it.
    #[command(name = "providers")]
    Providers,
    // Has no `--url`; `rag_command_url` reports `DEFAULT_RPC_URL` for it.
    #[command(name = "embed")]
    Embed {
        // Required provider name.
        #[arg(long)]
        provider: String,
        // Required input.
        #[arg(long)]
        input: String,
        #[arg(long)]
        endpoint: Option<String>,
        #[arg(long)]
        model: Option<String>,
        #[arg(long = "input-type")]
        input_type: Option<String>,
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    Status {
        // Required collection selector.
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "search")]
    Search {
        // Positional query text.
        query: String,
        #[arg(long)]
        collection: Option<String>,
        // Repeatable `--key`; `--keys` is accepted as an alias.
        #[arg(long = "key", alias = "keys")]
        keys: Vec<String>,
        #[arg(long)]
        zotero: bool,
        // Defaults to 3.
        #[arg(long = "top-spans-per-item", default_value_t = 3)]
        top_spans_per_item: u64,
        #[arg(long = "include-fulltext-spans")]
        include_fulltext_spans: bool,
        // Exposed as `--limit` with alias `--top-k`; defaults to 50.
        #[arg(long = "limit", alias = "top-k", default_value_t = 50)]
        top_k: u64,
        // Restricted to "json" or "jsonl"; defaults to "json".
        #[arg(long, default_value = "json", value_parser = ["json", "jsonl"])]
        output: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
470
// Subcommands of `zotron system` (wired in through `Command::System`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
// Every variant carries `--url`, defaulting to `DEFAULT_RPC_URL`.
#[derive(Debug, Subcommand)]
enum SystemCommand {
    Version {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Libraries {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "library-stats")]
    LibraryStats {
        // Optional numeric library selector.
        #[arg(long)]
        library: Option<i64>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Schema {
        // Exposed as `--type`.
        #[arg(long = "type")]
        item_type: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "current-collection")]
    CurrentCollection {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "list-methods")]
    ListMethods {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Describe {
        // Optional positional method name.
        method: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
517
// Arguments of `zotron search` (wired in through `Command::Search`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
// Either used directly (query plus filter flags) or with one of the
// `SearchManagementCommand` subcommands in `management`.
#[derive(Debug, clap::Args)]
struct SearchArgs {
    // Optional positional query text.
    query: Option<String>,
    #[arg(long)]
    fulltext: bool,
    #[arg(long)]
    author: Option<String>,
    #[arg(long)]
    after: Option<String>,
    #[arg(long)]
    before: Option<String>,
    #[arg(long)]
    journal: Option<String>,
    #[arg(long)]
    tag: Option<String>,
    #[arg(long)]
    doi: Option<String>,
    #[arg(long)]
    isbn: Option<String>,
    #[arg(long)]
    issn: Option<String>,
    #[arg(long)]
    collection: Option<String>,
    // Defaults to 50.
    #[arg(long, default_value_t = 50)]
    limit: u64,
    // Defaults to 0.
    #[arg(long, default_value_t = 0)]
    offset: u64,
    // Used by `command_url` only when no management subcommand is given.
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
    // Optional nested subcommand; when present its own `--url` is the one
    // `command_url` returns.
    #[command(subcommand)]
    management: Option<SearchManagementCommand>,
}
561
// Optional subcommands nested under `zotron search` (see
// `SearchArgs::management`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum SearchManagementCommand {
    #[command(name = "saved-searches")]
    SavedSearches {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "create-saved")]
    CreateSaved {
        // Positional name.
        name: String,
        // Repeatable `--condition`; at least one is required.
        #[arg(long = "condition", required = true)]
        condition: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "delete-saved")]
    DeleteSaved {
        // Positional key of the saved search.
        search_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
591
// Subcommands of `zotron items` (wired in through `Command::Items`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
// Every variant carries `--url` (default `DEFAULT_RPC_URL`); the variants that
// take `--dry-run` are the mutating-looking ones (Add through RemoveRelated).
#[derive(Debug, Subcommand)]
enum ItemsCommand {
    // All source flags are optional at the clap level.
    Add {
        #[arg(long)]
        doi: Option<String>,
        #[arg(long)]
        isbn: Option<String>,
        #[arg(long = "from-url")]
        from_url: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        collection: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Create {
        // Required; exposed as `--type`.
        #[arg(long = "type")]
        item_type: String,
        // Repeatable `--field`.
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Update {
        // Positional item key.
        key: String,
        // Repeatable `--field`.
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        // Positional item key.
        key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Trash {
        // Positional list of items.
        items: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Restore {
        // Positional item.
        item: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "merge-duplicates")]
    MergeDuplicates {
        // Positional list of item keys.
        keys: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "add-related")]
    AddRelated {
        // Positional item key.
        key: String,
        // Required `--target`.
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "remove-related")]
    RemoveRelated {
        // Positional item key.
        key: String,
        // Required `--target`.
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        // Positional item.
        item: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    List {
        // Defaults to 50.
        #[arg(long, default_value_t = 50)]
        limit: u64,
        // Defaults to 0.
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long)]
        sort: Option<String>,
        // Defaults to "asc".
        #[arg(long, default_value = "asc")]
        direction: String,
        #[arg(long)]
        trash: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "find-duplicates")]
    FindDuplicates {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Recent {
        // Defaults to 20.
        #[arg(long, default_value_t = 20)]
        limit: u64,
        // Defaults to 0.
        #[arg(long, default_value_t = 0)]
        offset: u64,
        // Exposed as `--type`; defaults to "added".
        #[arg(long = "type", default_value = "added")]
        recent_type: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Fulltext {
        // Positional item key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Related {
        // Positional item key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "citation-key")]
    CitationKey {
        // Positional item key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
748
// Subcommands of `zotron settings` (wired in through `Command::Settings`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum SettingsCommand {
    Get {
        // Positional setting key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Also reachable through the visible alias `get-all`.
    #[command(visible_alias = "get-all")]
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Set {
        // Positional list of settings to apply.
        // NOTE(review): presumably `key=value` pairs; the format is not
        // visible here.
        pairs: Vec<String>,
        // Optional `--file` input.
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
776
// Subcommands of `zotron tags` (wired in through `Command::Tags`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum TagsCommand {
    List {
        // Defaults to 200.
        #[arg(long, default_value_t = 200)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Rename {
        // Two positionals: the existing tag and its replacement.
        old: String,
        new: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        // Positional tag.
        tag: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Add {
        // Positional list of item keys.
        keys: Vec<String>,
        // Repeatable `--tag`; at least one is required.
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Remove {
        // Positional list of item keys.
        keys: Vec<String>,
        // Repeatable `--tag`; at least one is required.
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
824
// Arguments of `zotron export` (wired in through `Command::Export`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, clap::Args)]
struct ExportArgs {
    // Positional list of item keys.
    keys: Vec<String>,
    // Defaults to "bibtex".
    #[arg(long, default_value = "bibtex")]
    format: String,
    #[arg(long)]
    collection: Option<String>,
    // Defaults to the APA style URL.
    #[arg(long, default_value = "http://www.zotero.org/styles/apa")]
    style: String,
    #[arg(long)]
    html: bool,
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
}
844
// Subcommands of `zotron annotations` (wired in through
// `Command::Annotations`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum AnnotationsCommand {
    List {
        // Positional parent key.
        parent: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Create {
        // Positional parent key.
        parent: String,
        // Exposed as `--type`.
        #[arg(long = "type")]
        annotation_type: Option<String>,
        #[arg(long)]
        position: Option<String>,
        #[arg(long)]
        quote: Option<String>,
        #[arg(long)]
        page: Option<u32>,
        #[arg(long = "sort-index")]
        sort_index: Option<String>,
        #[arg(long)]
        text: Option<String>,
        #[arg(long)]
        comment: Option<String>,
        // Defaults to "#ffd400".
        #[arg(long, default_value = "#ffd400")]
        color: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        // Positional annotation key.
        annotation_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
892
// Subcommands of `zotron attachments` (wired in through
// `Command::Attachments`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum AttachmentsCommand {
    List {
        // Required `--parent`.
        #[arg(long)]
        parent: String,
        // Defaults to 50.
        #[arg(long, default_value_t = 50)]
        limit: u64,
        // Defaults to 0.
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        // Positional attachment key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Fulltext {
        // Positional attachment key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Path {
        // Positional attachment key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Add {
        // Required `--parent`.
        #[arg(long)]
        parent: String,
        // Optional local path source.
        #[arg(long)]
        path: Option<String>,
        // Optional URL source.
        #[arg(long = "from-url")]
        from_url: Option<String>,
        #[arg(long)]
        title: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        // Positional attachment key.
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    #[command(name = "find-pdf")]
    FindPdf {
        // Required `--parent`.
        #[arg(long)]
        parent: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
958
// Subcommands of `zotron notes` (wired in through `Command::Notes`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum NotesCommand {
    List {
        // Required `--parent`.
        #[arg(long)]
        parent: String,
        // Defaults to 50.
        #[arg(long, default_value_t = 50)]
        limit: u64,
        // Defaults to 0.
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        // Positional note key.
        note_key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Create {
        // Required `--parent`.
        #[arg(long)]
        parent: String,
        // Required `--content`.
        #[arg(long)]
        content: String,
        // Repeatable `--tag`.
        #[arg(long = "tag")]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Update {
        // Positional note key.
        note_key: String,
        // Required `--content`.
        #[arg(long)]
        content: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        // Positional note key.
        note_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Search {
        // Positional query text.
        query: String,
        // Defaults to 50.
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
1018
// Subcommands of `zotron collections` (wired in through
// `Command::Collections`).
// Plain `//` comments only: clap's derive would turn `///` into help text.
#[derive(Debug, Subcommand)]
enum CollectionsCommand {
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Tree {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        // Positional collection name or id.
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Named `get-items`, with the visible alias `items`.
    #[command(name = "get-items", visible_alias = "items")]
    GetItems {
        // Positional collection name or id.
        name_or_id: String,
        // Optional; no default limit at the clap level.
        #[arg(long)]
        limit: Option<u64>,
        // Defaults to 0.
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Stats {
        // Positional collection name or id.
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Rename {
        // Two positionals: current name, then new name.
        old_name: String,
        new_name: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    Create {
        // Positional collection name.
        name: String,
        // Optional parent collection.
        #[arg(long)]
        parent: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    Delete {
        // Positional collection name or id.
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    #[command(name = "add-items")]
    AddItems {
        // Positional collection, followed by positional item keys.
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    #[command(name = "remove-items")]
    RemoveItems {
        // Positional collection, followed by positional item keys.
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
}
1102
/// Output formatting mode passed to `format_json`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum JsonStyle {
    /// Used for regular OCR command results.
    /// NOTE(review): presumably mimics Python's default `json.dumps` layout;
    /// confirm against `format_json`.
    PythonCompact,
    /// Used for the OCR provider listing.
    /// NOTE(review): presumably indented multi-line JSON; confirm against
    /// `format_json`.
    Pretty,
}
1110
/// Result of a successful `parse_cli` call.
enum ParseOutcome<T> {
    /// Arguments parsed into a command that should be executed.
    Command(T),
    /// clap produced help or version text that should be shown as-is.
    Display(String),
}
1115
1116fn parse_cli<T>(
1117 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1118) -> Result<ParseOutcome<T>, String>
1119where
1120 T: Parser,
1121{
1122 match T::try_parse_from(args) {
1123 Ok(cli) => Ok(ParseOutcome::Command(cli)),
1124 Err(err)
1125 if matches!(
1126 err.kind(),
1127 ErrorKind::DisplayHelp | ErrorKind::DisplayVersion
1128 ) =>
1129 {
1130 Ok(ParseOutcome::Display(err.to_string()))
1131 }
1132 Err(err) => Err(err.to_string()),
1133 }
1134}
1135
1136pub fn format_error_json(message: &str) -> String {
1137 let message = message.trim_end();
1138 let (code, message) = split_error_code(message).unwrap_or(("RUNTIME_ERROR", message));
1139 serde_json::json!({"error": {"code": code, "message": message}}).to_string()
1140}
1141
/// Splits `"CODE: text"` into `("CODE", "text")`.
///
/// The code is everything before the first `:` and must be non-empty and made
/// only of ASCII uppercase letters, ASCII digits and `_`. Leading whitespace
/// of the remainder is trimmed. Returns `None` when there is no `:` or the
/// prefix is not a valid code.
fn split_error_code(message: &str) -> Option<(&str, &str)> {
    let colon = message.find(':')?;
    let code = &message[..colon];
    let rest = &message[colon + 1..];

    let is_code_char = |ch: char| ch == '_' || ch.is_ascii_digit() || ch.is_ascii_uppercase();
    if code.is_empty() || !code.chars().all(is_code_char) {
        return None;
    }
    Some((code, rest.trim_start()))
}
1154
1155pub fn run(
1156 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1157) -> Result<String, String> {
1158 let cli = match parse_cli::<Cli>(args)? {
1159 ParseOutcome::Command(cli) => cli,
1160 ParseOutcome::Display(output) => return Ok(output),
1161 };
1162 let url = command_url(&cli.command);
1163 let mut client = ZoteroRpc::new(url);
1164 run_command(cli.command, &mut client)
1165}
1166
1167pub fn run_with_client(
1168 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1169 client: &mut impl RpcCaller,
1170) -> Result<String, String> {
1171 let cli = match parse_cli::<Cli>(args)? {
1172 ParseOutcome::Command(cli) => cli,
1173 ParseOutcome::Display(output) => return Ok(output),
1174 };
1175 run_command(cli.command, client)
1176}
1177
1178fn rag_command_url(command: &RagCommand) -> String {
1179 match command {
1180 RagCommand::Providers => DEFAULT_RPC_URL.to_string(),
1181 RagCommand::Embed { .. } => DEFAULT_RPC_URL.to_string(),
1182 RagCommand::Status { url, .. } => url.clone(),
1183 RagCommand::Search { url, .. } => url.clone(),
1184 }
1185}
1186
1187fn command_url(command: &Command) -> String {
1188 match command {
1189 Command::Ping { url }
1190 | Command::Rpc { url, .. }
1191 | Command::Push { url, .. }
1192 | Command::FindPdfs { url, .. } => url.clone(),
1193 Command::Ocr { command } => match command {
1194 OcrCommand::Providers => DEFAULT_RPC_URL.to_string(),
1195 OcrCommand::Run { .. } => DEFAULT_RPC_URL.to_string(),
1196 OcrCommand::Status { url, .. } => url.clone(),
1197 OcrCommand::Process { url, .. } => url.clone(),
1198 },
1199 Command::Rag { command } => rag_command_url(command),
1200 Command::System { command } => match command {
1201 SystemCommand::Version { url }
1202 | SystemCommand::Libraries { url }
1203 | SystemCommand::LibraryStats { url, .. }
1204 | SystemCommand::Schema { url, .. }
1205 | SystemCommand::CurrentCollection { url }
1206 | SystemCommand::ListMethods { url }
1207 | SystemCommand::Describe { url, .. } => url.clone(),
1208 },
1209 Command::Search(ref args) => match &args.management {
1210 Some(SearchManagementCommand::SavedSearches { url })
1211 | Some(SearchManagementCommand::CreateSaved { url, .. })
1212 | Some(SearchManagementCommand::DeleteSaved { url, .. }) => url.clone(),
1213 None => args.url.clone(),
1214 },
1215 Command::Items { command } => match command {
1216 ItemsCommand::Add { url, .. }
1217 | ItemsCommand::Create { url, .. }
1218 | ItemsCommand::Update { url, .. }
1219 | ItemsCommand::Delete { url, .. }
1220 | ItemsCommand::Trash { url, .. }
1221 | ItemsCommand::Restore { url, .. }
1222 | ItemsCommand::MergeDuplicates { url, .. }
1223 | ItemsCommand::AddRelated { url, .. }
1224 | ItemsCommand::RemoveRelated { url, .. }
1225 | ItemsCommand::Get { url, .. }
1226 | ItemsCommand::List { url, .. }
1227 | ItemsCommand::FindDuplicates { url }
1228 | ItemsCommand::Recent { url, .. }
1229 | ItemsCommand::Fulltext { url, .. }
1230 | ItemsCommand::Related { url, .. }
1231 | ItemsCommand::CitationKey { url, .. } => url.clone(),
1232 },
1233 Command::Collections { command } => match command {
1234 CollectionsCommand::List { url }
1235 | CollectionsCommand::Tree { url }
1236 | CollectionsCommand::Get { url, .. }
1237 | CollectionsCommand::GetItems { url, .. }
1238 | CollectionsCommand::Stats { url, .. }
1239 | CollectionsCommand::Rename { url, .. }
1240 | CollectionsCommand::Create { url, .. }
1241 | CollectionsCommand::Delete { url, .. }
1242 | CollectionsCommand::AddItems { url, .. }
1243 | CollectionsCommand::RemoveItems { url, .. } => url.clone(),
1244 },
1245 Command::Notes { command } => match command {
1246 NotesCommand::List { url, .. }
1247 | NotesCommand::Get { url, .. }
1248 | NotesCommand::Create { url, .. }
1249 | NotesCommand::Update { url, .. }
1250 | NotesCommand::Delete { url, .. }
1251 | NotesCommand::Search { url, .. } => url.clone(),
1252 },
1253 Command::Attachments { command } => match command {
1254 AttachmentsCommand::List { url, .. }
1255 | AttachmentsCommand::Get { url, .. }
1256 | AttachmentsCommand::Fulltext { url, .. }
1257 | AttachmentsCommand::Path { url, .. }
1258 | AttachmentsCommand::Add { url, .. }
1259 | AttachmentsCommand::Delete { url, .. }
1260 | AttachmentsCommand::FindPdf { url, .. } => url.clone(),
1261 },
1262 Command::Settings { command } => match command {
1263 SettingsCommand::Get { url, .. }
1264 | SettingsCommand::List { url }
1265 | SettingsCommand::Set { url, .. } => url.clone(),
1266 },
1267 Command::Tags { command } => match command {
1268 TagsCommand::List { url, .. }
1269 | TagsCommand::Rename { url, .. }
1270 | TagsCommand::Delete { url, .. }
1271 | TagsCommand::Add { url, .. }
1272 | TagsCommand::Remove { url, .. } => url.clone(),
1273 },
1274 Command::Export(ref args) => args.url.clone(),
1275 Command::Annotations { command } => match command {
1276 AnnotationsCommand::List { url, .. }
1277 | AnnotationsCommand::Create { url, .. }
1278 | AnnotationsCommand::Delete { url, .. } => url.clone(),
1279 },
1280 }
1281}
1282
1283fn run_ocr_command(command: OcrCommand, client: &mut impl RpcCaller) -> Result<String, String> {
1284 if let OcrCommand::Providers = &command {
1285 return format_json(
1286 &serde_json::json!({ "providers": ocr_provider_specs() }),
1287 JsonStyle::Pretty,
1288 );
1289 }
1290 let value = match command {
1291 OcrCommand::Providers => unreachable!(),
1292 OcrCommand::Run {
1293 provider,
1294 input,
1295 file,
1296 item_key,
1297 attachment_key,
1298 mime_type,
1299 endpoint,
1300 api_key_env,
1301 } => run_ocr_run_command(OcrRunOptions {
1302 provider,
1303 input,
1304 file,
1305 item_key,
1306 attachment_key,
1307 mime_type,
1308 endpoint,
1309 api_key_env,
1310 })?,
1311 OcrCommand::Status { collection, .. } => run_ocr_status_command(client, collection)?,
1312 OcrCommand::Process {
1313 provider,
1314 parent,
1315 attachment,
1316 source_url,
1317 result_dir,
1318 result_zip,
1319 provider_endpoint,
1320 api_key_env,
1321 poll_interval_seconds,
1322 timeout_seconds,
1323 chunk_chars,
1324 ..
1325 } => run_ocr_process_command(
1326 client,
1327 OcrProcessOptions {
1328 provider,
1329 parent,
1330 attachment,
1331 source_url,
1332 result_dir,
1333 result_zip,
1334 provider_endpoint,
1335 api_key_env,
1336 poll_interval_seconds,
1337 timeout_seconds,
1338 chunk_chars,
1339 },
1340 )?,
1341 };
1342 format_json(&value, JsonStyle::PythonCompact)
1343}
1344
/// Options for `run_ocr_process_command`; mirrors the arguments of
/// `OcrCommand::Process` except for `url`.
struct OcrProcessOptions {
    /// OCR provider name (`--provider`, CLI default "mineru").
    provider: String,
    /// Parent item (`--parent`).
    parent: String,
    /// Optional attachment selector (`--attachment`).
    attachment: Option<String>,
    /// Value of `--source-url`.
    source_url: Option<String>,
    /// Value of `--result-dir`.
    result_dir: Option<String>,
    /// Value of `--result-zip`.
    result_zip: Option<String>,
    /// Value of `--provider-endpoint`.
    provider_endpoint: Option<String>,
    /// Value of `--api-key-env` (CLI default "ZOTRON_MINERU_API_KEY").
    api_key_env: String,
    /// Value of `--poll-interval-seconds` (CLI default 5).
    poll_interval_seconds: u64,
    /// Value of `--timeout-seconds` (CLI default 900).
    timeout_seconds: u64,
    /// Value of `--chunk-chars` (CLI default 1200).
    chunk_chars: usize,
}
1358
/// Options for `run_ocr_run_command`; mirrors the arguments of
/// `OcrCommand::Run`.
struct OcrRunOptions {
    /// OCR provider name (`--provider`).
    provider: String,
    /// JSON input source (`--input`); mutually exclusive with `file`.
    input: Option<String>,
    /// File to build the input from (`--file`); mutually exclusive with
    /// `input`.
    file: Option<String>,
    /// Passed to `ocr_input_from_file` when `file` is used.
    item_key: Option<String>,
    /// Passed to `ocr_input_from_file` when `file` is used.
    attachment_key: Option<String>,
    /// Passed to `ocr_input_from_file` when `file` is used.
    mime_type: Option<String>,
    /// Overrides the URL of the provider request when set.
    endpoint: Option<String>,
    /// Passed to the HTTP transport's auth setup (`--api-key-env`).
    api_key_env: Option<String>,
}
1369
1370fn run_ocr_run_command(options: OcrRunOptions) -> Result<Value, String> {
1371 let input: OcrRequestInput = match (options.input, options.file) {
1372 (Some(input), None) => read_json_input(&input)?,
1373 (None, Some(file)) => ocr_input_from_file(
1374 file,
1375 options.item_key,
1376 options.attachment_key,
1377 options.mime_type,
1378 )?,
1379 (Some(_), Some(_)) => {
1380 return Err("INVALID_ARGS: use either --input or --file, not both".to_string())
1381 }
1382 (None, None) => return Err("INVALID_ARGS: provide --input JSON or --file".to_string()),
1383 };
1384 let request = build_ocr_provider_request(&options.provider, &input)?;
1385 let payload = if request.command.is_empty() {
1386 let method = request
1387 .method
1388 .ok_or_else(|| format!("OCR provider {} missing HTTP method", request.provider))?;
1389 let auth_scheme = raw_ocr_provider_spec(&options.provider)?.auth;
1390 let mut transport =
1391 provider_http_transport_with_auth(options.api_key_env.as_deref(), auth_scheme)?;
1392 transport.post_json(&ProviderHttpInvocation {
1393 provider: request.provider.to_string(),
1394 style: request.style.to_string(),
1395 method: method.to_string(),
1396 url: options
1397 .endpoint
1398 .or_else(|| request.url.map(ToString::to_string)),
1399 auth_header_name: request.auth_header.map(ToString::to_string),
1400 auth_header_value: None,
1401 body: request.body,
1402 })?
1403 } else {
1404 let mut command_runner = StdProviderCommandRunner;
1405 command_runner.run_json(&request.command)?
1406 };
1407 let blocks = match parse_ocr_provider_response(
1408 request.provider,
1409 &payload,
1410 &input.item_key,
1411 &input.attachment_key,
1412 ) {
1413 Ok(blocks) => blocks,
1414 Err(err) => {
1415 if let Some(task) = ocr_async_task_result(request.provider, &payload) {
1416 return Ok(task);
1417 }
1418 return Err(err);
1419 }
1420 };
1421
1422 Ok(serde_json::json!({
1423 "provider": request.provider,
1424 "blocks": blocks,
1425 }))
1426}
1427
1428fn run_ocr_process_command(
1429 client: &mut impl RpcCaller,
1430 mut options: OcrProcessOptions,
1431) -> Result<Value, String> {
1432 let spec = raw_ocr_provider_spec(&options.provider)?;
1433
1434 let attachment = match options.attachment.take() {
1435 Some(key) => key,
1436 None => resolve_first_pdf_attachment_key(client, &options.parent)?,
1437 };
1438 options.attachment = Some(attachment.clone());
1439
1440 let attachment_path = resolve_attachment_path(client, &attachment)?;
1441 let storage_dir = attachment_path
1442 .parent()
1443 .ok_or_else(|| {
1444 format!(
1445 "ATTACHMENT_PATH_INVALID: attachment path has no parent directory: {}",
1446 attachment_path.display()
1447 )
1448 })?
1449 .to_path_buf();
1450
1451 match spec.provider_key {
1452 "mineru" | "mineru-cli" => {
1453 if options.result_dir.is_some() && options.result_zip.is_some() {
1454 return Err("INVALID_ARGS: use either --result-dir or --result-zip, not both".to_string());
1455 }
1456 if options.source_url.is_some()
1457 && (options.result_dir.is_some() || options.result_zip.is_some())
1458 {
1459 return Err(
1460 "INVALID_ARGS: --source-url cannot be combined with --result-dir/--result-zip"
1461 .to_string(),
1462 );
1463 }
1464 let file_name = attachment_path
1465 .file_name()
1466 .and_then(|name| name.to_str())
1467 .unwrap_or("document.pdf")
1468 .to_string();
1469 let source = load_mineru_result_source(&options, &attachment_path, &file_name)?;
1470 let artifacts = persist_mineru_result_sidecars(
1471 &storage_dir, &options.parent, &attachment,
1472 &options.provider, &source, options.chunk_chars,
1473 )?;
1474 Ok(serde_json::json!({
1475 "provider": spec.provider_key,
1476 "status": "indexed",
1477 "item_key": options.parent,
1478 "attachment_key": attachment,
1479 "attachment_path": attachment_path,
1480 "storage_dir": storage_dir,
1481 "task_id": source.task_id,
1482 "state": source.state,
1483 "blocks": artifacts.block_count,
1484 "chunks": artifacts.chunk_count,
1485 "artifacts": artifacts.artifacts,
1486 }))
1487 }
1488 _ => {
1489 run_ocr_process_sync(
1490 client, &options, spec.provider_key,
1491 &attachment, &attachment_path, &storage_dir,
1492 )
1493 }
1494 }
1495}
1496
1497fn run_ocr_process_sync(
1498 client: &mut impl RpcCaller,
1499 options: &OcrProcessOptions,
1500 provider: &str,
1501 attachment_key: &str,
1502 attachment_path: &Path,
1503 storage_dir: &Path,
1504) -> Result<Value, String> {
1505 let api_url = if let Some(endpoint) = &options.provider_endpoint {
1506 endpoint.clone()
1507 } else {
1508 let settings = client.call("settings.getAll", None)?;
1509 settings.get("ocr.apiUrl")
1510 .and_then(Value::as_str)
1511 .unwrap_or("")
1512 .to_string()
1513 };
1514 if api_url.is_empty() {
1515 return Err(format!("MISSING_CONFIG: ocr.apiUrl not configured for provider {provider}"));
1516 }
1517
1518 let api_key = {
1519 let from_env = if !options.api_key_env.is_empty() {
1520 env::var(&options.api_key_env).ok().filter(|v| !v.is_empty())
1521 } else {
1522 None
1523 };
1524 from_env.unwrap_or_else(|| {
1525 client.call("settings.getRaw", Some(serde_json::json!({"key": "ocr.apiKey"})))
1526 .ok()
1527 .and_then(|raw| raw.get("ocr.apiKey").and_then(Value::as_str).map(String::from))
1528 .unwrap_or_default()
1529 })
1530 };
1531
1532 let pdf_bytes = fs::read(attachment_path)
1533 .map_err(|e| format!("READ_PDF_FAILED: {}: {e}", attachment_path.display()))?;
1534
1535 const MAX_PDF_SIZE: usize = 100 * 1024 * 1024; if pdf_bytes.len() > MAX_PDF_SIZE {
1537 return Err(format!(
1538 "PDF_TOO_LARGE: {} is {} MB, max {} MB",
1539 attachment_path.display(),
1540 pdf_bytes.len() / (1024 * 1024),
1541 MAX_PDF_SIZE / (1024 * 1024),
1542 ));
1543 }
1544
1545 let base64_pdf = format!("data:application/pdf;base64,{}", base64_encode(&pdf_bytes));
1546
1547 let input = OcrRequestInput {
1548 content_base64: base64_pdf,
1549 file_name: attachment_path
1550 .file_name()
1551 .and_then(|n| n.to_str())
1552 .unwrap_or("document.pdf")
1553 .to_string(),
1554 mime_type: "application/pdf".to_string(),
1555 item_key: options.parent.clone(),
1556 attachment_key: attachment_key.to_string(),
1557 source_url: None,
1558 local_path: Some(attachment_path.to_string_lossy().to_string()),
1559 output_dir: None,
1560 };
1561 let request = build_ocr_provider_request(provider, &input)?;
1562
1563 let payload = if request.command.is_empty() {
1564 let method = request
1565 .method
1566 .ok_or_else(|| format!("OCR provider {provider} missing HTTP method"))?;
1567 let spec = raw_ocr_provider_spec(provider)?;
1568
1569 let mut transport = if !api_key.is_empty() {
1570 match spec.auth {
1571 "bearer" => UreqProviderHttpTransport::with_bearer_token(&api_key),
1572 "token" => UreqProviderHttpTransport::with_api_key(format!("token {api_key}")),
1573 _ => UreqProviderHttpTransport::new(),
1574 }
1575 } else {
1576 UreqProviderHttpTransport::new()
1577 };
1578
1579 transport.post_json(&ProviderHttpInvocation {
1580 provider: request.provider.to_string(),
1581 style: request.style.to_string(),
1582 method: method.to_string(),
1583 url: Some(api_url),
1584 auth_header_name: request.auth_header.map(ToString::to_string),
1585 auth_header_value: None,
1586 body: request.body,
1587 })?
1588 } else {
1589 let mut runner = StdProviderCommandRunner;
1590 runner.run_json(&request.command)?
1591 };
1592
1593 let blocks = parse_ocr_provider_response(provider, &payload, &options.parent, attachment_key)?;
1594 let chunks = zotron_types::chunks_from_blocks(&blocks, options.chunk_chars);
1595
1596 let artifacts = vec![
1597 write_sidecar_json(
1598 storage_dir, &options.parent, attachment_key,
1599 MachineArtifactKind::OcrRaw, &payload,
1600 )?,
1601 write_sidecar_jsonl(
1602 storage_dir, &options.parent, attachment_key,
1603 MachineArtifactKind::Blocks, &blocks,
1604 )?,
1605 write_sidecar_jsonl(
1606 storage_dir, &options.parent, attachment_key,
1607 MachineArtifactKind::Chunks, &chunks,
1608 )?,
1609 ];
1610
1611 let embedding_count = embed_sidecar_chunks(client, storage_dir, &options.parent, attachment_key, &chunks);
1612
1613 Ok(serde_json::json!({
1614 "provider": provider,
1615 "status": "indexed",
1616 "item_key": options.parent,
1617 "attachment_key": attachment_key,
1618 "embeddings": embedding_count,
1619 "attachment_path": attachment_path,
1620 "storage_dir": storage_dir,
1621 "blocks": blocks.len(),
1622 "chunks": chunks.len(),
1623 "artifacts": artifacts,
1624 }))
1625}
1626
1627fn embed_sidecar_chunks(
1628 client: &mut impl RpcCaller,
1629 storage_dir: &Path,
1630 item_key: &str,
1631 _attachment_key: &str,
1632 chunks: &[zotron_types::StructureChunk],
1633) -> usize {
1634 let Ok((provider, model, api_url, api_key)) = fetch_embedding_settings(client) else {
1635 return 0;
1636 };
1637 if provider.is_empty() || (api_key.is_empty() && provider != "ollama") {
1638 return 0;
1639 }
1640 let emb_chunks: Vec<EmbeddingChunkInput> = chunks
1641 .iter()
1642 .map(|c| EmbeddingChunkInput {
1643 chunk_key: c.chunk_key.clone(),
1644 text: c.text.clone(),
1645 })
1646 .collect();
1647 if emb_chunks.is_empty() {
1648 return 0;
1649 }
1650 let batch_size = 20;
1652 let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
1653 for batch in emb_chunks.chunks(batch_size) {
1654 let input = EmbeddingRequestInput {
1655 item_key: item_key.to_string(),
1656 chunks: batch.to_vec(),
1657 model: if model.is_empty() { None } else { Some(model.clone()) },
1658 url: if api_url.is_empty() { None } else { Some(api_url.clone()) },
1659 input_type: Some("document".to_string()),
1660 };
1661 let Ok(request) = build_embedding_provider_request(&provider, &input) else {
1662 break;
1663 };
1664 let Some(url) = request.url.as_deref() else { break };
1665 let mut http = ureq::post(url).set("Content-Type", "application/json");
1666 if let Some(auth) = request.auth_header {
1667 if !api_key.is_empty() {
1668 http = http.set(auth, &format!("Bearer {api_key}"));
1669 }
1670 }
1671 let Ok(resp) = http.send_json(&request.body) else { break };
1672 let Ok(payload): Result<Value, _> = resp.into_json() else { break };
1673 let Ok(vectors) = parse_embedding_provider_response(&provider, &payload, item_key, batch)
1674 else {
1675 break;
1676 };
1677 all_vectors.extend(vectors);
1678 }
1679 let count = all_vectors.len();
1680 if count > 0 {
1681 let filename = embedding_vector_filename(&provider, &model);
1682 let vectors_dir = storage_dir.join(".zotron").join("embeddings");
1683 let _ = fs::create_dir_all(&vectors_dir);
1684 let vectors_path = vectors_dir.join(&filename);
1685 let mut out = String::new();
1686 for v in &all_vectors {
1687 if let Ok(line) = serde_json::to_string(v) {
1688 out.push_str(&line);
1689 out.push('\n');
1690 }
1691 }
1692 let _ = fs::write(&vectors_path, &out);
1693 }
1694 count
1695}
1696
/// A MinerU result that has been located on disk and loaded for persistence.
struct MineruResultSource {
    /// Identifier of the remote task or batch that produced the result, if any.
    task_id: Option<String>,
    /// Result state; `mineru_result_source_from_dir` always sets this to `"done"`.
    state: String,
    /// Directory containing the MinerU result files.
    result_dir: PathBuf,
    /// Bytes of the result zip when the result was obtained from a zip archive.
    raw_zip_bytes: Option<Vec<u8>>,
    /// Final status JSON returned by polling, when the result came from a remote task.
    task_status: Option<Value>,
    /// JSON payload handed to `parse_ocr_provider_response`.
    payload: Value,
    /// File the payload was loaded from.
    content_list_file: Option<PathBuf>,
    /// Contents of `full.md` from the result directory, when present.
    markdown: Option<String>,
}
1707
/// Summary of the sidecars written by `persist_mineru_result_sidecars`.
struct PersistedOcrArtifacts {
    /// Number of blocks parsed from the provider payload.
    block_count: usize,
    /// Number of chunks derived from those blocks.
    chunk_count: usize,
    /// One JSON descriptor per written artifact, in write order.
    artifacts: Vec<Value>,
}
1713
1714fn resolve_attachment_path(
1715 client: &mut impl RpcCaller,
1716 attachment_key: &str,
1717) -> Result<PathBuf, String> {
1718 let payload = client.call(
1719 "attachments.getPath",
1720 Some(serde_json::json!({"key": attachment_key})),
1721 )?;
1722 let raw_path = payload
1723 .get("path")
1724 .and_then(Value::as_str)
1725 .filter(|path| !path.trim().is_empty())
1726 .ok_or_else(|| {
1727 format!("ATTACHMENT_PATH_NOT_FOUND: attachment {attachment_key} has no local PDF path")
1728 })?;
1729 Ok(PathBuf::from(local_path_from_zotero_path(raw_path)))
1730}
1731
1732fn resolve_first_pdf_attachment_key(
1734 client: &mut impl RpcCaller,
1735 parent_key: &str,
1736) -> Result<String, String> {
1737 let response = client.call(
1738 "attachments.list",
1739 Some(serde_json::json!({"parentKey": parent_key})),
1740 )?;
1741 let attachments = response
1743 .get("items")
1744 .and_then(Value::as_array)
1745 .or_else(|| response.as_array())
1746 .ok_or_else(|| {
1747 format!("NO_PDF_ATTACHMENT: no attachments found for item {parent_key}")
1748 })?;
1749 for attachment in attachments {
1750 if is_pdf_attachment(attachment) {
1751 if let Some(key) = attachment.get("key").and_then(Value::as_str) {
1752 return Ok(key.to_string());
1753 }
1754 }
1755 }
1756 Err(format!(
1757 "NO_PDF_ATTACHMENT: no PDF attachment found for item {parent_key}"
1758 ))
1759}
1760
1761fn load_mineru_result_source(
1762 options: &OcrProcessOptions,
1763 attachment_path: &Path,
1764 file_name: &str,
1765) -> Result<MineruResultSource, String> {
1766 if let Some(result_dir) = options.result_dir.as_deref() {
1767 return mineru_result_source_from_dir(PathBuf::from(result_dir), None, None, None);
1768 }
1769 if let Some(result_zip) = options.result_zip.as_deref() {
1770 let zip_path = PathBuf::from(result_zip);
1771 let zip_bytes = fs::read(&zip_path)
1772 .map_err(|err| format!("read MinerU result zip {}: {err}", zip_path.display()))?;
1773 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1774 return mineru_result_source_from_dir(result_dir, Some(zip_bytes), None, None);
1775 }
1776
1777 let Some(source_url) = options
1778 .source_url
1779 .as_deref()
1780 .filter(|value| !value.trim().is_empty())
1781 else {
1782 return submit_mineru_local_file(options, attachment_path, file_name);
1783 };
1784 let input = OcrRequestInput {
1785 item_key: options.parent.clone(),
1786 attachment_key: options.attachment.clone().expect("attachment resolved"),
1787 file_name: file_name.to_string(),
1788 mime_type: "application/pdf".to_string(),
1789 content_base64: format!("url:{source_url}"),
1790 source_url: Some(source_url.to_string()),
1791 local_path: None,
1792 output_dir: None,
1793 };
1794 let task = submit_mineru_task(
1795 &options.provider,
1796 &input,
1797 options.provider_endpoint.clone(),
1798 &options.api_key_env,
1799 )?;
1800 let task_id = task
1801 .get("data")
1802 .and_then(|data| data.get("task_id"))
1803 .and_then(Value::as_str)
1804 .ok_or_else(|| "MinerU submit response missing data.task_id".to_string())?
1805 .to_string();
1806 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1807 let status = poll_mineru_task(
1808 options.provider_endpoint.as_deref(),
1809 &task_id,
1810 &auth_header,
1811 options.poll_interval_seconds,
1812 options.timeout_seconds,
1813 )?;
1814 let zip_url = status
1815 .pointer("/data/full_zip_url")
1816 .or_else(|| status.pointer("/data/result/full_zip_url"))
1817 .and_then(Value::as_str)
1818 .ok_or_else(|| "MinerU completed task missing data.full_zip_url".to_string())?;
1819 let zip_bytes = download_bytes(zip_url)?;
1820 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1821 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(task_id))
1822}
1823
1824fn submit_mineru_local_file(
1825 options: &OcrProcessOptions,
1826 attachment_path: &Path,
1827 file_name: &str,
1828) -> Result<MineruResultSource, String> {
1829 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1830 let upload_request = create_mineru_file_upload(
1831 options.provider_endpoint.as_deref(),
1832 file_name,
1833 options.attachment.as_deref().expect("attachment resolved"),
1834 &auth_header,
1835 )?;
1836 let upload_url = upload_request
1837 .pointer("/data/file_urls/0")
1838 .or_else(|| upload_request.pointer("/data/fileUrls/0"))
1839 .and_then(Value::as_str)
1840 .ok_or_else(|| "MinerU upload URL response missing data.file_urls[0]".to_string())?;
1841 let batch_id = upload_request
1842 .pointer("/data/batch_id")
1843 .or_else(|| upload_request.pointer("/data/batchId"))
1844 .and_then(Value::as_str)
1845 .ok_or_else(|| "MinerU upload URL response missing data.batch_id".to_string())?
1846 .to_string();
1847 let bytes = fs::read(attachment_path)
1848 .map_err(|err| format!("read attachment PDF {}: {err}", attachment_path.display()))?;
1849 put_bytes(upload_url, &bytes)?;
1850 let status = poll_mineru_batch(
1851 options.provider_endpoint.as_deref(),
1852 &batch_id,
1853 &auth_header,
1854 options.poll_interval_seconds,
1855 options.timeout_seconds,
1856 )?;
1857 let zip_url = mineru_batch_zip_url(&status)
1858 .ok_or_else(|| "MinerU completed batch missing full_zip_url".to_string())?;
1859 let zip_bytes = download_bytes(&zip_url)?;
1860 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1861 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(batch_id))
1862}
1863
1864fn create_mineru_file_upload(
1865 endpoint: Option<&str>,
1866 file_name: &str,
1867 data_id: &str,
1868 auth_header: &str,
1869) -> Result<Value, String> {
1870 let url = mineru_file_urls_url(endpoint);
1871 let body = serde_json::json!({
1872 "files": [{"name": file_name, "data_id": data_id}],
1873 "model_version": "vlm",
1874 "is_ocr": false,
1875 "enable_formula": true,
1876 "enable_table": true,
1877 "language": "ch",
1878 "page_ranges": "1-200",
1879 });
1880 ureq::post(&url)
1881 .set("Authorization", auth_header)
1882 .send_json(body)
1883 .map_err(|err| format!("POST {url} failed: {err}"))?
1884 .into_json::<Value>()
1885 .map_err(|err| format!("POST {url} returned invalid JSON: {err}"))
1886}
1887
1888fn put_bytes(url: &str, bytes: &[u8]) -> Result<(), String> {
1889 ureq::put(url)
1890 .send_bytes(bytes)
1891 .map_err(|err| format!("PUT {url} failed: {err}"))?;
1892 Ok(())
1893}
1894
1895fn submit_mineru_task(
1896 provider: &str,
1897 input: &OcrRequestInput,
1898 endpoint: Option<String>,
1899 api_key_env: &str,
1900) -> Result<Value, String> {
1901 let request = build_ocr_provider_request(provider, input)?;
1902 let method = request
1903 .method
1904 .ok_or_else(|| "MinerU provider missing HTTP method".to_string())?;
1905 let mut transport = provider_http_transport_with_auth(Some(api_key_env), "bearer")?;
1906 transport.post_json(&ProviderHttpInvocation {
1907 provider: request.provider.to_string(),
1908 style: request.style.to_string(),
1909 method: method.to_string(),
1910 url: endpoint.or_else(|| request.url.map(ToString::to_string)),
1911 auth_header_name: request.auth_header.map(ToString::to_string),
1912 auth_header_value: None,
1913 body: request.body,
1914 })
1915}
1916
1917fn poll_mineru_task(
1918 endpoint: Option<&str>,
1919 task_id: &str,
1920 auth_header: &str,
1921 poll_interval_seconds: u64,
1922 timeout_seconds: u64,
1923) -> Result<Value, String> {
1924 let url = mineru_task_status_url(endpoint, task_id);
1925 let started = Instant::now();
1926 loop {
1927 let status = get_json_with_auth(&url, auth_header)?;
1928 let state = status
1929 .pointer("/data/state")
1930 .or_else(|| status.pointer("/data/status"))
1931 .and_then(Value::as_str)
1932 .unwrap_or("unknown");
1933 match state {
1934 "done" | "finished" | "success" => return Ok(status),
1935 "failed" | "error" => return Err(format!("MinerU task {task_id} failed: {status}")),
1936 _ => {
1937 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1938 return Err(format!(
1939 "MinerU task {task_id} timed out after {timeout_seconds}s with state {state}"
1940 ));
1941 }
1942 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
1943 }
1944 }
1945 }
1946}
1947
/// Builds the status URL of a MinerU extraction task.
///
/// `endpoint` defaults to `https://mineru.net/api/v4/extract/task`. Trailing
/// slashes are ignored, and the endpoint may be given as the API root, as
/// `<root>/extract`, or as `<root>/extract/task`; all three resolve to
/// `<root>/extract/task/<task_id>`. Handling the `/extract` form avoids
/// producing a doubled `/extract/extract/task` path.
fn mineru_task_status_url(endpoint: Option<&str>, task_id: &str) -> String {
    let base = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    // Reduce the endpoint to the API root before appending the task path.
    let root = base
        .strip_suffix("/extract/task")
        .or_else(|| base.strip_suffix("/extract"))
        .unwrap_or(base);
    format!("{root}/extract/task/{task_id}")
}
1958
1959fn mineru_file_urls_url(endpoint: Option<&str>) -> String {
1960 let base = mineru_api_base(endpoint);
1961 format!("{base}/file-urls/batch")
1962}
1963
1964fn mineru_batch_status_url(endpoint: Option<&str>, batch_id: &str) -> String {
1965 let base = mineru_api_base(endpoint);
1966 format!("{base}/extract-results/batch/{batch_id}")
1967}
1968
/// Derives the MinerU API base URL from an optional endpoint.
///
/// Uses `https://mineru.net/api/v4/extract/task` when no endpoint is given,
/// drops trailing slashes, and removes a single trailing `/extract/task` or,
/// failing that, a trailing `/extract`.
fn mineru_api_base(endpoint: Option<&str>) -> String {
    let trimmed = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    // Longest suffix first so "/extract/task" is not reduced to "/extract" only.
    ["/extract/task", "/extract"]
        .iter()
        .find_map(|suffix| trimmed.strip_suffix(*suffix))
        .unwrap_or(trimmed)
        .to_string()
}
1981
1982fn poll_mineru_batch(
1983 endpoint: Option<&str>,
1984 batch_id: &str,
1985 auth_header: &str,
1986 poll_interval_seconds: u64,
1987 timeout_seconds: u64,
1988) -> Result<Value, String> {
1989 let url = mineru_batch_status_url(endpoint, batch_id);
1990 let started = Instant::now();
1991 loop {
1992 let status = get_json_with_auth(&url, auth_header)?;
1993 let state = mineru_batch_state(&status).unwrap_or("unknown");
1994 match state {
1995 "done" | "finished" | "success" => return Ok(status),
1996 "failed" | "error" => return Err(format!("MinerU batch {batch_id} failed: {status}")),
1997 _ => {
1998 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1999 return Err(format!(
2000 "MinerU batch {batch_id} timed out after {timeout_seconds}s with state {state}"
2001 ));
2002 }
2003 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
2004 }
2005 }
2006 }
2007}
2008
2009fn mineru_batch_state(status: &Value) -> Option<&str> {
2010 status
2011 .pointer("/data/extract_result/0/state")
2012 .or_else(|| status.pointer("/data/extractResult/0/state"))
2013 .or_else(|| status.pointer("/data/state"))
2014 .and_then(Value::as_str)
2015}
2016
2017fn mineru_batch_zip_url(status: &Value) -> Option<String> {
2018 status
2019 .pointer("/data/extract_result/0/full_zip_url")
2020 .or_else(|| status.pointer("/data/extractResult/0/full_zip_url"))
2021 .or_else(|| status.pointer("/data/full_zip_url"))
2022 .and_then(Value::as_str)
2023 .map(ToString::to_string)
2024}
2025
/// Builds an authorization header value from a credential in the environment.
///
/// The value of `api_key_env` is trimmed. For the `bearer` scheme it is
/// prefixed with `Bearer ` and for the `token` scheme with `token `, unless it
/// already starts with that prefix; any other scheme returns the trimmed
/// credential unchanged.
///
/// # Errors
/// Returns a message when the variable cannot be read or is blank.
fn provider_auth_header_value(api_key_env: &str, auth_scheme: &str) -> Result<String, String> {
    let raw = match env::var(api_key_env) {
        Ok(value) => value,
        Err(_) => {
            return Err(format!(
                "missing provider credential env var {api_key_env}"
            ))
        }
    };
    let credential = raw.trim();
    if credential.is_empty() {
        return Err(format!(
            "provider credential env var {api_key_env} is empty"
        ));
    }

    let prefix = match auth_scheme {
        "bearer" => "Bearer ",
        "token" => "token ",
        _ => return Ok(credential.to_string()),
    };
    if credential.starts_with(prefix) {
        Ok(credential.to_string())
    } else {
        Ok(format!("{prefix}{credential}"))
    }
}
2043
2044fn get_json_with_auth(url: &str, auth_header: &str) -> Result<Value, String> {
2045 ureq::get(url)
2046 .set("Authorization", auth_header)
2047 .call()
2048 .map_err(|err| format!("GET {url} failed: {err}"))?
2049 .into_json::<Value>()
2050 .map_err(|err| format!("GET {url} returned invalid JSON: {err}"))
2051}
2052
2053fn download_bytes(url: &str) -> Result<Vec<u8>, String> {
2054 let response = ureq::get(url)
2055 .call()
2056 .map_err(|err| format!("download {url} failed: {err}"))?;
2057 let mut bytes = Vec::new();
2058 response
2059 .into_reader()
2060 .read_to_end(&mut bytes)
2061 .map_err(|err| format!("read download {url}: {err}"))?;
2062 Ok(bytes)
2063}
2064
2065fn extract_zip_bytes_to_temp(prefix: &str, zip_bytes: &[u8]) -> Result<PathBuf, String> {
2066 let dir = unique_temp_path(prefix);
2067 fs::create_dir_all(&dir).map_err(|err| format!("create temp dir {}: {err}", dir.display()))?;
2068 let zip_path = dir.with_extension("zip");
2069 fs::write(&zip_path, zip_bytes)
2070 .map_err(|err| format!("write temp zip {}: {err}", zip_path.display()))?;
2071 let output = ProcessCommand::new("unzip")
2072 .arg("-q")
2073 .arg("-o")
2074 .arg(&zip_path)
2075 .arg("-d")
2076 .arg(&dir)
2077 .output()
2078 .map_err(|err| format!("run unzip: {err}"))?;
2079 if !output.status.success() {
2080 return Err(format!(
2081 "unzip {} failed: {}",
2082 zip_path.display(),
2083 String::from_utf8_lossy(&output.stderr).trim()
2084 ));
2085 }
2086 Ok(dir)
2087}
2088
/// Returns a path in the system temp directory that is unique for this process.
///
/// The file name is `<prefix>-<pid>-<nanos>-<sequence>`, where `nanos` is the
/// time since the Unix epoch (or `0` if the clock reads earlier than that) and
/// `sequence` is a process-wide counter. The counter guarantees that two calls
/// in the same process never return the same path, even on clocks with coarse
/// resolution or when the timestamp fallback of `0` is used.
///
/// Nothing is created on disk; the caller decides what to create at the path.
fn unique_temp_path(prefix: &str) -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};

    static SEQUENCE: AtomicU64 = AtomicU64::new(0);

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|duration| duration.as_nanos())
        .unwrap_or(0);
    // Relaxed is sufficient: only the uniqueness of the returned value matters.
    let sequence = SEQUENCE.fetch_add(1, Ordering::Relaxed);
    env::temp_dir().join(format!(
        "{prefix}-{}-{nanos}-{sequence}",
        std::process::id()
    ))
}
2096
2097fn mineru_result_source_from_dir(
2098 result_dir: PathBuf,
2099 raw_zip_bytes: Option<Vec<u8>>,
2100 task_status: Option<Value>,
2101 task_id: Option<String>,
2102) -> Result<MineruResultSource, String> {
2103 let (payload, content_list_file) = mineru_payload_from_result_dir(&result_dir)?;
2104 let markdown = find_first_file_by_name(&result_dir, "full.md")
2105 .map(|path| {
2106 fs::read_to_string(&path)
2107 .map_err(|err| format!("read native markdown {}: {err}", path.display()))
2108 })
2109 .transpose()?;
2110 Ok(MineruResultSource {
2111 task_id,
2112 state: "done".to_string(),
2113 result_dir,
2114 raw_zip_bytes,
2115 task_status,
2116 payload,
2117 content_list_file,
2118 markdown,
2119 })
2120}
2121
2122fn mineru_payload_from_result_dir(result_dir: &Path) -> Result<(Value, Option<PathBuf>), String> {
2123 let v2 = find_first_file_with_suffix(result_dir, "_content_list_v2.json");
2124 if let Some(path) = v2 {
2125 let value = read_json_file(&path)?;
2126 return Ok((serde_json::json!({"content_list_v2": value}), Some(path)));
2127 }
2128 let content_list = find_first_file_with_suffix(result_dir, "_content_list.json");
2129 if let Some(path) = content_list {
2130 let value = read_json_file(&path)?;
2131 return Ok((serde_json::json!({"content_list": value}), Some(path)));
2132 }
2133 let layout = find_first_file_by_name(result_dir, "layout.json");
2134 if let Some(path) = layout {
2135 return Ok((read_json_file(&path)?, Some(path)));
2136 }
2137 let markdown = find_first_file_by_name(result_dir, "full.md");
2138 if let Some(path) = markdown {
2139 let text = fs::read_to_string(&path)
2140 .map_err(|err| format!("read native markdown {}: {err}", path.display()))?;
2141 return Ok((serde_json::json!({"result": text}), Some(path)));
2142 }
2143 Err(format!(
2144 "MinerU result directory {} missing content_list_v2/content_list/layout/full.md",
2145 result_dir.display()
2146 ))
2147}
2148
2149fn read_json_file(path: &Path) -> Result<Value, String> {
2150 let raw = fs::read_to_string(path).map_err(|err| format!("read {}: {err}", path.display()))?;
2151 serde_json::from_str(&raw).map_err(|err| format!("parse JSON {}: {err}", path.display()))
2152}
2153
2154fn persist_mineru_result_sidecars(
2155 storage_dir: &Path,
2156 item_key: &str,
2157 attachment_key: &str,
2158 provider: &str,
2159 source: &MineruResultSource,
2160 chunk_chars: usize,
2161) -> Result<PersistedOcrArtifacts, String> {
2162 let blocks = parse_ocr_provider_response(provider, &source.payload, item_key, attachment_key)?;
2163 let chunks = zotron_types::chunks_from_blocks(&blocks, chunk_chars);
2164 let assets = copy_mineru_assets(&source.result_dir, storage_dir)?;
2165 let raw_bundle = serde_json::json!({
2166 "provider": provider,
2167 "item_key": item_key,
2168 "attachment_key": attachment_key,
2169 "task_id": source.task_id,
2170 "state": source.state,
2171 "task_status": source.task_status,
2172 "content_list_file": source.content_list_file,
2173 "payload": source.payload,
2174 });
2175
2176 let mut artifacts = Vec::new();
2177 artifacts.push(write_sidecar_json(
2178 storage_dir,
2179 item_key,
2180 attachment_key,
2181 MachineArtifactKind::OcrRaw,
2182 &raw_bundle,
2183 )?);
2184 artifacts.push(write_sidecar_jsonl(
2185 storage_dir,
2186 item_key,
2187 attachment_key,
2188 MachineArtifactKind::Blocks,
2189 &blocks,
2190 )?);
2191 artifacts.push(write_sidecar_jsonl(
2192 storage_dir,
2193 item_key,
2194 attachment_key,
2195 MachineArtifactKind::Chunks,
2196 &chunks,
2197 )?);
2198 if let Some(markdown) = source.markdown.as_deref() {
2199 artifacts.push(write_sidecar_bytes(
2200 storage_dir,
2201 item_key,
2202 attachment_key,
2203 MachineArtifactKind::OcrNativeMarkdown,
2204 markdown.as_bytes(),
2205 )?);
2206 }
2207 artifacts.push(write_sidecar_json(
2208 storage_dir,
2209 item_key,
2210 attachment_key,
2211 MachineArtifactKind::OcrNativeAssets,
2212 &assets,
2213 )?);
2214 if let Some(bytes) = source.raw_zip_bytes.as_deref() {
2215 artifacts.push(write_extra_sidecar_bytes(
2216 storage_dir,
2217 ".zotron/ocr/latest.raw.zip",
2218 bytes,
2219 )?);
2220 }
2221
2222 Ok(PersistedOcrArtifacts {
2223 block_count: blocks.len(),
2224 chunk_count: chunks.len(),
2225 artifacts,
2226 })
2227}
2228
2229fn write_sidecar_json(
2230 storage_dir: &Path,
2231 item_key: &str,
2232 attachment_key: &str,
2233 kind: MachineArtifactKind,
2234 value: &Value,
2235) -> Result<Value, String> {
2236 let bytes = serde_json::to_vec_pretty(value).map_err(|err| err.to_string())?;
2237 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, &bytes)
2238}
2239
2240fn write_sidecar_jsonl<T: serde::Serialize>(
2241 storage_dir: &Path,
2242 item_key: &str,
2243 attachment_key: &str,
2244 kind: MachineArtifactKind,
2245 values: &[T],
2246) -> Result<Value, String> {
2247 let mut out = String::new();
2248 for value in values {
2249 out.push_str(&serde_json::to_string(value).map_err(|err| err.to_string())?);
2250 out.push('\n');
2251 }
2252 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, out.as_bytes())
2253}
2254
2255fn write_sidecar_bytes(
2256 storage_dir: &Path,
2257 item_key: &str,
2258 attachment_key: &str,
2259 kind: MachineArtifactKind,
2260 bytes: &[u8],
2261) -> Result<Value, String> {
2262 let record = write_machine_artifact_sidecar(storage_dir, item_key, attachment_key, kind, bytes)
2263 .map_err(|err| format!("write sidecar {:?}: {err}", kind))?;
2264 Ok(serde_json::json!({
2265 "kind": kind,
2266 "relative_path": record.relative_path,
2267 "absolute_path": record.absolute_path,
2268 }))
2269}
2270
2271fn write_extra_sidecar_bytes(
2272 storage_dir: &Path,
2273 relative_path: &str,
2274 bytes: &[u8],
2275) -> Result<Value, String> {
2276 let absolute_path = storage_dir.join(relative_path);
2277 if let Some(parent) = absolute_path.parent() {
2278 fs::create_dir_all(parent).map_err(|err| format!("create {}: {err}", parent.display()))?;
2279 }
2280 fs::write(&absolute_path, bytes)
2281 .map_err(|err| format!("write sidecar {}: {err}", absolute_path.display()))?;
2282 Ok(serde_json::json!({
2283 "kind": "ocr_raw_zip",
2284 "relative_path": relative_path,
2285 "absolute_path": absolute_path,
2286 }))
2287}
2288
2289fn copy_mineru_assets(result_dir: &Path, storage_dir: &Path) -> Result<Value, String> {
2290 let mut images = Vec::new();
2291 for file in collect_files(result_dir)? {
2292 if !is_image_file(&file) {
2293 continue;
2294 }
2295 let relative = file.strip_prefix(result_dir).unwrap_or(&file).to_path_buf();
2296 let destination = storage_dir.join(".zotron").join("ocr").join(&relative);
2297 if let Some(parent) = destination.parent() {
2298 fs::create_dir_all(parent)
2299 .map_err(|err| format!("create {}: {err}", parent.display()))?;
2300 }
2301 fs::copy(&file, &destination).map_err(|err| {
2302 format!(
2303 "copy MinerU asset {} to {}: {err}",
2304 file.display(),
2305 destination.display()
2306 )
2307 })?;
2308 images.push(serde_json::json!({
2309 "source_relative": relative,
2310 "sidecar_relative": PathBuf::from(".zotron").join("ocr").join(&relative),
2311 "absolute_path": destination,
2312 }));
2313 }
2314 Ok(serde_json::json!({
2315 "provider": "mineru",
2316 "images": images,
2317 }))
2318}
2319
/// Reports whether the path's extension names a supported image format.
///
/// The comparison ignores ASCII case; recognized extensions are `png`, `jpg`,
/// `jpeg`, `webp` and `gif`. Paths without a UTF-8 extension are not images.
fn is_image_file(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 5] = ["png", "jpg", "jpeg", "webp", "gif"];

    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            IMAGE_EXTENSIONS
                .iter()
                .any(|known| ext.eq_ignore_ascii_case(known))
        })
}
2330
2331fn find_first_file_with_suffix(root: &Path, suffix: &str) -> Option<PathBuf> {
2332 collect_files(root).ok()?.into_iter().find(|path| {
2333 path.file_name()
2334 .and_then(|name| name.to_str())
2335 .is_some_and(|name| name.ends_with(suffix))
2336 })
2337}
2338
2339fn find_first_file_by_name(root: &Path, name: &str) -> Option<PathBuf> {
2340 collect_files(root).ok()?.into_iter().find(|path| {
2341 path.file_name()
2342 .and_then(|file_name| file_name.to_str())
2343 .is_some_and(|file_name| file_name == name)
2344 })
2345}
2346
2347fn collect_files(root: &Path) -> Result<Vec<PathBuf>, String> {
2348 let mut files = Vec::new();
2349 collect_files_into(root, &mut files)?;
2350 files.sort();
2351 Ok(files)
2352}
2353
/// Appends every regular file found below `root` to `files`, descending into
/// sub-directories recursively.
///
/// Entries that are neither a directory nor a regular file (according to the
/// directory entry's own file type) are ignored.
///
/// # Errors
///
/// Returns a formatted message when a directory cannot be listed, an entry
/// cannot be read, or an entry's file type cannot be determined.
fn collect_files_into(root: &Path, files: &mut Vec<PathBuf>) -> Result<(), String> {
    let listing =
        fs::read_dir(root).map_err(|err| format!("read dir {}: {err}", root.display()))?;
    for dir_entry in listing {
        let dir_entry =
            dir_entry.map_err(|err| format!("read dir entry {}: {err}", root.display()))?;
        let entry_path = dir_entry.path();
        let kind = match dir_entry.file_type() {
            Ok(kind) => kind,
            Err(err) => return Err(format!("stat {}: {err}", entry_path.display())),
        };
        if kind.is_file() {
            files.push(entry_path);
            continue;
        }
        if kind.is_dir() {
            collect_files_into(&entry_path, files)?;
        }
    }
    Ok(())
}
2369
2370fn ocr_async_task_result(provider: &str, payload: &Value) -> Option<Value> {
2371 let data = payload.get("data")?;
2372 let task_id = data.get("task_id").and_then(Value::as_str)?;
2373 Some(serde_json::json!({
2374 "provider": provider,
2375 "status": "submitted",
2376 "task_id": task_id,
2377 "state": data.get("state").and_then(Value::as_str).unwrap_or("submitted"),
2378 "result_url": data.get("full_zip_url").or_else(|| data.get("markdown_url")).cloned(),
2379 "raw": payload,
2380 }))
2381}
2382
2383fn ocr_input_from_file(
2384 file: String,
2385 item_key: Option<String>,
2386 attachment_key: Option<String>,
2387 mime_type: Option<String>,
2388) -> Result<OcrRequestInput, String> {
2389 let item_key = item_key
2390 .ok_or_else(|| "INVALID_ARGS: --item-key is required when using --file".to_string())?;
2391 let attachment_key = attachment_key.ok_or_else(|| {
2392 "INVALID_ARGS: --attachment-key is required when using --file".to_string()
2393 })?;
2394 let path = PathBuf::from(&file);
2395 let bytes = fs::read(&path).map_err(|err| format!("read {file}: {err}"))?;
2396 let file_name = path
2397 .file_name()
2398 .and_then(|name| name.to_str())
2399 .unwrap_or("document.pdf")
2400 .to_string();
2401 let mime_type = mime_type.unwrap_or_else(|| guess_mime_type(&path).to_string());
2402 Ok(OcrRequestInput {
2403 item_key,
2404 attachment_key,
2405 file_name,
2406 mime_type,
2407 content_base64: base64_encode(&bytes),
2408 source_url: None,
2409 local_path: Some(file),
2410 output_dir: None,
2411 })
2412}
2413
/// Guesses a MIME type from the extension of `path` (ASCII case-insensitive).
///
/// Recognises `png`, `jpg`/`jpeg`, `webp` and `gif`. Everything else,
/// including paths without an extension, is reported as `application/pdf`.
///
/// `gif` is mapped explicitly so that the set of image extensions agrees with
/// `is_image_file`; previously a GIF was labelled as a PDF.
fn guess_mime_type(path: &Path) -> &'static str {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();
    match extension.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "webp" => "image/webp",
        "gif" => "image/gif",
        _ => "application/pdf",
    }
}
2428
/// Encodes `bytes` as standard base64 (alphabet `A-Z a-z 0-9 + /`) with `=`
/// padding.
fn base64_encode(bytes: &[u8]) -> String {
    const ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    // Only the low six bits select a symbol; higher bits are masked away.
    let symbol = |index: u8| ALPHABET[usize::from(index & 0x3f)] as char;

    let mut encoded = String::with_capacity(bytes.len().div_ceil(3) * 4);
    let mut triples = bytes.chunks_exact(3);
    for triple in &mut triples {
        let (first, second, third) = (triple[0], triple[1], triple[2]);
        encoded.push(symbol(first >> 2));
        encoded.push(symbol((first << 4) | (second >> 4)));
        encoded.push(symbol((second << 2) | (third >> 6)));
        encoded.push(symbol(third));
    }
    // One or two trailing bytes are zero-extended and padded with '='.
    match triples.remainder() {
        &[first] => {
            encoded.push(symbol(first >> 2));
            encoded.push(symbol(first << 4));
            encoded.push_str("==");
        }
        &[first, second] => {
            encoded.push(symbol(first >> 2));
            encoded.push(symbol((first << 4) | (second >> 4)));
            encoded.push(symbol(second << 2));
            encoded.push('=');
        }
        _ => {}
    }
    encoded
}
2451
2452fn run_ocr_status_command(
2453 client: &mut impl RpcCaller,
2454 collection: String,
2455) -> Result<Value, String> {
2456 let collection_key = find_collection_in_tree(client, &collection)?
2457 .and_then(|node| node.get("key").cloned())
2458 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
2459 let raw = paginate_rpc(
2460 client,
2461 "collections.getItems",
2462 serde_json::json!({"key": collection_key}),
2463 500,
2464 )?;
2465 let items = raw
2466 .get("items")
2467 .and_then(Value::as_array)
2468 .or_else(|| raw.as_array())
2469 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
2470 .clone();
2471
2472 let mut has_ocr = 0usize;
2473 for item in &items {
2474 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
2475 if has_ocr_artifact(client, &item_key)? || has_ocr_note(client, &item_key)? {
2476 has_ocr += 1;
2477 }
2478 }
2479
2480 Ok(serde_json::json!({
2481 "collection": collection,
2482 "total": items.len(),
2483 "has_ocr": has_ocr,
2484 "missing_ocr": items.len() - has_ocr,
2485 }))
2486}
2487
2488fn has_ocr_artifact(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2489 if let Some(item_key) = item_key.as_str() {
2490 if machine_artifact_exists_for_item(
2493 machine_artifact_store_root(),
2494 item_key,
2495 MachineArtifactKind::Chunks,
2496 ) {
2497 return Ok(true);
2498 }
2499 }
2500
2501 let attachments = client.call(
2502 "attachments.list",
2503 Some(serde_json::json!({"parentKey": item_key.clone()})),
2504 )?;
2505 Ok(attachments.as_array().is_some_and(|attachments| {
2506 attachments.iter().any(|attachment| {
2507 let has_sidecar_chunks = attachment
2508 .get("path")
2509 .and_then(Value::as_str)
2510 .map(local_path_from_zotero_path)
2511 .as_deref()
2512 .map(Path::new)
2513 .and_then(Path::parent)
2514 .is_some_and(|dir| {
2515 machine_artifact_exists_in_sidecar(dir, MachineArtifactKind::Chunks)
2516 });
2517 if has_sidecar_chunks {
2518 return true;
2519 }
2520
2521 attachment
2523 .get("title")
2524 .and_then(Value::as_str)
2525 .is_some_and(|title| title.ends_with("zotron-chunks.jsonl"))
2526 })
2527 }))
2528}
2529
2530fn local_path_from_zotero_path(path: &str) -> String {
2531 if is_wsl() && path.as_bytes().get(1) == Some(&b':') {
2532 return ProcessCommand::new("wslpath")
2533 .arg("-u")
2534 .arg(path)
2535 .output()
2536 .ok()
2537 .filter(|output| output.status.success())
2538 .and_then(|output| String::from_utf8(output.stdout).ok())
2539 .map(|converted| converted.trim().to_string())
2540 .filter(|converted| !converted.is_empty())
2541 .unwrap_or_else(|| path.to_string());
2542 }
2543 path.to_string()
2544}
2545
2546fn has_ocr_note(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2547 let notes = client.call(
2548 "notes.list",
2549 Some(serde_json::json!({"parentKey": item_key.clone()})),
2550 )?;
2551 Ok(notes.as_array().is_some_and(|notes| {
2552 notes.iter().any(|note| {
2553 note.get("tags")
2554 .and_then(Value::as_array)
2555 .is_some_and(|tags| tags.iter().any(tag_is_ocr))
2556 })
2557 }))
2558}
2559
2560fn tag_is_ocr(tag: &Value) -> bool {
2561 tag.as_str() == Some("ocr")
2562 || tag
2563 .get("tag")
2564 .and_then(Value::as_str)
2565 .is_some_and(|tag| tag == "ocr")
2566}
2567
2568fn find_collection_in_tree(
2569 client: &mut impl RpcCaller,
2570 collection: &str,
2571) -> Result<Option<Value>, String> {
2572 let tree = client.call("collections.tree", None)?;
2573 let nodes = tree
2574 .as_array()
2575 .ok_or_else(|| "collections.tree returned non-array result".to_string())?;
2576 Ok(search_collection_tree(nodes, collection).cloned())
2577}
2578
2579fn search_collection_tree<'a>(nodes: &'a [Value], collection: &str) -> Option<&'a Value> {
2580 for node in nodes {
2581 if node.get("key").and_then(Value::as_str) == Some(collection)
2582 || node.get("name").and_then(Value::as_str) == Some(collection)
2583 {
2584 return Some(node);
2585 }
2586 if let Some(children) = node.get("children").and_then(Value::as_array) {
2587 if let Some(found) = search_collection_tree(children, collection) {
2588 return Some(found);
2589 }
2590 }
2591 }
2592 None
2593}
2594
2595fn run_command(command: Command, client: &mut impl RpcCaller) -> Result<String, String> {
2596 if let Command::Export(args) = command {
2597 return run_export(args, client);
2598 }
2599
2600 let (value, style) = match command {
2601 Command::Ping { .. } => (
2602 call_json(client, "system.ping", None)?,
2603 JsonStyle::PythonCompact,
2604 ),
2605 Command::Rpc {
2606 method,
2607 params_json,
2608 paginate,
2609 page_size,
2610 ..
2611 } => {
2612 let params = serde_json::from_str::<Value>(¶ms_json)
2613 .map_err(|err| format!("INVALID_JSON: params must be a JSON object: {err}"))?;
2614 if !params.is_object() {
2615 return Err("INVALID_JSON: params must be a JSON object".to_string());
2616 }
2617 if paginate {
2618 (
2619 paginate_rpc(client, &method, params, page_size)?,
2620 JsonStyle::Pretty,
2621 )
2622 } else {
2623 (call_json(client, &method, Some(params))?, JsonStyle::Pretty)
2624 }
2625 }
2626 Command::Push {
2627 json_file,
2628 pdf,
2629 collection,
2630 on_duplicate,
2631 dry_run,
2632 ..
2633 } => return run_push_command(json_file, pdf, collection, on_duplicate, dry_run, client),
2634 Command::System { command } => run_system_command(command, client)?,
2635 Command::Search(args) => {
2636 if let Some(mgmt) = args.management {
2637 run_search_management_command(mgmt, client)?
2638 } else {
2639 run_search(args, client)?
2640 }
2641 }
2642 Command::Items { command } => run_items_command(command, client)?,
2643 Command::Collections { command } => run_collections_command(command, client)?,
2644 Command::Notes { command } => run_notes_command(command, client)?,
2645 Command::Attachments { command } => run_attachments_command(command, client)?,
2646 Command::Settings { command } => run_settings_command(command, client)?,
2647 Command::Tags { command } => run_tags_command(command, client)?,
2648 Command::Annotations { command } => run_annotations_command(command, client)?,
2649 Command::Ocr { command } => {
2650 return run_ocr_command(command, client);
2651 }
2652 Command::Rag { command } => {
2653 return run_rag_command(command, client);
2654 }
2655 Command::Export(_) => unreachable!("export commands return raw output above"),
2656 Command::FindPdfs {
2657 collection, limit, ..
2658 } => run_find_pdfs_command(client, collection, limit)?,
2659 };
2660
2661 format_json(&value, style)
2662}
2663
2664fn run_rag_command(command: RagCommand, client: &mut impl RpcCaller) -> Result<String, String> {
2665 match command {
2666 RagCommand::Providers => format_json(
2667 &serde_json::json!({
2668 "providers": [
2669 embedding_provider_spec("volcengine")?,
2670 embedding_provider_spec("alibaba")?,
2671 embedding_provider_spec("custom")?,
2672 ],
2673 }),
2674 JsonStyle::Pretty,
2675 ),
2676 RagCommand::Embed {
2677 provider,
2678 input,
2679 endpoint,
2680 model,
2681 input_type,
2682 api_key_env,
2683 } => {
2684 let value = run_embedding_provider_json_command(
2685 provider,
2686 input,
2687 endpoint,
2688 model,
2689 input_type,
2690 api_key_env,
2691 )?;
2692 format_json(&value, JsonStyle::PythonCompact)
2693 }
2694 RagCommand::Status { collection, .. } => {
2695 let value = rag_status_value(client, &collection)?;
2696 format_json(&value, JsonStyle::PythonCompact)
2697 }
2698 RagCommand::Search {
2699 query,
2700 collection,
2701 keys,
2702 zotero,
2703 top_spans_per_item,
2704 include_fulltext_spans,
2705 top_k,
2706 output,
2707 ..
2708 } => run_rag_search_command(
2709 client,
2710 RagSearchOptions {
2711 query,
2712 collection,
2713 keys,
2714 zotero,
2715 top_spans_per_item,
2716 include_fulltext_spans,
2717 top_k,
2718 output,
2719 },
2720 ),
2721 }
2722}
2723
2724fn run_embedding_provider_json_command(
2725 provider: String,
2726 input: String,
2727 endpoint: Option<String>,
2728 model: Option<String>,
2729 input_type: Option<String>,
2730 api_key_env: Option<String>,
2731) -> Result<Value, String> {
2732 let mut input: EmbeddingRequestInput = read_json_input(&input)?;
2733 if endpoint.is_some() {
2734 input.url = endpoint;
2735 }
2736 if model.is_some() {
2737 input.model = model;
2738 }
2739 if input_type.is_some() {
2740 input.input_type = input_type;
2741 }
2742 let mut transport = provider_http_transport(api_key_env.as_deref())?;
2743 let vectors = execute_embedding_provider_request(&provider, &input, &mut transport)?;
2744
2745 Ok(serde_json::json!({
2746 "provider": provider,
2747 "vectors": vectors,
2748 }))
2749}
2750
2751fn provider_http_transport(api_key_env: Option<&str>) -> Result<UreqProviderHttpTransport, String> {
2752 provider_http_transport_with_auth(api_key_env, "bearer")
2753}
2754
2755fn provider_http_transport_with_auth(
2756 api_key_env: Option<&str>,
2757 auth_scheme: &str,
2758) -> Result<UreqProviderHttpTransport, String> {
2759 let Some(env_name) = api_key_env else {
2760 return Ok(UreqProviderHttpTransport::new());
2761 };
2762 let token = env::var(env_name)
2763 .map_err(|_| format!("missing provider credential env var {env_name}"))?;
2764 if token.trim().is_empty() {
2765 return Err(format!("provider credential env var {env_name} is empty"));
2766 }
2767 let token = token.trim();
2768 match auth_scheme {
2769 "token" if token.starts_with("token ") => {
2770 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2771 }
2772 "token" => Ok(UreqProviderHttpTransport::with_api_key(format!(
2773 "token {token}"
2774 ))),
2775 "bearer" if token.starts_with("Bearer ") => {
2776 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2777 }
2778 "bearer" => Ok(UreqProviderHttpTransport::with_bearer_token(token)),
2779 "none" => Ok(UreqProviderHttpTransport::new()),
2780 other => Err(format!("unsupported provider auth scheme {other}")),
2781 }
2782}
2783
2784fn read_json_input<T: serde::de::DeserializeOwned>(path: &str) -> Result<T, String> {
2785 let payload = if path == "-" {
2786 let mut input = String::new();
2787 io::stdin()
2788 .read_to_string(&mut input)
2789 .map_err(|err| format!("read stdin: {err}"))?;
2790 input
2791 } else {
2792 fs::read_to_string(path).map_err(|err| format!("read {path}: {err}"))?
2793 };
2794 serde_json::from_str::<T>(&payload)
2795 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))
2796}
2797
2798fn fetch_embedding_settings(
2799 client: &mut impl RpcCaller,
2800) -> Result<(String, String, String, String), String> {
2801 let settings = client.call("settings.getAll", None)?;
2802 let provider = settings
2803 .get("embedding.provider")
2804 .and_then(Value::as_str)
2805 .unwrap_or("ollama")
2806 .to_string();
2807 let model = settings
2808 .get("embedding.model")
2809 .and_then(Value::as_str)
2810 .unwrap_or("")
2811 .to_string();
2812 let api_url = settings
2813 .get("embedding.apiUrl")
2814 .and_then(Value::as_str)
2815 .unwrap_or("")
2816 .to_string();
2817 let raw = client.call("settings.getRaw", Some(serde_json::json!({"key": "embedding.apiKey"})))?;
2818 let api_key = raw
2819 .get("embedding.apiKey")
2820 .and_then(Value::as_str)
2821 .unwrap_or("")
2822 .to_string();
2823 Ok((provider, model, api_url, api_key))
2824}
2825
2826fn fetch_retrieval_mode(client: &mut impl RpcCaller) -> String {
2827 client
2828 .call(
2829 "settings.get",
2830 Some(serde_json::json!({"key": "rag.retrievalMode"})),
2831 )
2832 .ok()
2833 .and_then(|v| {
2834 v.get("rag.retrievalMode")
2835 .and_then(Value::as_str)
2836 .map(String::from)
2837 })
2838 .unwrap_or_else(|| "hybrid".to_string())
2839}
2840
2841fn resolve_sidecar_paths(
2842 client: &mut impl RpcCaller,
2843 collection: Option<&str>,
2844 keys: &[String],
2845) -> Result<Vec<(String, String, PathBuf)>, String> {
2846 let items = if !keys.is_empty() {
2847 let mut items = Vec::new();
2848 for key in keys {
2849 let item = client.call("items.get", Some(serde_json::json!({"key": key})))?;
2850 items.push(item);
2851 }
2852 items
2853 } else if let Some(col) = collection {
2854 let col_key = resolve_collection(client, col)?;
2855 let response = client.call(
2856 "collections.getItems",
2857 Some(serde_json::json!({"key": col_key})),
2858 )?;
2859 collection_items(&response)
2860 } else {
2861 return Err("INVALID_ARGS: --collection or --key required".into());
2862 };
2863
2864 let mut results = Vec::new();
2865 for item in &items {
2866 let item_key = item.get("key").and_then(Value::as_str).unwrap_or_default();
2867 let attachments = client.call(
2868 "attachments.list",
2869 Some(serde_json::json!({"parentKey": item_key})),
2870 )?;
2871 let att_list = attachments
2872 .get("items")
2873 .and_then(Value::as_array)
2874 .or_else(|| attachments.as_array())
2875 .cloned()
2876 .unwrap_or_default();
2877 for att in &att_list {
2878 let content_type = att
2879 .get("contentType")
2880 .and_then(Value::as_str)
2881 .unwrap_or("");
2882 if content_type != "application/pdf" {
2883 continue;
2884 }
2885 let att_key = att.get("key").and_then(Value::as_str).unwrap_or_default();
2886 let path = att.get("path").and_then(Value::as_str).unwrap_or_default();
2887 if path.is_empty() {
2888 continue;
2889 }
2890 let local_path = local_path_from_zotero_path(path);
2891 let pdf_path = PathBuf::from(&local_path);
2892 if let Some(parent) = pdf_path.parent() {
2893 let sidecar_root = parent.join(".zotron");
2894 if sidecar_root.exists() {
2895 results.push((item_key.to_string(), att_key.to_string(), sidecar_root));
2896 }
2897 }
2898 }
2899 }
2900 Ok(results)
2901}
2902
2903fn load_sidecar_chunks(sidecar_root: &Path) -> Vec<StructureChunk> {
2904 let chunks_path = sidecar_root.join("chunks").join("chunks.v1.jsonl");
2905 let Ok(content) = fs::read_to_string(&chunks_path) else {
2906 return Vec::new();
2907 };
2908 content
2909 .lines()
2910 .filter(|line| !line.trim().is_empty())
2911 .filter_map(|line| serde_json::from_str::<StructureChunk>(line).ok())
2912 .collect()
2913}
2914
/// Derives the vector file name for a provider/model pair.
///
/// Both parts are trimmed, lowercased, and have `/` replaced by `-`. The
/// result is `<provider>--<model>.jsonl`, or `vectors.jsonl` when both parts
/// are empty after normalisation.
fn embedding_vector_filename(provider: &str, model: &str) -> String {
    let normalise = |raw: &str| raw.trim().to_lowercase().replace('/', "-");
    let provider_part = normalise(provider);
    let model_part = normalise(model);
    match (provider_part.is_empty(), model_part.is_empty()) {
        (true, true) => "vectors.jsonl".to_string(),
        _ => format!("{provider_part}--{model_part}.jsonl"),
    }
}
2923
2924fn load_sidecar_vectors(sidecar_root: &Path, provider: &str, model: &str) -> Vec<EmbeddingVector> {
2925 let embeddings_dir = sidecar_root.join("embeddings");
2926 let target = embedding_vector_filename(provider, model);
2927 let target_path = embeddings_dir.join(&target);
2928 if let Ok(content) = fs::read_to_string(&target_path) {
2929 let vecs: Vec<EmbeddingVector> = content
2930 .lines()
2931 .filter(|line| !line.trim().is_empty())
2932 .filter_map(|line| serde_json::from_str(line).ok())
2933 .collect();
2934 if !vecs.is_empty() {
2935 return vecs;
2936 }
2937 }
2938 for legacy in &["vectors.v1.jsonl", "vectors.jsonl"] {
2940 let path = embeddings_dir.join(legacy);
2941 if let Ok(content) = fs::read_to_string(&path) {
2942 let vecs: Vec<EmbeddingVector> = content
2943 .lines()
2944 .filter(|line| !line.trim().is_empty())
2945 .filter_map(|line| serde_json::from_str::<EmbeddingVector>(line).ok())
2946 .filter(|v| v.source_provider == provider || provider.is_empty())
2947 .collect();
2948 if !vecs.is_empty() {
2949 return vecs;
2950 }
2951 }
2952 }
2953 Vec::new()
2954}
2955
2956fn embed_query_text(
2957 query: &str,
2958 provider: &str,
2959 model: &str,
2960 api_url: &str,
2961 api_key: &str,
2962) -> Result<Vec<f64>, String> {
2963 let input = EmbeddingRequestInput {
2964 item_key: "query".to_string(),
2965 chunks: vec![EmbeddingChunkInput {
2966 chunk_key: "q0".to_string(),
2967 text: query.to_string(),
2968 }],
2969 model: if model.is_empty() {
2970 None
2971 } else {
2972 Some(model.to_string())
2973 },
2974 url: if api_url.is_empty() {
2975 None
2976 } else {
2977 Some(api_url.to_string())
2978 },
2979 input_type: Some("query".to_string()),
2980 };
2981 let request = build_embedding_provider_request(provider, &input)?;
2982 let url = request
2983 .url
2984 .as_deref()
2985 .ok_or("no embedding URL configured")?;
2986 let mut http = ureq::post(url).set("Content-Type", "application/json");
2987 if let Some(auth) = request.auth_header {
2988 if !api_key.is_empty() {
2989 http = http.set(auth, &format!("Bearer {api_key}"));
2990 }
2991 }
2992 let resp = http
2993 .send_json(&request.body)
2994 .map_err(|e| format!("embedding request failed: {e}"))?;
2995 let payload: Value = resp
2996 .into_json()
2997 .map_err(|e| format!("embedding response parse: {e}"))?;
2998 let vectors =
2999 parse_embedding_provider_response(provider, &payload, "query", &input.chunks)?;
3000 vectors
3001 .into_iter()
3002 .next()
3003 .map(|v| v.vector)
3004 .ok_or_else(|| "no embedding vector returned".to_string())
3005}
3006
3007fn run_rag_search_xpi_fallback(
3008 client: &mut impl RpcCaller,
3009 options: &RagSearchOptions,
3010) -> Result<String, String> {
3011 let mut params = serde_json::json!({
3012 "query": options.query,
3013 "limit": options.top_k,
3014 "top_spans_per_item": options.top_spans_per_item,
3015 "include_fulltext_spans": options.include_fulltext_spans,
3016 });
3017 if let Some(map) = params.as_object_mut() {
3018 if let Some(col) = &options.collection {
3019 map.insert("collection".into(), Value::String(col.clone()));
3020 }
3021 if !options.keys.is_empty() {
3022 map.insert(
3023 "keys".into(),
3024 Value::Array(options.keys.iter().map(|k| Value::String(k.clone())).collect()),
3025 );
3026 }
3027 }
3028 let payload = client.call("rag.searchHits", Some(params))?;
3029 let hits = payload
3030 .get("hits")
3031 .and_then(Value::as_array)
3032 .cloned()
3033 .unwrap_or_default();
3034 if options.output == "jsonl" {
3035 let mut out = String::new();
3036 for hit in &hits {
3037 out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
3038 out.push('\n');
3039 }
3040 Ok(out)
3041 } else {
3042 let total = hits.len() as u64;
3043 format_json(
3044 &normalize_list_envelope(
3045 serde_json::json!({"items": hits, "total": total}),
3046 "items",
3047 Some(options.top_k),
3048 0,
3049 ),
3050 JsonStyle::Pretty,
3051 )
3052 }
3053}
3054
3055fn run_rag_search_command(
3056 client: &mut impl RpcCaller,
3057 options: RagSearchOptions,
3058) -> Result<String, String> {
3059 if options.zotero {
3061 if options.collection.is_none() && options.keys.is_empty() {
3062 return Err(
3063 "INVALID_ARGS: --collection or --key is required".to_string(),
3064 );
3065 }
3066 return run_rag_search_xpi_fallback(client, &options);
3067 }
3068
3069 if options.collection.is_none() && options.keys.is_empty() {
3071 return Err("INVALID_ARGS: --collection or --key required".to_string());
3072 }
3073
3074 let sidecars = resolve_sidecar_paths(
3076 client,
3077 options.collection.as_deref(),
3078 &options.keys,
3079 );
3080
3081 let sidecars = match sidecars {
3084 Ok(ref s) if !s.is_empty() => s,
3085 Err(ref e) if e.contains("COLLECTION_NOT_FOUND") => return Err(e.clone()),
3086 _ => return run_rag_search_xpi_fallback(client, &options),
3087 };
3088
3089 let (emb_provider, emb_model, emb_url, emb_key) = fetch_embedding_settings(client)?;
3091 let mut all_chunks: Vec<StructureChunk> = Vec::new();
3092 let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
3093 for (_item_key, _att_key, sidecar_root) in sidecars {
3094 all_chunks.extend(load_sidecar_chunks(sidecar_root));
3095 all_vectors.extend(load_sidecar_vectors(sidecar_root, &emb_provider, &emb_model));
3096 }
3097
3098 if all_chunks.is_empty() {
3099 return run_rag_search_xpi_fallback(client, &options);
3100 }
3101
3102 let mode = fetch_retrieval_mode(client);
3104
3105 let bm25_ranked = if mode != "dense" {
3107 bm25_score_chunks(&all_chunks, &options.query, 1.2, 0.75)
3108 } else {
3109 Vec::new()
3110 };
3111
3112 let dense_ranked = if mode != "lexical" && !all_vectors.is_empty() {
3114 match embed_query_text(&options.query, &emb_provider, &emb_model, &emb_url, &emb_key) {
3115 Ok(query_vec) => {
3116 let vec_map: std::collections::HashMap<&str, &[f64]> = all_vectors
3117 .iter()
3118 .map(|v| (v.chunk_key.as_str(), v.vector.as_slice()))
3119 .collect();
3120 let mut scores: Vec<(usize, f64)> = all_chunks
3121 .iter()
3122 .enumerate()
3123 .filter_map(|(i, chunk)| {
3124 vec_map.get(chunk.chunk_key.as_str()).map(|stored| {
3125 (i, cosine_similarity(&query_vec, stored))
3126 })
3127 })
3128 .filter(|(_, s)| *s > 0.0)
3129 .collect();
3130 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
3131 scores
3132 }
3133 Err(_) => Vec::new(),
3134 }
3135 } else {
3136 Vec::new()
3137 };
3138
3139 let limit = options.top_k as usize;
3141 let ranked = if !bm25_ranked.is_empty() && !dense_ranked.is_empty() {
3142 rrf_merge(&bm25_ranked, &dense_ranked, 60.0, limit)
3143 } else if !bm25_ranked.is_empty() {
3144 bm25_ranked.into_iter().take(limit).collect()
3145 } else {
3146 dense_ranked.into_iter().take(limit).collect()
3147 };
3148
3149 let mut per_item_count: std::collections::HashMap<&str, u64> =
3151 std::collections::HashMap::new();
3152 let mut selected: Vec<(usize, f64)> = Vec::new();
3153 for (idx, score) in &ranked {
3154 let item_key = all_chunks[*idx].item_key.as_str();
3155 let count = per_item_count.entry(item_key).or_insert(0);
3156 if *count < options.top_spans_per_item {
3157 *count += 1;
3158 selected.push((*idx, *score));
3159 }
3160 }
3161
3162 let mut meta_cache: std::collections::HashMap<String, Value> =
3164 std::collections::HashMap::new();
3165 let mut hits: Vec<Value> = Vec::new();
3166 for (idx, score) in &selected {
3167 let chunk = &all_chunks[*idx];
3168 let meta = if let Some(cached) = meta_cache.get(&chunk.item_key) {
3169 cached.clone()
3170 } else {
3171 let fetched = client
3172 .call(
3173 "items.get",
3174 Some(serde_json::json!({"key": chunk.item_key})),
3175 )
3176 .unwrap_or(Value::Null);
3177 meta_cache.insert(chunk.item_key.clone(), fetched.clone());
3178 fetched
3179 };
3180 let title = meta
3181 .get("title")
3182 .and_then(Value::as_str)
3183 .unwrap_or("")
3184 .to_string();
3185 let authors = meta
3186 .get("creators")
3187 .and_then(Value::as_array)
3188 .map(|creators| {
3189 creators
3190 .iter()
3191 .filter_map(|c| {
3192 let last = c.get("lastName").and_then(Value::as_str).unwrap_or("");
3193 let first = c.get("firstName").and_then(Value::as_str).unwrap_or("");
3194 if last.is_empty() && first.is_empty() {
3195 None
3196 } else {
3197 Some(format!("{last}{first}"))
3198 }
3199 })
3200 .collect::<Vec<_>>()
3201 .join(", ")
3202 })
3203 .unwrap_or_default();
3204 let year = meta.get("date").and_then(Value::as_str).unwrap_or("");
3205 let mut hit = serde_json::json!({
3206 "item_key": chunk.item_key,
3207 "chunk_key": chunk.chunk_key,
3208 "title": title,
3209 "authors": authors,
3210 "year": year,
3211 "text": chunk.text,
3212 "page_range": chunk.page_range,
3213 "section_path": chunk.section_path,
3214 "score": score,
3215 });
3216 if options.include_fulltext_spans {
3217 hit.as_object_mut().unwrap().insert(
3218 "attachment_key".to_string(),
3219 Value::String(chunk.attachment_key.clone()),
3220 );
3221 }
3222 hits.push(hit);
3223 }
3224
3225 if options.output == "jsonl" {
3227 let mut out = String::new();
3228 for hit in &hits {
3229 out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
3230 out.push('\n');
3231 }
3232 Ok(out)
3233 } else {
3234 let total = hits.len() as u64;
3235 format_json(
3236 &normalize_list_envelope(
3237 serde_json::json!({"items": hits, "total": total}),
3238 "items",
3239 Some(options.top_k),
3240 0,
3241 ),
3242 JsonStyle::Pretty,
3243 )
3244 }
3245}
3246
3247fn rag_status_value(client: &mut impl RpcCaller, collection: &str) -> Result<Value, String> {
3248 let raw_store_path = rag_store_path(collection);
3249 if raw_store_path.exists() {
3250 return rag_status_from_store(collection, &raw_store_path);
3251 }
3252
3253 let mut store_candidates = Vec::new();
3254 let collection_match = find_collection_in_tree(client, collection)?;
3255 if let Some(collection_node) = collection_match.as_ref() {
3256 if let Some(name) = collection_node.get("name").and_then(Value::as_str) {
3257 store_candidates.push(rag_store_path(name));
3258 }
3259 if let Some(key) = collection_node.get("key").and_then(Value::as_str) {
3260 store_candidates.push(rag_store_path(key));
3261 }
3262 }
3263 for store_path in unique_paths(store_candidates) {
3264 if store_path.exists() {
3265 return rag_status_from_store(collection, &store_path);
3266 }
3267 }
3268
3269 rag_status_from_zotero_sidecars(client, collection, collection_match)
3270}
3271
/// Removes duplicate paths while keeping the first occurrence of each, in
/// the original order.
fn unique_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
    paths.into_iter().fold(Vec::new(), |mut kept, candidate| {
        if !kept.contains(&candidate) {
            kept.push(candidate);
        }
        kept
    })
}
3281
3282fn rag_status_from_store(collection: &str, store_path: &Path) -> Result<Value, String> {
3283 let raw = fs::read_to_string(store_path)
3284 .map_err(|err| format!("read RAG store {}: {err}", store_path.display()))?;
3285 let store: Value = serde_json::from_str(&raw)
3286 .map_err(|err| format!("parse RAG store {}: {err}", store_path.display()))?;
3287 let chunks = store
3288 .get("chunks")
3289 .and_then(Value::as_array)
3290 .cloned()
3291 .unwrap_or_default();
3292 let mut item_keys = Vec::<Value>::new();
3293 for chunk in &chunks {
3294 let Some(item_key) = chunk.get("item_key") else {
3295 continue;
3296 };
3297 if !item_keys.iter().any(|seen| seen == item_key) {
3298 item_keys.push(item_key.clone());
3299 }
3300 }
3301 Ok(serde_json::json!({
3302 "status": "indexed",
3303 "collection": store.get("collection").and_then(Value::as_str).unwrap_or(collection),
3304 "collection_key": store.get("collection_key").cloned().unwrap_or(Value::Null),
3305 "model": store.get("model").cloned().unwrap_or(Value::String("unknown".to_string())),
3306 "total_chunks": chunks.len(),
3307 "total_items": item_keys.len(),
3308 "store_path": store_path.to_string_lossy(),
3309 }))
3310}
3311
3312fn rag_status_from_zotero_sidecars(
3313 client: &mut impl RpcCaller,
3314 collection: &str,
3315 collection_match: Option<Value>,
3316) -> Result<Value, String> {
3317 let collection_key = collection_match
3318 .as_ref()
3319 .and_then(|node| node.get("key").cloned())
3320 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
3321 let raw = paginate_rpc(
3322 client,
3323 "collections.getItems",
3324 serde_json::json!({"key": collection_key}),
3325 500,
3326 )?;
3327 let items = raw
3328 .get("items")
3329 .and_then(Value::as_array)
3330 .or_else(|| raw.as_array())
3331 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
3332 .clone();
3333
3334 let mut indexed_items = 0usize;
3335 let mut total_chunks = 0usize;
3336 for item in &items {
3337 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
3338 let chunk_count = sidecar_chunk_count_for_item(client, &item_key)?;
3339 if chunk_count > 0 {
3340 indexed_items += 1;
3341 total_chunks += chunk_count;
3342 }
3343 }
3344
3345 if indexed_items == 0 {
3346 return Ok(serde_json::json!({
3347 "status": "not indexed",
3348 "collection": collection,
3349 "total_items": items.len(),
3350 "indexed_items": 0,
3351 }));
3352 }
3353
3354 Ok(serde_json::json!({
3355 "status": "indexed",
3356 "collection": collection,
3357 "total_chunks": total_chunks,
3358 "total_items": indexed_items,
3359 "collection_items": items.len(),
3360 "source": "zotero-sidecar",
3361 }))
3362}
3363
3364fn sidecar_chunk_count_for_item(
3365 client: &mut impl RpcCaller,
3366 item_key: &Value,
3367) -> Result<usize, String> {
3368 let attachments = client.call(
3369 "attachments.list",
3370 Some(serde_json::json!({"parentKey": item_key.clone()})),
3371 )?;
3372 let Some(attachments) = attachments.as_array() else {
3373 return Ok(0);
3374 };
3375
3376 let mut count = 0usize;
3377 for attachment in attachments {
3378 let Some(path) = attachment.get("path").and_then(Value::as_str) else {
3379 continue;
3380 };
3381 let local = local_path_from_zotero_path(path);
3382 let Some(dir) = Path::new(&local).parent() else {
3383 continue;
3384 };
3385 let Ok(bytes) = read_machine_artifact_sidecar(dir, MachineArtifactKind::Chunks) else {
3386 continue;
3387 };
3388 let text = String::from_utf8_lossy(&bytes);
3389 count += text.lines().filter(|line| !line.trim().is_empty()).count();
3390 }
3391 Ok(count)
3392}
3393
3394fn rag_store_path(collection: &str) -> PathBuf {
3395 rag_store_root().join(format!("{collection}.json"))
3396}
3397
3398fn rag_store_root() -> PathBuf {
3399 let xdg_data_home = env::var_os("XDG_DATA_HOME")
3400 .filter(|path| !path.is_empty())
3401 .map(PathBuf::from);
3402 let appdata = env::var_os("APPDATA")
3403 .filter(|path| !path.is_empty())
3404 .map(PathBuf::from);
3405 let userprofile = env::var_os("USERPROFILE")
3406 .filter(|path| !path.is_empty())
3407 .map(PathBuf::from);
3408 let home = env::var_os("HOME")
3409 .filter(|path| !path.is_empty())
3410 .map(PathBuf::from);
3411
3412 rag_store_root_for_platform(
3413 ArtifactStorePlatform::current(),
3414 xdg_data_home.as_deref(),
3415 appdata.as_deref(),
3416 userprofile.as_deref(),
3417 home.as_deref(),
3418 )
3419}
3420
3421fn rag_store_root_for_platform(
3422 platform: ArtifactStorePlatform,
3423 xdg_data_home: Option<&Path>,
3424 appdata: Option<&Path>,
3425 userprofile: Option<&Path>,
3426 home: Option<&Path>,
3427) -> PathBuf {
3428 match platform {
3429 ArtifactStorePlatform::Windows => {
3430 if let Some(path) = appdata {
3431 return path.join("Zotron").join("rag");
3432 }
3433 if let Some(path) = userprofile {
3434 return path
3435 .join("AppData")
3436 .join("Roaming")
3437 .join("Zotron")
3438 .join("rag");
3439 }
3440 if let Some(path) = home {
3441 return path
3442 .join("AppData")
3443 .join("Roaming")
3444 .join("Zotron")
3445 .join("rag");
3446 }
3447 PathBuf::from(".zotron").join("rag")
3448 }
3449 ArtifactStorePlatform::Macos => {
3450 if let Some(path) = home {
3451 return path
3452 .join("Library")
3453 .join("Application Support")
3454 .join("Zotron")
3455 .join("rag");
3456 }
3457 if let Some(path) = xdg_data_home {
3458 return path.join("zotron").join("rag");
3459 }
3460 PathBuf::from(".zotron").join("rag")
3461 }
3462 ArtifactStorePlatform::Linux | ArtifactStorePlatform::Other => xdg_data_home
3463 .map(|path| path.join("zotron").join("rag"))
3464 .or_else(|| {
3465 home.map(|path| path.join(".local").join("share").join("zotron").join("rag"))
3466 })
3467 .unwrap_or_else(|| PathBuf::from(".zotron").join("rag")),
3468 }
3469}
3470
3471fn run_push_command(
3472 json_file: String,
3473 pdf: Option<String>,
3474 collection: Option<String>,
3475 on_duplicate: String,
3476 dry_run: bool,
3477 client: &mut impl RpcCaller,
3478) -> Result<String, String> {
3479 if !matches!(on_duplicate.as_str(), "skip" | "update" | "create") {
3480 return Err(format!(
3481 "INVALID_ARGS: --on-duplicate must be skip|update|create, got {on_duplicate:?}"
3482 ));
3483 }
3484
3485 let payload = if json_file == "-" {
3486 let mut input = String::new();
3487 io::stdin()
3488 .read_to_string(&mut input)
3489 .map_err(|err| format!("read stdin: {err}"))?;
3490 input
3491 } else {
3492 fs::read_to_string(&json_file).map_err(|err| format!("read {json_file}: {err}"))?
3493 };
3494 let item_json = serde_json::from_str::<Value>(&payload)
3495 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
3496
3497 match item_json.get("itemType").and_then(Value::as_str) {
3499 Some(s) if !s.is_empty() => {}
3500 _ => return Err("INVALID_ARGS: input JSON must include a non-empty \"itemType\" field".to_string()),
3501 }
3502
3503 if dry_run {
3504 let collection_key = collection
3505 .as_deref()
3506 .map(|name| resolve_collection(client, name))
3507 .transpose()?;
3508 return format_json(
3509 &serde_json::json!({
3510 "ok": true,
3511 "dryRun": true,
3512 "wouldPush": {
3513 "title": item_json.get("title").cloned().unwrap_or(Value::Null),
3514 "itemType": item_json.get("itemType").cloned().unwrap_or(Value::Null),
3515 "collectionKey": collection_key,
3516 "pdfPath": pdf,
3517 "onDuplicate": on_duplicate,
3518 }
3519 }),
3520 JsonStyle::PythonCompact,
3521 );
3522 }
3523
3524 let result = push_item(
3525 client,
3526 &item_json,
3527 pdf.as_deref(),
3528 collection.as_deref(),
3529 &on_duplicate,
3530 )?;
3531 format_json(&result, JsonStyle::PythonCompact)
3532}
3533
3534fn push_item(
3535 client: &mut impl RpcCaller,
3536 item_json: &Value,
3537 pdf_path: Option<&str>,
3538 collection: Option<&str>,
3539 on_duplicate: &str,
3540) -> Result<Value, String> {
3541 let pdf_size = if let Some(path) = pdf_path {
3542 validate_pdf_magic(path)?
3543 } else {
3544 0
3545 };
3546
3547 let collection_key = match collection {
3548 Some(name) => resolve_collection(client, name)?,
3549 None => resolve_current_collection(client)?,
3550 };
3551
3552 let dup_id = find_duplicate(client, item_json)?;
3553 if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "skip") {
3554 if !is_library_root(&collection_key) {
3555 client.call(
3556 "collections.addItems",
3557 Some(serde_json::json!({"key": collection_key, "keys": [dup_id]})),
3558 )?;
3559 }
3560 let mut pdf_attached = false;
3561 if let Some(path) = pdf_path {
3562 if !item_has_pdf_attachment(client, dup_id)? {
3563 attach_pdf(client, dup_id, path)?;
3564 pdf_attached = true;
3565 }
3566 }
3567 return Ok(push_result(
3568 "skipped_duplicate",
3569 Some(dup_id.to_string()),
3570 pdf_attached,
3571 if pdf_attached { pdf_size } else { 0 },
3572 Value::Null,
3573 ));
3574 }
3575
3576 let xpi_payload = to_xpi_payload(item_json, Some(&collection_key));
3577 let (item_key, status) =
3578 if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "update") {
3579 let mut params = serde_json::Map::new();
3580 params.insert("key".to_string(), Value::String(dup_id.to_string()));
3581 params.insert(
3582 "fields".to_string(),
3583 xpi_payload
3584 .get("fields")
3585 .cloned()
3586 .unwrap_or_else(|| serde_json::json!({})),
3587 );
3588 if let Some(creators) = xpi_payload.get("creators") {
3589 params.insert("creators".to_string(), creators.clone());
3590 }
3591 if let Some(tags) = xpi_payload.get("tags") {
3592 params.insert("tags".to_string(), tags.clone());
3593 }
3594 client.call("items.update", Some(Value::Object(params)))?;
3595 (dup_id.to_string(), "updated")
3596 } else {
3597 let created = client.call("items.create", Some(xpi_payload))?;
3598 let key = created
3599 .get("key")
3600 .and_then(Value::as_str)
3601 .ok_or_else(|| format!("items.create returned unexpected shape: {created:?}"))?;
3602 (key.to_string(), "created")
3603 };
3604
3605 let mut pdf_attached = false;
3606 if let Some(path) = pdf_path {
3607 if status != "updated" || !item_has_pdf_attachment(client, &item_key)? {
3608 attach_pdf(client, &item_key, path)?;
3609 pdf_attached = true;
3610 }
3611 }
3612
3613 if status == "updated" && !is_library_root(&collection_key) {
3614 client.call(
3615 "collections.addItems",
3616 Some(serde_json::json!({"key": collection_key, "keys": [item_key]})),
3617 )?;
3618 }
3619
3620 Ok(push_result(
3621 status,
3622 Some(item_key),
3623 pdf_attached,
3624 if pdf_attached { pdf_size } else { 0 },
3625 Value::Null,
3626 ))
3627}
3628
/// Checks that `path` starts with the `%PDF-` magic bytes and returns the file size.
///
/// Only the five-byte header is read; the size comes from the file metadata,
/// so large PDFs are not loaded into memory just to be validated.
///
/// # Errors
///
/// Returns an `INVALID_PDF: ...` message when the file cannot be opened or
/// read, is shorter than the magic header, or does not start with `%PDF-`.
fn validate_pdf_magic(path: &str) -> Result<u64, String> {
    const PDF_MAGIC: &[u8; 5] = b"%PDF-";
    // Every failure mode reports the same message.
    let invalid = || format!("INVALID_PDF: {path} does not start with %PDF- magic bytes");

    let mut file = fs::File::open(path).map_err(|_| invalid())?;
    let mut header = [0u8; 5];
    // `read_exact` fails with `UnexpectedEof` for files shorter than the header.
    file.read_exact(&mut header).map_err(|_| invalid())?;
    if &header != PDF_MAGIC {
        return Err(invalid());
    }
    let size = file.metadata().map_err(|_| invalid())?.len();
    Ok(size)
}
3639
3640fn resolve_current_collection(client: &mut impl RpcCaller) -> Result<Value, String> {
3641 let selected = client.call("system.currentCollection", None)?;
3642 Ok(selected
3643 .get("key")
3644 .cloned()
3645 .unwrap_or_else(|| Value::Number(0.into())))
3646}
3647
3648fn find_duplicate(
3649 client: &mut impl RpcCaller,
3650 item_json: &Value,
3651) -> Result<Option<String>, String> {
3652 if let Some(doi) = item_json
3653 .get("DOI")
3654 .and_then(Value::as_str)
3655 .filter(|doi| !doi.is_empty())
3656 {
3657 let hits = client.call("search.byIdentifier", Some(serde_json::json!({"doi": doi})))?;
3658 if let Some(key) = first_hit_key(&hits) {
3659 return Ok(Some(key));
3660 }
3661 }
3662
3663 if let Some(title) = item_json
3664 .get("title")
3665 .and_then(Value::as_str)
3666 .filter(|title| title.len() >= 10)
3667 {
3668 let hits = client.call(
3669 "search.quick",
3670 Some(serde_json::json!({"query": title, "limit": 20})),
3671 )?;
3672 if let Some(items) = response_items(&hits) {
3673 for item in items {
3674 if item.get("title").and_then(Value::as_str) == Some(title) {
3675 if let Some(key) = item.get("key").and_then(Value::as_str) {
3676 return Ok(Some(key.to_string()));
3677 }
3678 }
3679 }
3680 }
3681 }
3682
3683 Ok(None)
3684}
3685
3686fn first_hit_key(response: &Value) -> Option<String> {
3687 response_items(response)?
3688 .first()?
3689 .get("key")?
3690 .as_str()
3691 .map(ToString::to_string)
3692}
3693
3694fn response_items(response: &Value) -> Option<&Vec<Value>> {
3695 response
3696 .get("items")
3697 .and_then(Value::as_array)
3698 .or_else(|| response.as_array())
3699}
3700
3701fn to_xpi_payload(item_json: &Value, collection_key: Option<&Value>) -> Value {
3702 const NON_FIELD_KEYS: &[&str] = &[
3703 "itemType",
3704 "creators",
3705 "tags",
3706 "collections",
3707 "attachments",
3708 "relations",
3709 "notes",
3710 "id",
3711 "key",
3712 "version",
3713 ];
3714
3715 let mut fields = serde_json::Map::new();
3716 if let Some(item) = item_json.as_object() {
3717 for (key, value) in item {
3718 if !NON_FIELD_KEYS.contains(&key.as_str()) && !value.is_null() && value != "" {
3719 fields.insert(key.clone(), value.clone());
3720 }
3721 }
3722 }
3723
3724 let mut payload = serde_json::Map::new();
3725 payload.insert(
3726 "itemType".to_string(),
3727 item_json
3728 .get("itemType")
3729 .cloned()
3730 .unwrap_or_else(|| Value::String("journalArticle".to_string())),
3731 );
3732 payload.insert("fields".to_string(), Value::Object(fields));
3733
3734 if let Some(creators) = item_json.get("creators").and_then(Value::as_array) {
3735 if !creators.is_empty() {
3736 payload.insert(
3737 "creators".to_string(),
3738 Value::Array(
3739 creators
3740 .iter()
3741 .map(|creator| {
3742 serde_json::json!({
3743 "firstName": creator.get("firstName").and_then(Value::as_str).unwrap_or(""),
3744 "lastName": creator.get("lastName").and_then(Value::as_str).unwrap_or(""),
3745 "creatorType": creator.get("creatorType").and_then(Value::as_str).unwrap_or("author"),
3746 })
3747 })
3748 .collect(),
3749 ),
3750 );
3751 }
3752 }
3753
3754 if let Some(tags) = item_json.get("tags").and_then(Value::as_array) {
3755 if !tags.is_empty() {
3756 payload.insert(
3757 "tags".to_string(),
3758 Value::Array(
3759 tags.iter()
3760 .map(|tag| tag.get("tag").cloned().unwrap_or_else(|| tag.clone()))
3761 .collect(),
3762 ),
3763 );
3764 }
3765 }
3766
3767 if let Some(collection_key) = collection_key.filter(|key| !is_library_root(key)) {
3768 payload.insert(
3769 "collections".to_string(),
3770 Value::Array(vec![collection_key.clone()]),
3771 );
3772 }
3773
3774 Value::Object(payload)
3775}
3776
3777fn item_has_pdf_attachment(client: &mut impl RpcCaller, item_key: &str) -> Result<bool, String> {
3778 let attachments = client.call(
3779 "attachments.list",
3780 Some(serde_json::json!({"parentKey": item_key})),
3781 )?;
3782 Ok(has_pdf_attachment(&attachments))
3783}
3784
3785fn attach_pdf(client: &mut impl RpcCaller, item_key: &str, path: &str) -> Result<(), String> {
3786 client.call(
3787 "attachments.add",
3788 Some(serde_json::json!({
3789 "parentKey": item_key,
3790 "path": zotero_path(path),
3791 "title": "Full Text PDF",
3792 })),
3793 )?;
3794 Ok(())
3795}
3796
3797fn zotero_path(path: &str) -> String {
3798 let path = Path::new(path)
3799 .canonicalize()
3800 .unwrap_or_else(|_| Path::new(path).to_path_buf())
3801 .to_string_lossy()
3802 .into_owned();
3803 if is_wsl() {
3804 return ProcessCommand::new("wslpath")
3805 .arg("-w")
3806 .arg(&path)
3807 .output()
3808 .ok()
3809 .filter(|output| output.status.success())
3810 .and_then(|output| String::from_utf8(output.stdout).ok())
3811 .map(|converted| converted.trim().to_string())
3812 .filter(|converted| !converted.is_empty())
3813 .unwrap_or(path);
3814 }
3815 path
3816}
3817
/// Detects whether the process runs under WSL.
///
/// True when `WSL_DISTRO_NAME` is set, or when `/proc/sys/kernel/osrelease`
/// is readable and contains "microsoft" (case-insensitive).
fn is_wsl() -> bool {
    let distro_announced = env::var_os("WSL_DISTRO_NAME").is_some();
    if distro_announced {
        return true;
    }
    match fs::read_to_string("/proc/sys/kernel/osrelease") {
        Ok(release) => release.to_ascii_lowercase().contains("microsoft"),
        Err(_) => false,
    }
}
3826
3827fn is_library_root(value: &Value) -> bool {
3828 value.as_i64() == Some(0) || value.as_u64() == Some(0)
3829}
3830
3831fn push_result(
3832 status: &str,
3833 zotero_item_key: Option<String>,
3834 pdf_attached: bool,
3835 pdf_size_bytes: u64,
3836 error: Value,
3837) -> Value {
3838 serde_json::json!({
3839 "status": status,
3840 "zotero_item_key": zotero_item_key,
3841 "pdf_attached": pdf_attached,
3842 "pdf_size_bytes": pdf_size_bytes,
3843 "error": error,
3844 })
3845}
3846
3847fn run_search(
3848 args: SearchArgs,
3849 client: &mut impl RpcCaller,
3850) -> Result<(Value, JsonStyle), String> {
3851 let SearchArgs {
3852 query, fulltext, author, after, before, journal, tag,
3853 doi, isbn, issn, collection, limit, offset, ..
3854 } = args;
3855
3856 let has_identifier = doi.is_some() || isbn.is_some() || issn.is_some();
3857 if has_identifier {
3858 let mut params = serde_json::Map::new();
3859 if let Some(doi) = doi { params.insert("doi".into(), Value::String(doi)); }
3860 if let Some(isbn) = isbn { params.insert("isbn".into(), Value::String(isbn)); }
3861 if let Some(issn) = issn { params.insert("issn".into(), Value::String(issn)); }
3862 let value = client.call("search.byIdentifier", Some(Value::Object(params)))?;
3863 return Ok((normalize_list_envelope(value, "items", None, 0), JsonStyle::Pretty));
3864 }
3865
3866 if fulltext {
3867 let query = query.ok_or("INVALID_ARGS: --fulltext requires a search query")?;
3868 let mut params = serde_json::json!({"query": query, "limit": limit});
3869 if let (Some(col), Some(map)) = (collection, params.as_object_mut()) {
3870 map.insert("collection".into(), resolve_collection(client, &col)?);
3871 }
3872 let value = client.call("search.fulltext", Some(params))?;
3873 return Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty));
3874 }
3875
3876 let has_filters = author.is_some() || after.is_some() || before.is_some()
3877 || journal.is_some() || tag.is_some();
3878 if has_filters {
3879 let mut conditions: Vec<Value> = Vec::new();
3880 if let Some(query) = &query {
3881 conditions.push(serde_json::json!({
3882 "field": "quicksearch-titleCreatorYear",
3883 "operator": "contains",
3884 "value": query,
3885 }));
3886 }
3887 if let Some(author) = author {
3888 conditions.push(serde_json::json!({
3889 "field": "creator", "operator": "contains", "value": author,
3890 }));
3891 }
3892 if let Some(after) = after {
3893 conditions.push(serde_json::json!({
3894 "field": "date", "operator": "isAfter", "value": after,
3895 }));
3896 }
3897 if let Some(before) = before {
3898 conditions.push(serde_json::json!({
3899 "field": "date", "operator": "isBefore", "value": before,
3900 }));
3901 }
3902 if let Some(journal) = journal {
3903 conditions.push(serde_json::json!({
3904 "field": "publicationTitle", "operator": "contains", "value": journal,
3905 }));
3906 }
3907 if let Some(tag) = tag {
3908 conditions.push(serde_json::json!({
3909 "field": "tag", "operator": "is", "value": tag,
3910 }));
3911 }
3912 let value = client.call(
3913 "search.advanced",
3914 Some(serde_json::json!({
3915 "conditions": conditions,
3916 "operator": "and",
3917 "limit": limit,
3918 "offset": offset,
3919 })),
3920 )?;
3921 return Ok((normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty));
3922 }
3923
3924 let query = query.ok_or(
3925 "INVALID_ARGS: provide a search query, or use --doi/--isbn/--issn for identifier lookup"
3926 )?;
3927 let value = if let Some(col) = collection {
3928 let key = resolve_collection(client, &col)?;
3929 let response = client.call(
3930 "collections.getItems",
3931 Some(serde_json::json!({"key": key})),
3932 )?;
3933 collection_quick_search_response(&response, &query, limit)
3934 } else {
3935 filter_search_artifacts(client.call(
3936 "search.quick",
3937 Some(serde_json::json!({"query": query, "limit": limit})),
3938 )?)
3939 };
3940 Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty))
3941}
3942
3943fn run_search_management_command(
3944 command: SearchManagementCommand,
3945 client: &mut impl RpcCaller,
3946) -> Result<(Value, JsonStyle), String> {
3947 match command {
3948 SearchManagementCommand::SavedSearches { .. } => Ok((
3949 normalize_list_envelope(client.call("search.savedSearches", None)?, "items", None, 0),
3950 JsonStyle::Pretty,
3951 )),
3952 SearchManagementCommand::CreateSaved {
3953 name, condition, dry_run, ..
3954 } => {
3955 let conditions = condition
3956 .iter()
3957 .map(|raw| parse_search_condition(raw))
3958 .collect::<Result<Vec<_>, _>>()?;
3959 let params = serde_json::json!({"name": name, "conditions": conditions});
3960 if dry_run {
3961 Ok((dry_run_value("search.createSavedSearch", params), JsonStyle::PythonCompact))
3962 } else {
3963 Ok((client.call("search.createSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3964 }
3965 }
3966 SearchManagementCommand::DeleteSaved {
3967 search_key, dry_run, ..
3968 } => {
3969 let params = serde_json::json!({"key": search_key});
3970 if dry_run {
3971 Ok((dry_run_value("search.deleteSavedSearch", params), JsonStyle::PythonCompact))
3972 } else {
3973 Ok((client.call("search.deleteSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3974 }
3975 }
3976 }
3977}
3978
3979fn filter_search_artifacts(mut value: Value) -> Value {
3980 let Some(items) = value.get_mut("items").and_then(Value::as_array_mut) else {
3981 return value;
3982 };
3983 items.retain(|item| match item.get("title").and_then(Value::as_str) {
3984 Some(title) => !is_zotron_evidence_artifact(title),
3985 None => true,
3986 });
3987 let total_items = items.len() as u64;
3988 if let Some(total) = value.get_mut("total") {
3989 *total = Value::from(total_items);
3990 }
3991 value
3992}
3993
3994fn collection_quick_search_response(response: &Value, query: &str, limit: u64) -> Value {
3995 let mut matched = collection_items(response)
3996 .into_iter()
3997 .filter(|item| !item_is_evidence_artifact(item))
3998 .filter(|item| quick_item_matches(item, query))
3999 .collect::<Vec<_>>();
4000 let total = matched.len() as u64;
4001 let limit = usize::try_from(limit).unwrap_or(usize::MAX);
4002 if matched.len() > limit {
4003 matched.truncate(limit);
4004 }
4005 serde_json::json!({"items": matched, "total": total})
4006}
4007
4008fn item_is_evidence_artifact(item: &Value) -> bool {
4009 item.get("title")
4010 .and_then(Value::as_str)
4011 .is_some_and(is_zotron_evidence_artifact)
4012}
4013
4014fn quick_item_matches(item: &Value, query: &str) -> bool {
4015 let terms = query
4016 .split_whitespace()
4017 .map(|term| term.to_lowercase())
4018 .filter(|term| !term.is_empty())
4019 .collect::<Vec<_>>();
4020 if terms.is_empty() {
4021 return true;
4022 }
4023 let mut haystack = String::new();
4024 append_search_text(item, &mut haystack);
4025 let haystack = haystack.to_lowercase();
4026 terms.iter().all(|term| haystack.contains(term))
4027}
4028
4029fn append_search_text(value: &Value, out: &mut String) {
4030 match value {
4031 Value::String(text) => {
4032 out.push(' ');
4033 out.push_str(text);
4034 }
4035 Value::Number(number) => {
4036 out.push(' ');
4037 out.push_str(&number.to_string());
4038 }
4039 Value::Bool(value) => {
4040 out.push(' ');
4041 out.push_str(if *value { "true" } else { "false" });
4042 }
4043 Value::Array(items) => {
4044 for item in items {
4045 append_search_text(item, out);
4046 }
4047 }
4048 Value::Object(map) => {
4049 for item in map.values() {
4050 append_search_text(item, out);
4051 }
4052 }
4053 Value::Null => {}
4054 }
4055}
4056
4057fn parse_search_condition(raw: &str) -> Result<Value, String> {
4058 let mut parts = raw.split_whitespace();
4059 let field = parts.next();
4060 let operator = parts.next();
4061 let value = parts.collect::<Vec<_>>().join(" ");
4062 match (field, operator, value.is_empty()) {
4063 (Some(field), Some(operator), false) => Ok(serde_json::json!({
4064 "field": field,
4065 "operator": operator,
4066 "value": value,
4067 })),
4068 _ => Err(format!(
4069 "INVALID_ARGS: --condition must be 'field operator value', got: {raw:?}"
4070 )),
4071 }
4072}
4073
4074fn normalize_list_envelope(value: Value, list_key: &str, limit: Option<u64>, offset: u64) -> Value {
4075 if let Value::Array(arr) = value {
4076 let total = arr.len() as u64;
4077 let mut obj = serde_json::Map::new();
4078 obj.insert(list_key.to_string(), Value::Array(arr));
4079 obj.insert("total".to_string(), Value::from(total));
4080 if let Some(limit) = limit {
4081 obj.insert("limit".to_string(), Value::from(limit));
4082 }
4083 obj.insert("offset".to_string(), Value::from(offset));
4084 obj.insert("hasMore".to_string(), Value::Bool(false));
4085 return Value::Object(obj);
4086 }
4087
4088 let mut obj = match value {
4089 Value::Object(obj) if obj.contains_key(list_key) => obj,
4090 other => return other,
4091 };
4092
4093 let items_len = obj
4094 .get(list_key)
4095 .and_then(Value::as_array)
4096 .map_or(0, |a| a.len()) as u64;
4097 let total = obj
4098 .get("total")
4099 .and_then(Value::as_u64)
4100 .unwrap_or(items_len);
4101
4102 obj.insert("total".to_string(), Value::from(total));
4103 if let Some(limit) = limit {
4104 obj.insert("limit".to_string(), Value::from(limit));
4105 }
4106 obj.insert("offset".to_string(), Value::from(offset));
4107 obj.insert(
4108 "hasMore".to_string(),
4109 Value::Bool(offset + items_len < total),
4110 );
4111
4112 Value::Object(obj)
4113}
4114
4115const RPC_PAGINATION_SAFETY_CAP: usize = 10_000;
4116const RPC_PAGE_LIST_KEYS: [&str; 4] = ["items", "tags", "results", "data"];
4117
4118fn paginate_rpc(
4119 client: &mut impl RpcCaller,
4120 method: &str,
4121 params: Value,
4122 page_size: usize,
4123) -> Result<Value, String> {
4124 let base = params
4125 .as_object()
4126 .ok_or_else(|| "params must be a JSON object".to_string())?;
4127 let mut out = Vec::new();
4128 let mut prev_page: Option<Vec<Value>> = None;
4129 let mut offset = 0usize;
4130
4131 loop {
4132 let mut page_params = base.clone();
4133 page_params.insert("offset".to_string(), Value::Number(offset.into()));
4134 page_params.insert("limit".to_string(), Value::Number(page_size.into()));
4135 let response = client.call(method, Some(Value::Object(page_params)))?;
4136
4137 let page = match extract_page(&response) {
4138 Some(page) => page,
4139 None if out.is_empty() => return Ok(response),
4140 None if response.is_object() => {
4141 return Err(format!(
4142 "paginate: {method:?} returned a non-paginated dict after {} accumulated rows; aborting",
4143 out.len()
4144 ));
4145 }
4146 None => {
4147 return Err(format!(
4148 "paginate: {method:?} returned non-list/non-dict shape after {} accumulated rows; aborting",
4149 out.len()
4150 ));
4151 }
4152 };
4153
4154 if prev_page.as_ref() == Some(&page) {
4155 return Err(format!(
4156 "paginate: {method:?} returned identical pages — method likely ignores offset; aborting after {} rows",
4157 out.len()
4158 ));
4159 }
4160
4161 let page_len = page.len();
4162 out.extend(page.clone());
4163 if page_len < page_size {
4164 return Ok(Value::Array(out));
4165 }
4166 if out.len() >= RPC_PAGINATION_SAFETY_CAP {
4167 out.truncate(RPC_PAGINATION_SAFETY_CAP);
4168 return Ok(Value::Array(out));
4169 }
4170 prev_page = Some(page);
4171 offset += page_size;
4172 }
4173}
4174
4175fn extract_page(response: &Value) -> Option<Vec<Value>> {
4176 if let Some(page) = response.as_array() {
4177 return Some(page.clone());
4178 }
4179 let object = response.as_object()?;
4180 for key in RPC_PAGE_LIST_KEYS {
4181 if let Some(page) = object.get(key).and_then(Value::as_array) {
4182 return Some(page.clone());
4183 }
4184 }
4185 None
4186}
4187
4188fn run_find_pdfs_command(
4189 client: &mut impl RpcCaller,
4190 collection: String,
4191 limit: usize,
4192) -> Result<(Value, JsonStyle), String> {
4193 let collection_key = resolve_collection(client, &collection)?;
4194 let response = client.call(
4195 "collections.getItems",
4196 Some(serde_json::json!({"key": collection_key})),
4197 )?;
4198 let items = collection_items(&response);
4199
4200 let mut missing = Vec::new();
4201 for item in &items {
4202 let Some(item_key) = item.get("key").and_then(Value::as_str) else {
4203 continue;
4204 };
4205 let attachments = client.call(
4206 "attachments.list",
4207 Some(serde_json::json!({"parentKey": item_key})),
4208 )?;
4209 if !has_pdf_attachment(&attachments) {
4210 missing.push(item.clone());
4211 }
4212 if limit > 0 && missing.len() >= limit {
4213 break;
4214 }
4215 }
4216
4217 let mut results = Vec::new();
4218 for item in &missing {
4219 let item_key = item
4220 .get("key")
4221 .and_then(Value::as_str)
4222 .ok_or_else(|| "missing item lacks key".to_string())?;
4223 let response = client.call(
4224 "attachments.findPDF",
4225 Some(serde_json::json!({"parentKey": item_key})),
4226 )?;
4227 let attachment = response.get("attachment").filter(|value| !value.is_null());
4228 results.push(serde_json::json!({
4229 "item_key": item_key,
4230 "title": item.get("title").cloned().unwrap_or(Value::Null),
4231 "found": attachment.is_some(),
4232 "attachment_key": attachment
4233 .and_then(|attachment| attachment.get("key"))
4234 .cloned()
4235 .unwrap_or(Value::Null),
4236 }));
4237 }
4238
4239 Ok((
4240 serde_json::json!({
4241 "scanned": items.len(),
4242 "attempted": missing.len(),
4243 "results": results,
4244 }),
4245 JsonStyle::Pretty,
4246 ))
4247}
4248
4249fn collection_items(response: &Value) -> Vec<Value> {
4250 if let Some(items) = response.get("items").and_then(Value::as_array) {
4251 return items.clone();
4252 }
4253 response.as_array().cloned().unwrap_or_default()
4254}
4255
4256fn has_pdf_attachment(attachments: &Value) -> bool {
4257 attachments
4258 .as_array()
4259 .is_some_and(|attachments| attachments.iter().any(is_pdf_attachment))
4260}
4261
4262fn is_pdf_attachment(attachment: &Value) -> bool {
4263 let content_type = attachment
4264 .get("contentType")
4265 .and_then(Value::as_str)
4266 .unwrap_or_default()
4267 .to_lowercase();
4268 let path = attachment
4269 .get("path")
4270 .and_then(Value::as_str)
4271 .unwrap_or_default()
4272 .to_lowercase();
4273 matches!(
4274 content_type.as_str(),
4275 "application/pdf" | "application/x-pdf"
4276 ) || path.ends_with(".pdf")
4277}
4278
4279fn call_json(
4280 client: &mut impl RpcCaller,
4281 method: &str,
4282 params: Option<Value>,
4283) -> Result<Value, String> {
4284 client.call(method, params)
4285}
4286
4287fn run_system_command(
4288 command: SystemCommand,
4289 client: &mut impl RpcCaller,
4290) -> Result<(Value, JsonStyle), String> {
4291 let value = match command {
4292 SystemCommand::Version { .. } => client.call("system.version", None)?,
4293 SystemCommand::Libraries { .. } => client.call("system.libraries", None)?,
4294 SystemCommand::LibraryStats { library, .. } => {
4295 let params = library.map(|id| serde_json::json!({"id": id}));
4296 client.call("system.libraryStats", params)?
4297 }
4298 SystemCommand::Schema { item_type, .. } => {
4299 if let Some(item_type) = item_type {
4300 let fields = client.call("system.itemFields", Some(serde_json::json!({"itemType": item_type})))?;
4301 let creators = client.call("system.creatorTypes", Some(serde_json::json!({"itemType": item_type})))?;
4302 let field_names: Vec<Value> = fields.as_array().unwrap_or(&vec![])
4303 .iter()
4304 .filter_map(|f| f.get("field").cloned())
4305 .collect();
4306 let creator_names: Vec<Value> = creators.as_array().unwrap_or(&vec![])
4307 .iter()
4308 .filter_map(|c| c.get("creatorType").cloned())
4309 .collect();
4310 serde_json::json!({
4311 "itemType": item_type,
4312 "fields": field_names,
4313 "creatorTypes": creator_names,
4314 })
4315 } else {
4316 let types = client.call("system.itemTypes", None)?;
4317 let type_names: Vec<Value> = types.as_array().unwrap_or(&vec![])
4318 .iter()
4319 .filter_map(|t| t.get("itemType").cloned())
4320 .collect();
4321 Value::Array(type_names)
4322 }
4323 }
4324 SystemCommand::CurrentCollection { .. } => client.call("system.currentCollection", None)?,
4325 SystemCommand::ListMethods { .. } => client.call("system.listMethods", None)?,
4326 SystemCommand::Describe { method, .. } => {
4327 let params = method.map(|method| serde_json::json!({"method": method}));
4328 client.call("system.describe", params)?
4329 }
4330 };
4331 Ok((value, JsonStyle::Pretty))
4332}
4333
4334fn run_items_command(
4335 command: ItemsCommand,
4336 client: &mut impl RpcCaller,
4337) -> Result<(Value, JsonStyle), String> {
4338 let (value, style) = match command {
4339 ItemsCommand::Add {
4340 doi,
4341 isbn,
4342 from_url,
4343 file,
4344 collection,
4345 dry_run,
4346 ..
4347 } => {
4348 if let Some(doi) = doi {
4349 run_add_identifier_command(client, "items.addByDOI", "doi", doi, collection, dry_run)?
4350 } else if let Some(isbn) = isbn {
4351 run_add_identifier_command(client, "items.addByISBN", "isbn", isbn, collection, dry_run)?
4352 } else if let Some(from_url) = from_url {
4353 run_add_identifier_command(client, "items.addByURL", "url", from_url, collection, dry_run)?
4354 } else if let Some(file) = file {
4355 let mut params = serde_json::json!({"path": zotero_path(&file)});
4356 maybe_insert_collection(client, &mut params, collection)?;
4357 run_mutation_command(client, "items.addFromFile", params, dry_run)?
4358 } else {
4359 return Err("INVALID_ARGS: provide one of --doi, --isbn, --from-url, or --file".into());
4360 }
4361 }
4362 ItemsCommand::Create {
4363 item_type,
4364 fields,
4365 dry_run,
4366 ..
4367 } => {
4368 let parsed_fields = parse_field_options(&fields)?;
4369 let mut params = serde_json::json!({"itemType": item_type});
4370 if !parsed_fields.is_empty() {
4371 if let Some(map) = params.as_object_mut() {
4372 map.insert("fields".to_string(), Value::Object(parsed_fields));
4373 }
4374 }
4375 run_mutation_command(client, "items.create", params, dry_run)?
4376 }
4377 ItemsCommand::Update {
4378 key,
4379 fields,
4380 dry_run,
4381 ..
4382 } => {
4383 let parsed_fields = parse_field_options(&fields)?;
4384 let mut params = serde_json::json!({"key": key});
4385 if !parsed_fields.is_empty() {
4386 if let Some(map) = params.as_object_mut() {
4387 map.insert("fields".to_string(), Value::Object(parsed_fields));
4388 }
4389 }
4390 run_mutation_command(client, "items.update", params, dry_run)?
4391 }
4392 ItemsCommand::Delete { key, dry_run, .. } => run_mutation_command(
4393 client,
4394 "items.delete",
4395 serde_json::json!({"key": key}),
4396 dry_run,
4397 )?,
4398 ItemsCommand::Trash {
4399 items, dry_run, ..
4400 } => {
4401 if items.len() == 1 {
4402 run_mutation_command(
4403 client,
4404 "items.trash",
4405 serde_json::json!({"key": items[0]}),
4406 dry_run,
4407 )?
4408 } else {
4409 run_mutation_command(
4410 client,
4411 "items.batchTrash",
4412 serde_json::json!({"keys": items}),
4413 dry_run,
4414 )?
4415 }
4416 }
4417 ItemsCommand::Restore { item, dry_run, .. } => run_mutation_command(
4418 client,
4419 "items.restore",
4420 serde_json::json!({"key": item}),
4421 dry_run,
4422 )?,
4423 ItemsCommand::MergeDuplicates { keys, dry_run, .. } => {
4424 if keys.len() < 2 {
4425 return Err("INVALID_ARGS: need at least 2 keys to merge".to_string());
4426 }
4427 run_mutation_command(
4428 client,
4429 "items.mergeDuplicates",
4430 serde_json::json!({"keys": keys}),
4431 dry_run,
4432 )?
4433 }
4434 ItemsCommand::AddRelated {
4435 key,
4436 target,
4437 dry_run,
4438 ..
4439 } => run_mutation_command(
4440 client,
4441 "items.addRelated",
4442 serde_json::json!({"key": key, "targetKey": target}),
4443 dry_run,
4444 )?,
4445 ItemsCommand::RemoveRelated {
4446 key,
4447 target,
4448 dry_run,
4449 ..
4450 } => run_mutation_command(
4451 client,
4452 "items.removeRelated",
4453 serde_json::json!({"key": key, "targetKey": target}),
4454 dry_run,
4455 )?,
4456 ItemsCommand::Get { item, .. } => (
4457 client.call("items.get", Some(serde_json::json!({"key": item})))?,
4458 JsonStyle::Pretty,
4459 ),
4460 ItemsCommand::List {
4461 limit,
4462 offset,
4463 sort,
4464 direction,
4465 trash,
4466 ..
4467 } => {
4468 if trash {
4469 let value = client.call(
4470 "items.getTrash",
4471 Some(serde_json::json!({"limit": limit, "offset": offset})),
4472 )?;
4473 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4474 } else {
4475 let mut params = serde_json::json!({
4476 "limit": limit,
4477 "offset": offset,
4478 "direction": direction,
4479 });
4480 if let (Some(sort), Some(map)) = (sort, params.as_object_mut()) {
4481 map.insert("sort".to_string(), Value::String(sort));
4482 }
4483 let value = client.call("items.list", Some(params))?;
4484 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4485 }
4486 }
4487 ItemsCommand::FindDuplicates { .. } => (
4488 client.call("items.findDuplicates", None)?,
4489 JsonStyle::Pretty,
4490 ),
4491 ItemsCommand::Recent {
4492 limit,
4493 offset,
4494 recent_type,
4495 ..
4496 } => {
4497 if recent_type != "added" && recent_type != "modified" {
4498 return Err(format!(
4499 "--type must be added or modified, got {recent_type:?}"
4500 ));
4501 }
4502 let value = client.call(
4503 "items.getRecent",
4504 Some(
4505 serde_json::json!({"limit": limit, "offset": offset, "type": recent_type}),
4506 ),
4507 )?;
4508 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4509 }
4510 ItemsCommand::Fulltext { key, .. } => (
4511 client.call("items.getFullText", Some(serde_json::json!({"key": key})))?,
4512 JsonStyle::Pretty,
4513 ),
4514 ItemsCommand::Related { key, .. } => (
4515 normalize_list_envelope(
4516 client.call("items.getRelated", Some(serde_json::json!({"key": key})))?,
4517 "items",
4518 None,
4519 0,
4520 ),
4521 JsonStyle::Pretty,
4522 ),
4523 ItemsCommand::CitationKey { key, .. } => (
4524 client.call("items.citationKey", Some(serde_json::json!({"key": key})))?,
4525 JsonStyle::Pretty,
4526 ),
4527 };
4528 Ok((value, style))
4529}
4530
4531fn run_add_identifier_command(
4532 client: &mut impl RpcCaller,
4533 method: &str,
4534 param_name: &str,
4535 param_value: String,
4536 collection: Option<String>,
4537 dry_run: bool,
4538) -> Result<(Value, JsonStyle), String> {
4539 let mut params = Value::Object(serde_json::Map::from_iter([(
4540 param_name.to_string(),
4541 Value::String(param_value),
4542 )]));
4543 maybe_insert_collection(client, &mut params, collection)?;
4544 run_mutation_command(client, method, params, dry_run)
4545}
4546
4547fn run_mutation_command(
4548 client: &mut impl RpcCaller,
4549 method: &str,
4550 params: Value,
4551 dry_run: bool,
4552) -> Result<(Value, JsonStyle), String> {
4553 let value = if dry_run {
4554 serde_json::json!({
4555 "ok": true,
4556 "dryRun": true,
4557 "wouldCall": method,
4558 "wouldCallParams": params,
4559 })
4560 } else {
4561 client.call(method, Some(params))?
4562 };
4563 Ok((value, JsonStyle::PythonCompact))
4564}
4565
4566fn parse_field_options(fields: &[String]) -> Result<serde_json::Map<String, Value>, String> {
4567 let mut parsed = serde_json::Map::new();
4568 for field in fields {
4569 let (key, value) = field
4570 .split_once('=')
4571 .ok_or_else(|| format!("INVALID_ARGS: --field must be key=value, got: {field:?}"))?;
4572 parsed.insert(key.to_string(), Value::String(value.to_string()));
4573 }
4574 Ok(parsed)
4575}
4576
4577fn maybe_insert_collection(
4578 client: &mut impl RpcCaller,
4579 params: &mut Value,
4580 collection: Option<String>,
4581) -> Result<(), String> {
4582 let Some(collection) = collection else {
4583 return Ok(());
4584 };
4585 let collection = resolve_collection(client, &collection)?;
4586 let include = match &collection {
4587 Value::Null => false,
4588 Value::Number(number) => number.as_i64() != Some(0),
4589 _ => true,
4590 };
4591 if include {
4592 params
4593 .as_object_mut()
4594 .expect("mutation params are always objects")
4595 .insert("collection".to_string(), collection);
4596 }
4597 Ok(())
4598}
4599
4600fn run_settings_command(
4601 command: SettingsCommand,
4602 client: &mut impl RpcCaller,
4603) -> Result<(Value, JsonStyle), String> {
4604 let (value, style) = match command {
4605 SettingsCommand::Get { key, .. } => (
4606 client.call("settings.get", Some(serde_json::json!({"key": key})))?,
4607 JsonStyle::Pretty,
4608 ),
4609 SettingsCommand::List { .. } => (client.call("settings.getAll", None)?, JsonStyle::Pretty),
4610 SettingsCommand::Set {
4611 pairs,
4612 file,
4613 dry_run,
4614 ..
4615 } => {
4616 if let Some(file) = file {
4617 let raw = fs::read_to_string(&file)
4619 .map_err(|err| format!("INVALID_JSON: Could not read JSON: {err}"))?;
4620 let settings: Value = serde_json::from_str(&raw)
4621 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
4622 if dry_run {
4623 (
4624 dry_run_value("settings.setAll", settings),
4625 JsonStyle::PythonCompact,
4626 )
4627 } else {
4628 (
4629 client.call("settings.setAll", Some(settings))?,
4630 JsonStyle::PythonCompact,
4631 )
4632 }
4633 } else if pairs.len() == 2 {
4634 let key = &pairs[0];
4636 let value = &pairs[1];
4637 let parsed_value = serde_json::from_str::<Value>(value)
4638 .unwrap_or(Value::String(value.clone()));
4639 let params = serde_json::json!({"key": key, "value": parsed_value});
4640 if dry_run {
4641 (
4642 dry_run_value("settings.set", params),
4643 JsonStyle::PythonCompact,
4644 )
4645 } else {
4646 (
4647 client.call("settings.set", Some(params))?,
4648 JsonStyle::PythonCompact,
4649 )
4650 }
4651 } else if pairs.len() > 2 && pairs.len() % 2 == 0 {
4652 let mut map = serde_json::Map::new();
4654 for chunk in pairs.chunks(2) {
4655 let parsed = serde_json::from_str::<Value>(&chunk[1])
4656 .unwrap_or(Value::String(chunk[1].clone()));
4657 map.insert(chunk[0].clone(), parsed);
4658 }
4659 let settings = Value::Object(map);
4660 if dry_run {
4661 (
4662 dry_run_value("settings.setAll", settings),
4663 JsonStyle::PythonCompact,
4664 )
4665 } else {
4666 (
4667 client.call("settings.setAll", Some(settings))?,
4668 JsonStyle::PythonCompact,
4669 )
4670 }
4671 } else {
4672 return Err(
4673 "INVALID_ARGS: provide key value pairs (even number of args) or --file".into(),
4674 );
4675 }
4676 }
4677 };
4678 Ok((value, style))
4679}
4680
4681fn run_tags_command(
4682 command: TagsCommand,
4683 client: &mut impl RpcCaller,
4684) -> Result<(Value, JsonStyle), String> {
4685 let (value, style) = match command {
4686 TagsCommand::List { limit, .. } => {
4687 let value = client.call("tags.list", Some(serde_json::json!({"limit": limit})))?;
4688 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
4689 }
4690 TagsCommand::Rename {
4691 old, new, dry_run, ..
4692 } => run_tag_mutation(
4693 client,
4694 "tags.rename",
4695 serde_json::json!({"oldName": old, "newName": new}),
4696 dry_run,
4697 )?,
4698 TagsCommand::Delete { tag, dry_run, .. } => run_tag_mutation(
4699 client,
4700 "tags.delete",
4701 serde_json::json!({"tag": tag}),
4702 dry_run,
4703 )?,
4704 TagsCommand::Add {
4705 keys, tags, dry_run, ..
4706 } => {
4707 if keys.len() == 1 {
4708 run_tag_mutation(
4709 client,
4710 "tags.add",
4711 serde_json::json!({"key": keys[0], "tags": tags}),
4712 dry_run,
4713 )?
4714 } else {
4715 run_tag_mutation(
4716 client,
4717 "tags.batchUpdate",
4718 serde_json::json!({"keys": keys, "add": tags}),
4719 dry_run,
4720 )?
4721 }
4722 }
4723 TagsCommand::Remove {
4724 keys, tags, dry_run, ..
4725 } => {
4726 if keys.len() == 1 {
4727 run_tag_mutation(
4728 client,
4729 "tags.remove",
4730 serde_json::json!({"key": keys[0], "tags": tags}),
4731 dry_run,
4732 )?
4733 } else {
4734 run_tag_mutation(
4735 client,
4736 "tags.batchUpdate",
4737 serde_json::json!({"keys": keys, "remove": tags}),
4738 dry_run,
4739 )?
4740 }
4741 }
4742 };
4743 Ok((value, style))
4744}
4745
4746fn run_tag_mutation(
4747 client: &mut impl RpcCaller,
4748 method: &str,
4749 params: Value,
4750 dry_run: bool,
4751) -> Result<(Value, JsonStyle), String> {
4752 if dry_run {
4753 Ok((dry_run_value(method, params), JsonStyle::PythonCompact))
4754 } else {
4755 Ok((client.call(method, Some(params))?, JsonStyle::PythonCompact))
4756 }
4757}
4758
4759fn dry_run_value(method: &str, params: Value) -> Value {
4760 serde_json::json!({
4761 "ok": true,
4762 "dryRun": true,
4763 "wouldCall": method,
4764 "wouldCallParams": params,
4765 })
4766}
4767
4768fn run_annotations_command(
4769 command: AnnotationsCommand,
4770 client: &mut impl RpcCaller,
4771) -> Result<(Value, JsonStyle), String> {
4772 let (value, style) = match command {
4773 AnnotationsCommand::List { parent, .. } => {
4774 let value = client.call(
4775 "annotations.list",
4776 Some(serde_json::json!({"parentKey": parent})),
4777 )?;
4778 let total = value
4779 .get("items")
4780 .and_then(Value::as_array)
4781 .map_or(0, |a| a.len()) as u64;
4782 (normalize_list_envelope(value, "items", Some(total), 0), JsonStyle::Pretty)
4783 }
4784 AnnotationsCommand::Create {
4785 parent,
4786 annotation_type,
4787 position,
4788 quote,
4789 page,
4790 sort_index,
4791 text,
4792 comment,
4793 color,
4794 dry_run,
4795 ..
4796 } => {
4797 let annotation_type = annotation_type.unwrap_or_else(|| "highlight".to_string());
4798 if !matches!(
4799 annotation_type.as_str(),
4800 "highlight" | "note" | "underline" | "image" | "ink"
4801 ) {
4802 return Err(format!(
4803 "INVALID_ARGS: --type must be highlight|note|underline|image|ink, got {annotation_type:?}"
4804 ));
4805 }
4806 let mut params = serde_json::Map::new();
4807 params.insert("parentKey".to_string(), Value::String(parent));
4808 params.insert("type".to_string(), Value::String(annotation_type.clone()));
4809 params.insert("color".to_string(), Value::String(color));
4810
4811 if let Some(ref quote_text) = quote {
4812 if !matches!(annotation_type.as_str(), "highlight" | "underline") {
4813 return Err(format!(
4814 "INVALID_ARGS: --quote is only valid for highlight|underline, got {annotation_type:?}"
4815 ));
4816 }
4817 params.insert("quote".to_string(), Value::String(quote_text.clone()));
4818 if let Some(page_idx) = page {
4819 params.insert(
4820 "pageIndex".to_string(),
4821 Value::Number(page_idx.into()),
4822 );
4823 }
4824 if let Some(raw) = position {
4826 let pos = serde_json::from_str::<Value>(&raw)
4827 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))?;
4828 validate_annotation_position(annotation_type.as_str(), &pos)?;
4829 params.insert("position".to_string(), pos);
4830 }
4831 } else {
4832 let position = position
4833 .ok_or_else(|| "INVALID_ARGS: --position JSON is required (or use --quote)".to_string())
4834 .and_then(|raw| {
4835 serde_json::from_str::<Value>(&raw)
4836 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))
4837 })?;
4838 validate_annotation_position(annotation_type.as_str(), &position)?;
4839 params.insert("position".to_string(), position);
4840 }
4841
4842 if let Some(sort_index) = sort_index {
4843 params.insert(
4844 "sortIndex".to_string(),
4845 parse_annotation_sort_index(sort_index)?,
4846 );
4847 }
4848 if let Some(text) = text {
4849 params.insert("text".to_string(), Value::String(text));
4850 }
4851 if let Some(comment) = comment {
4852 params.insert("comment".to_string(), Value::String(comment));
4853 }
4854 run_mutating_command(client, "annotations.create", Value::Object(params), dry_run)?
4855 }
4856 AnnotationsCommand::Delete {
4857 annotation_key,
4858 dry_run,
4859 ..
4860 } => run_mutating_command(
4861 client,
4862 "annotations.delete",
4863 serde_json::json!({"key": annotation_key}),
4864 dry_run,
4865 )?,
4866 };
4867 Ok((value, style))
4868}
4869
4870fn validate_annotation_position(annotation_type: &str, position: &Value) -> Result<(), String> {
4871 position
4872 .get("pageIndex")
4873 .and_then(Value::as_i64)
4874 .filter(|value| *value >= 0)
4875 .ok_or_else(|| {
4876 "INVALID_ARGS: --position must include a non-negative integer pageIndex".to_string()
4877 })?;
4878
4879 if annotation_type == "ink" {
4880 let has_paths = position
4881 .get("paths")
4882 .and_then(Value::as_array)
4883 .is_some_and(|paths| !paths.is_empty());
4884 if !has_paths {
4885 return Err("INVALID_ARGS: ink --position must include non-empty paths".to_string());
4886 }
4887 return Ok(());
4888 }
4889
4890 let valid_rects = position
4891 .get("rects")
4892 .and_then(Value::as_array)
4893 .is_some_and(|rects| !rects.is_empty() && rects.iter().all(is_annotation_rect));
4894 if !valid_rects {
4895 return Err(
4896 "INVALID_ARGS: --position must include non-empty rects of [x1, y1, x2, y2]".to_string(),
4897 );
4898 }
4899 Ok(())
4900}
4901
4902fn is_annotation_rect(value: &Value) -> bool {
4903 value.as_array().is_some_and(|coords| {
4904 coords.len() == 4
4905 && coords
4906 .iter()
4907 .all(|coord| coord.as_f64().is_some_and(f64::is_finite))
4908 })
4909}
4910
4911fn parse_annotation_sort_index(raw: String) -> Result<Value, String> {
4912 let parsed = serde_json::from_str::<Value>(&raw).unwrap_or_else(|_| Value::String(raw));
4913 let valid = match &parsed {
4914 Value::Number(number) => number.as_f64().is_some_and(f64::is_finite),
4915 Value::String(value) => {
4916 is_zotero_pdf_sort_index(value.trim())
4917 || (!value.trim().is_empty()
4918 && value.trim().parse::<f64>().is_ok_and(f64::is_finite))
4919 }
4920 _ => false,
4921 };
4922 if valid {
4923 Ok(parsed)
4924 } else {
4925 Err(format!(
4926 "INVALID_ARGS: --sort-index must be a finite number or numeric string, got {parsed}"
4927 ))
4928 }
4929}
4930
/// Returns `true` when `value` has the form `DDDDD|DDDDDD|DDDDD`: exactly
/// three `|`-separated groups of 5, 6 and 5 ASCII digits.
fn is_zotero_pdf_sort_index(value: &str) -> bool {
    /// `segment` is exactly `width` bytes long and consists of ASCII digits only.
    fn all_digits(segment: &str, width: usize) -> bool {
        segment.len() == width && segment.bytes().all(|byte| byte.is_ascii_digit())
    }

    let segments: Vec<&str> = value.split('|').collect();
    match segments.as_slice() {
        [page, offset, y] => all_digits(page, 5) && all_digits(offset, 6) && all_digits(y, 5),
        _ => false,
    }
}
4944
4945fn run_attachments_command(
4946 command: AttachmentsCommand,
4947 client: &mut impl RpcCaller,
4948) -> Result<(Value, JsonStyle), String> {
4949 let value = match command {
4950 AttachmentsCommand::List {
4951 parent,
4952 limit,
4953 offset,
4954 ..
4955 } => normalize_list_envelope(
4956 client.call(
4957 "attachments.list",
4958 Some(serde_json::json!({"parentKey": parent})),
4959 )?,
4960 "items",
4961 Some(limit),
4962 offset,
4963 ),
4964 AttachmentsCommand::Get { key, .. } => {
4965 client.call("attachments.get", Some(serde_json::json!({"key": key})))?
4966 }
4967 AttachmentsCommand::Fulltext { key, .. } => client.call(
4968 "attachments.getFulltext",
4969 Some(serde_json::json!({"key": key})),
4970 )?,
4971 AttachmentsCommand::Path { key, .. } => localize_attachment_path_response(
4972 client.call("attachments.getPath", Some(serde_json::json!({"key": key})))?,
4973 ),
4974 AttachmentsCommand::Add {
4975 parent,
4976 path,
4977 from_url,
4978 title,
4979 dry_run,
4980 ..
4981 } => {
4982 match (path, from_url) {
4983 (Some(p), None) => {
4984 let mut params = serde_json::json!({"parentKey": parent, "path": zotero_path(&p)});
4985 insert_optional_string(&mut params, "title", title);
4986 if dry_run {
4987 return Ok((
4988 dry_run_value("attachments.add", params),
4989 JsonStyle::PythonCompact,
4990 ));
4991 }
4992 return Ok((
4993 client.call("attachments.add", Some(params))?,
4994 JsonStyle::PythonCompact,
4995 ));
4996 }
4997 (None, Some(u)) => {
4998 let mut params = serde_json::json!({"parentKey": parent, "url": u});
4999 insert_optional_string(&mut params, "title", title);
5000 if dry_run {
5001 return Ok((
5002 dry_run_value("attachments.addByURL", params),
5003 JsonStyle::PythonCompact,
5004 ));
5005 }
5006 return Ok((
5007 client.call("attachments.addByURL", Some(params))?,
5008 JsonStyle::PythonCompact,
5009 ));
5010 }
5011 (Some(_), Some(_)) => {
5012 return Err("INVALID_ARGS: --path and --from-url are mutually exclusive".to_string());
5013 }
5014 (None, None) => {
5015 return Err("INVALID_ARGS: either --path or --from-url is required".to_string());
5016 }
5017 }
5018 }
5019 AttachmentsCommand::Delete { key, dry_run, .. } => {
5020 let params = serde_json::json!({"key": key});
5021 if dry_run {
5022 return Ok((
5023 dry_run_value("attachments.delete", params),
5024 JsonStyle::PythonCompact,
5025 ));
5026 }
5027 return Ok((
5028 client.call("attachments.delete", Some(params))?,
5029 JsonStyle::PythonCompact,
5030 ));
5031 }
5032 AttachmentsCommand::FindPdf { parent, .. } => client.call(
5033 "attachments.findPDF",
5034 Some(serde_json::json!({"parentKey": parent})),
5035 )?,
5036 };
5037 Ok((value, JsonStyle::Pretty))
5038}
5039
5040fn localize_attachment_path_response(mut value: Value) -> Value {
5041 if let Some(path) = value.get("path").and_then(Value::as_str) {
5042 let local = local_path_from_zotero_path(path);
5043 if let Some(map) = value.as_object_mut() {
5044 map.insert("path".to_string(), Value::String(local));
5045 }
5046 }
5047 value
5048}
5049
5050fn run_notes_command(
5051 command: NotesCommand,
5052 client: &mut impl RpcCaller,
5053) -> Result<(Value, JsonStyle), String> {
5054 let (value, style) = match command {
5055 NotesCommand::List {
5056 parent,
5057 limit,
5058 offset,
5059 ..
5060 } => {
5061 let value = client.call(
5062 "notes.list",
5063 Some(serde_json::json!({"parentKey": parent})),
5064 )?;
5065 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
5066 }
5067 NotesCommand::Get { note_key, .. } => {
5068 let value = client.call("notes.get", Some(serde_json::json!({"key": note_key})))?;
5069 (value, JsonStyle::Pretty)
5070 }
5071 NotesCommand::Create {
5072 parent,
5073 content,
5074 tags,
5075 dry_run,
5076 ..
5077 } => {
5078 let mut params = serde_json::Map::new();
5079 params.insert("parentKey".to_string(), Value::String(parent));
5080 params.insert("content".to_string(), Value::String(content));
5081 if !tags.is_empty() {
5082 params.insert(
5083 "tags".to_string(),
5084 Value::Array(tags.into_iter().map(Value::String).collect()),
5085 );
5086 }
5087 run_mutating_command(client, "notes.create", Value::Object(params), dry_run)?
5088 }
5089 NotesCommand::Update {
5090 note_key,
5091 content,
5092 dry_run,
5093 ..
5094 } => run_mutating_command(
5095 client,
5096 "notes.update",
5097 serde_json::json!({"key": note_key, "content": content}),
5098 dry_run,
5099 )?,
5100 NotesCommand::Delete {
5101 note_key, dry_run, ..
5102 } => {
5103 run_mutating_command(
5105 client,
5106 "items.delete",
5107 serde_json::json!({"key": note_key}),
5108 dry_run,
5109 )?
5110 }
5111 NotesCommand::Search { query, limit, .. } => {
5112 let value = client.call(
5113 "notes.search",
5114 Some(serde_json::json!({"query": query, "limit": limit})),
5115 )?;
5116 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
5117 }
5118 };
5119 Ok((value, style))
5120}
5121
5122fn run_mutating_command(
5123 client: &mut impl RpcCaller,
5124 method: &str,
5125 params: Value,
5126 dry_run: bool,
5127) -> Result<(Value, JsonStyle), String> {
5128 if dry_run {
5129 Ok((
5130 serde_json::json!({
5131 "ok": true,
5132 "dryRun": true,
5133 "wouldCall": method,
5134 "wouldCallParams": params,
5135 }),
5136 JsonStyle::PythonCompact,
5137 ))
5138 } else {
5139 client
5140 .call(method, Some(params))
5141 .map(|value| (value, JsonStyle::PythonCompact))
5142 }
5143}
5144
5145fn run_collections_command(
5146 command: CollectionsCommand,
5147 client: &mut impl RpcCaller,
5148) -> Result<(Value, JsonStyle), String> {
5149 let value = match command {
5150 CollectionsCommand::List { .. } => normalize_list_envelope(
5151 client.call("collections.list", None)?,
5152 "items",
5153 None,
5154 0,
5155 ),
5156 CollectionsCommand::Tree { .. } => client.call("collections.tree", None)?,
5157 CollectionsCommand::Get { name_or_id, .. } => {
5158 let key = resolve_collection(client, &name_or_id)?;
5159 client.call("collections.get", Some(serde_json::json!({"key": key})))?
5160 }
5161 CollectionsCommand::GetItems {
5162 name_or_id,
5163 limit,
5164 offset,
5165 ..
5166 } => {
5167 let key = resolve_collection(client, &name_or_id)?;
5168 let mut params = serde_json::json!({"key": key});
5169 if let Some(map) = params.as_object_mut() {
5170 if let Some(limit) = limit {
5171 map.insert("limit".to_string(), Value::Number(limit.into()));
5172 }
5173 if offset > 0 {
5174 map.insert("offset".to_string(), Value::Number(offset.into()));
5175 }
5176 }
5177 normalize_list_envelope(
5178 client.call("collections.getItems", Some(params))?,
5179 "items",
5180 limit,
5181 offset,
5182 )
5183 }
5184 CollectionsCommand::Stats { name_or_id, .. } => {
5185 let key = resolve_collection(client, &name_or_id)?;
5186 client.call("collections.stats", Some(serde_json::json!({"key": key})))?
5187 }
5188 CollectionsCommand::Rename {
5189 old_name,
5190 new_name,
5191 dry_run,
5192 ..
5193 } => {
5194 let key = resolve_mutable_collection(client, &old_name, "rename")?;
5195 let params = serde_json::json!({"key": key, "name": new_name});
5196 if dry_run {
5197 return Ok((
5198 dry_run_value("collections.rename", params),
5199 JsonStyle::PythonCompact,
5200 ));
5201 }
5202 return Ok((
5203 client.call("collections.rename", Some(params))?,
5204 JsonStyle::PythonCompact,
5205 ));
5206 }
5207 CollectionsCommand::Create {
5208 name,
5209 parent,
5210 dry_run,
5211 ..
5212 } => {
5213 let mut params = serde_json::json!({"name": name});
5214 if let Some(parent) = parent {
5215 let parent_key = resolve_mutable_collection(client, &parent, "use as parent")?;
5216 if let Some(map) = params.as_object_mut() {
5217 map.insert("parentKey".to_string(), parent_key);
5218 }
5219 }
5220 if dry_run {
5221 return Ok((
5222 dry_run_value("collections.create", params),
5223 JsonStyle::PythonCompact,
5224 ));
5225 }
5226 return Ok((
5227 client.call("collections.create", Some(params))?,
5228 JsonStyle::PythonCompact,
5229 ));
5230 }
5231 CollectionsCommand::Delete {
5232 name_or_id,
5233 dry_run,
5234 ..
5235 } => {
5236 let key = resolve_mutable_collection(client, &name_or_id, "delete")?;
5237 let params = serde_json::json!({"key": key});
5238 if dry_run {
5239 return Ok((
5240 dry_run_value("collections.delete", params),
5241 JsonStyle::PythonCompact,
5242 ));
5243 }
5244 return Ok((
5245 client.call("collections.delete", Some(params))?,
5246 JsonStyle::PythonCompact,
5247 ));
5248 }
5249 CollectionsCommand::AddItems {
5250 collection,
5251 item_keys,
5252 dry_run,
5253 ..
5254 } => {
5255 let key = resolve_mutable_collection(client, &collection, "add to")?;
5256 let params = serde_json::json!({"key": key, "keys": item_keys});
5257 if dry_run {
5258 return Ok((
5259 dry_run_value("collections.addItems", params),
5260 JsonStyle::PythonCompact,
5261 ));
5262 }
5263 return Ok((
5264 client.call("collections.addItems", Some(params))?,
5265 JsonStyle::PythonCompact,
5266 ));
5267 }
5268 CollectionsCommand::RemoveItems {
5269 collection,
5270 item_keys,
5271 dry_run,
5272 ..
5273 } => {
5274 let key = resolve_mutable_collection(client, &collection, "operate on")?;
5275 let params = serde_json::json!({"key": key, "keys": item_keys});
5276 if dry_run {
5277 return Ok((
5278 dry_run_value("collections.removeItems", params),
5279 JsonStyle::PythonCompact,
5280 ));
5281 }
5282 return Ok((
5283 client.call("collections.removeItems", Some(params))?,
5284 JsonStyle::PythonCompact,
5285 ));
5286 }
5287 };
5288 Ok((value, JsonStyle::Pretty))
5289}
5290
5291fn resolve_export_keys(
5292 client: &mut impl RpcCaller,
5293 mut keys: Vec<String>,
5294 collection: Option<String>,
5295) -> Result<Vec<String>, String> {
5296 if let Some(name) = collection {
5297 let col_key = resolve_collection(client, &name)?;
5298 let response = client.call(
5299 "collections.getItems",
5300 Some(serde_json::json!({"key": col_key})),
5301 )?;
5302 let items = collection_items(&response);
5303 for item in items {
5304 if let Some(key) = item.get("key").and_then(Value::as_str) {
5305 if !keys.contains(&key.to_string()) {
5306 keys.push(key.to_string());
5307 }
5308 }
5309 }
5310 }
5311 if keys.is_empty() {
5312 return Err("No item keys provided. Pass positional keys and/or --collection.".to_string());
5313 }
5314 Ok(keys)
5315}
5316
5317fn run_export(args: ExportArgs, client: &mut impl RpcCaller) -> Result<String, String> {
5318 let keys = resolve_export_keys(client, args.keys, args.collection)?;
5319 match args.format.as_str() {
5320 "bibtex" => run_export_content_command(client, "export.bibtex", keys),
5321 "ris" => run_export_content_command(client, "export.ris", keys),
5322 "csl-json" => {
5323 let response =
5324 client.call("export.cslJson", Some(serde_json::json!({"keys": keys})))?;
5325 if let Some(content) = response.get("content") {
5326 format_json(content, JsonStyle::Pretty)
5327 } else {
5328 format_json(&response, JsonStyle::PythonCompact)
5329 }
5330 }
5331 "bibliography" => {
5332 let response = client.call(
5333 "export.bibliography",
5334 Some(serde_json::json!({"keys": keys, "style": args.style})),
5335 )?;
5336 if let Some(object) = response.as_object() {
5337 let field = if args.html { "html" } else { "text" };
5338 if object.contains_key("html") || object.contains_key("text") {
5339 return raw_value_output(
5340 object.get(field).unwrap_or(&Value::String(String::new())),
5341 );
5342 }
5343 }
5344 format_json(&response, JsonStyle::PythonCompact)
5345 }
5346 other => Err(format!(
5347 "INVALID_ARGS: unknown format {other:?}, expected bibtex/ris/csl-json/bibliography"
5348 )),
5349 }
5350}
5351
5352fn run_export_content_command(
5353 client: &mut impl RpcCaller,
5354 method: &str,
5355 keys: Vec<String>,
5356) -> Result<String, String> {
5357 let response = client.call(method, Some(serde_json::json!({"keys": keys})))?;
5358 if let Some(content) = response.get("content") {
5359 raw_value_output(content)
5360 } else {
5361 format_json(&response, JsonStyle::PythonCompact)
5362 }
5363}
5364
5365fn raw_value_output(value: &Value) -> Result<String, String> {
5366 let mut out = match value {
5367 Value::Null => String::new(),
5368 Value::String(content) => content.clone(),
5369 other => to_python_repr(other),
5370 };
5371 out.push('\n');
5372 Ok(out)
5373}
5374
5375fn to_python_repr(value: &Value) -> String {
5376 match value {
5377 Value::Null => "None".to_string(),
5378 Value::Bool(value) => {
5379 if *value {
5380 "True".to_string()
5381 } else {
5382 "False".to_string()
5383 }
5384 }
5385 Value::Number(value) => value.to_string(),
5386 Value::String(value) => format!("'{}'", value.replace('\\', "\\\\").replace('\'', "\\'")),
5387 Value::Array(values) => {
5388 let inner = values
5389 .iter()
5390 .map(to_python_repr)
5391 .collect::<Vec<_>>()
5392 .join(", ");
5393 format!("[{inner}]")
5394 }
5395 Value::Object(entries) => {
5396 let inner = entries
5397 .iter()
5398 .map(|(key, value)| {
5399 format!("'{}': {}", key.replace('\'', "\\'"), to_python_repr(value))
5400 })
5401 .collect::<Vec<_>>()
5402 .join(", ");
5403 format!("{{{inner}}}")
5404 }
5405 }
5406}
5407
5408fn resolve_collection(client: &mut impl RpcCaller, name_or_id: &str) -> Result<Value, String> {
5409 let trimmed = name_or_id.trim();
5410 if let Ok(id) = trimmed.parse::<i64>() {
5411 return Ok(Value::Number(id.into()));
5412 }
5413
5414 let collections = client.call("collections.list", None)?;
5415 let items = collections
5416 .get("items")
5417 .and_then(Value::as_array)
5418 .or_else(|| collections.as_array())
5419 .ok_or_else(|| "collections.list returned non-array result".to_string())?;
5420
5421 if let Some(collection) = items
5422 .iter()
5423 .find(|collection| collection.get("key").and_then(Value::as_str) == Some(trimmed))
5424 {
5425 return collection_key(collection);
5426 }
5427
5428 let exact = items
5429 .iter()
5430 .filter(|collection| collection.get("name").and_then(Value::as_str) == Some(trimmed))
5431 .collect::<Vec<_>>();
5432 if exact.len() == 1 {
5433 return collection_key(exact[0]);
5434 }
5435
5436 let needle = normalize_collection_name(trimmed);
5437 let fuzzy = items
5438 .iter()
5439 .filter(|collection| {
5440 collection
5441 .get("name")
5442 .and_then(Value::as_str)
5443 .map(normalize_collection_name)
5444 .is_some_and(|name| name.contains(&needle))
5445 })
5446 .collect::<Vec<_>>();
5447
5448 match fuzzy.len() {
5449 1 => collection_key(fuzzy[0]),
5450 0 => Err(format!(
5451 "COLLECTION_NOT_FOUND: No collection named {trimmed:?}"
5452 )),
5453 _ => Err(format!(
5454 "COLLECTION_AMBIGUOUS: Multiple collections match {trimmed:?}"
5455 )),
5456 }
5457}
5458
5459fn collection_key(collection: &Value) -> Result<Value, String> {
5460 collection
5461 .get("key")
5462 .cloned()
5463 .ok_or_else(|| "collection result is missing key".to_string())
5464}
5465
5466fn resolve_mutable_collection(
5467 client: &mut impl RpcCaller,
5468 name_or_id: &str,
5469 operation: &str,
5470) -> Result<Value, String> {
5471 let key = resolve_collection(client, name_or_id)?;
5472 if key.as_i64() == Some(0) {
5473 return Err(format!(
5474 "COLLECTION_NOT_FOUND: {name_or_id:?} resolved to library root (cannot {operation})"
5475 ));
5476 }
5477 Ok(key)
5478}
5479
5480fn insert_optional_string(value: &mut Value, key: &str, maybe: Option<String>) {
5481 if let (Some(map), Some(content)) = (value.as_object_mut(), maybe) {
5482 map.insert(key.to_string(), Value::String(content));
5483 }
5484}
5485
/// Normalizes a collection name for comparison: leading/trailing whitespace
/// is dropped, inner whitespace runs collapse to one space, and the result is
/// lowercased.
fn normalize_collection_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
5492
5493fn format_json(value: &Value, style: JsonStyle) -> Result<String, String> {
5494 let mut out = match style {
5495 JsonStyle::PythonCompact => to_python_compact_json(value),
5496 JsonStyle::Pretty => serde_json::to_string_pretty(value).map_err(|err| err.to_string())?,
5497 };
5498 out.push('\n');
5499 Ok(out)
5500}
5501
5502fn to_python_compact_json(value: &Value) -> String {
5503 match value {
5504 Value::Null => "null".to_string(),
5505 Value::Bool(value) => value.to_string(),
5506 Value::Number(value) => value.to_string(),
5507 Value::String(value) => {
5508 serde_json::to_string(value).expect("string serialization cannot fail")
5509 }
5510 Value::Array(values) => {
5511 let inner = values
5512 .iter()
5513 .map(to_python_compact_json)
5514 .collect::<Vec<_>>()
5515 .join(", ");
5516 format!("[{inner}]")
5517 }
5518 Value::Object(entries) => {
5519 let inner = entries
5520 .iter()
5521 .map(|(key, value)| {
5522 let key = serde_json::to_string(key).expect("string serialization cannot fail");
5523 format!("{key}: {}", to_python_compact_json(value))
5524 })
5525 .collect::<Vec<_>>()
5526 .join(", ");
5527 format!("{{{inner}}}")
5528 }
5529 }
5530}