1use std::{
4 env, fs,
5 io::{self, Read},
6 path::{Path, PathBuf},
7 process::Command as ProcessCommand,
8 thread,
9 time::{Duration, Instant, SystemTime, UNIX_EPOCH},
10};
11
12use clap::{error::ErrorKind, Parser, Subcommand};
13use serde_json::Value;
14use zotron_rpc::{StdProviderCommandRunner, UreqProviderHttpTransport, ZoteroRpc};
15use zotron_types::{
16 bm25_score_chunks, build_embedding_provider_request, build_ocr_provider_request,
17 builtin_ocr_provider_specs, cosine_similarity, execute_embedding_provider_request,
18 is_zotron_evidence_artifact, machine_artifact_exists_for_item,
19 machine_artifact_exists_in_sidecar, machine_artifact_store_root,
20 ocr_provider_spec as raw_ocr_provider_spec, parse_embedding_provider_response,
21 parse_ocr_provider_response, read_machine_artifact_sidecar, rrf_merge,
22 write_machine_artifact_sidecar, ArtifactStorePlatform, EmbeddingChunkInput,
23 EmbeddingRequestInput, EmbeddingVector, MachineArtifactKind, OcrRequestInput,
24 ProviderCommandRunner, ProviderHttpInvocation, ProviderHttpTransport, StructureChunk,
25 DEFAULT_RPC_URL,
26};
27
/// Minimal RPC transport abstraction used by the CLI command runners.
///
/// `run_with_client` accepts any implementor, so callers can substitute
/// their own client instead of the real `ZoteroRpc`.
pub trait RpcCaller {
    /// Invokes `method` with optional JSON `params`.
    ///
    /// Returns the JSON result on success, or an error message as a `String`.
    fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String>;
}
31
/// Serializable, CLI-facing description of one OCR provider.
///
/// Built from `zotron_types::OcrProviderSpec` by `cli_ocr_provider_spec` and
/// emitted as JSON by the `ocr providers` subcommand.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliOcrProviderSpec {
    /// Provider identifier; populated from the upstream `provider_key`.
    pub id: &'static str,
    /// Also populated from the upstream `provider_key` (same value as `id`).
    pub provider: &'static str,
    /// String form of the upstream request style (`request_style.as_str()`).
    pub request_style: &'static str,
    /// Upstream `auth` value, copied through unchanged.
    pub auth: &'static str,
    /// Upstream `auth_header` value, copied through unchanged.
    pub auth_header: &'static str,
    /// Upstream `supports_pdf_direct` flag, copied through unchanged.
    pub supports_pdf_direct: bool,
    /// Upstream `key_field` value, copied through unchanged.
    pub key_field: &'static str,
}
42
/// Serializable, CLI-facing description of one embedding provider.
///
/// Built by `embedding_provider_spec` from the `zotron_types` spec.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct CliEmbeddingProviderSpec {
    /// Upstream spec `id`.
    pub id: &'static str,
    /// Upstream `provider_key`.
    pub provider: &'static str,
    /// Request style name; reported as `"dashscope"` when the provider key
    /// is `"alibaba"`, otherwise the upstream `request_style.as_str()`.
    pub request_style: &'static str,
    /// Upstream default URL, or an empty string when the spec has none.
    pub default_url: String,
    /// Upstream `default_model`.
    pub default_model: &'static str,
    /// Upstream `auth` value.
    pub auth: &'static str,
    /// Upstream `key_field` value.
    pub key_field: &'static str,
}
53
54pub fn ocr_provider_specs() -> Vec<CliOcrProviderSpec> {
55 builtin_ocr_provider_specs()
56 .into_iter()
57 .map(cli_ocr_provider_spec)
58 .collect()
59}
60
61pub fn ocr_provider_spec(provider: &str) -> Result<CliOcrProviderSpec, String> {
62 zotron_types::ocr_provider_spec(provider).map(cli_ocr_provider_spec)
63}
64
65pub fn embedding_provider_spec(provider: &str) -> Result<CliEmbeddingProviderSpec, String> {
66 let spec = zotron_types::embedding_provider_spec(provider)?;
67 Ok(CliEmbeddingProviderSpec {
68 id: spec.id,
69 provider: spec.provider_key,
70 request_style: if spec.provider_key == "alibaba" {
71 "dashscope"
72 } else {
73 spec.request_style.as_str()
74 },
75 default_url: spec.default_url.unwrap_or("").to_string(),
76 default_model: spec.default_model,
77 auth: spec.auth,
78 key_field: spec.key_field,
79 })
80}
81
82pub fn chunks_from_blocks(blocks: &[Value], max_chars: usize) -> Result<Vec<Value>, String> {
83 let typed = blocks
84 .iter()
85 .map(json_block_to_pdf_block)
86 .collect::<Result<Vec<_>, _>>()?;
87 let chunks = zotron_types::chunks_from_blocks(&typed, max_chars);
88 chunks
89 .into_iter()
90 .map(|chunk| chunk_to_cli_value(&chunk, &typed))
91 .collect()
92}
93
94fn cli_ocr_provider_spec(spec: zotron_types::OcrProviderSpec) -> CliOcrProviderSpec {
95 CliOcrProviderSpec {
96 id: spec.provider_key,
97 provider: spec.provider_key,
98 request_style: spec.request_style.as_str(),
99 auth: spec.auth,
100 auth_header: spec.auth_header,
101 supports_pdf_direct: spec.supports_pdf_direct,
102 key_field: spec.key_field,
103 }
104}
105
106fn json_block_to_pdf_block(value: &Value) -> Result<zotron_types::PdfEvidenceBlock, String> {
107 let block_key = value
108 .get("block_key")
109 .and_then(Value::as_str)
110 .ok_or_else(|| "block missing block_key".to_string())?
111 .to_string();
112 let item_key = value
113 .get("item_key")
114 .and_then(Value::as_str)
115 .ok_or_else(|| "block missing item_key".to_string())?
116 .to_string();
117 let attachment_key = value
118 .get("attachment_key")
119 .and_then(Value::as_str)
120 .ok_or_else(|| "block missing attachment_key".to_string())?
121 .to_string();
122 let page_idx = value
123 .get("page_idx")
124 .or_else(|| value.get("page"))
125 .and_then(Value::as_u64)
126 .unwrap_or(1);
127 let block_type = value
128 .get("type")
129 .or_else(|| value.get("block_type"))
130 .and_then(Value::as_str)
131 .unwrap_or("paragraph")
132 .to_string();
133 let section_path = value
134 .get("section_path")
135 .and_then(Value::as_array)
136 .map(|items| {
137 items
138 .iter()
139 .filter_map(Value::as_str)
140 .map(ToString::to_string)
141 .collect::<Vec<_>>()
142 })
143 .unwrap_or_default();
144 let text = value
145 .get("text")
146 .and_then(Value::as_str)
147 .unwrap_or("")
148 .to_string();
149 let bbox = value.get("bbox").and_then(value_bbox4);
150
151 Ok(zotron_types::PdfEvidenceBlock {
152 block_key,
153 item_key,
154 attachment_key,
155 page_idx,
156 block_type,
157 bbox,
158 section_path,
159 text,
160 })
161}
162
163fn chunk_to_cli_value(
164 chunk: &zotron_types::StructureChunk,
165 blocks: &[zotron_types::PdfEvidenceBlock],
166) -> Result<Value, String> {
167 let refs = chunk
168 .block_keys
169 .iter()
170 .filter_map(|key| blocks.iter().find(|block| &block.block_key == key))
171 .map(|block| {
172 serde_json::json!({
173 "block_key": block.block_key,
174 "page_idx": block.page_idx,
175 "bbox": block.bbox.map(|bbox| bbox.iter().map(|n| {
176 if n.fract() == 0.0 {
177 Value::from(*n as i64)
178 } else {
179 Value::from(*n)
180 }
181 }).collect::<Vec<_>>()),
182 })
183 })
184 .collect::<Vec<_>>();
185 Ok(serde_json::json!({
186 "chunk_key": chunk.chunk_key,
187 "item_key": chunk.item_key,
188 "attachment_key": chunk.attachment_key,
189 "block_keys": chunk.block_keys,
190 "section_path": chunk.section_path,
191 "text": chunk.text,
192 "page_start": chunk.page_start,
193 "page_end": chunk.page_end,
194 "evidence_refs": refs,
195 }))
196}
197
198fn value_bbox4(value: &Value) -> Option<[f64; 4]> {
199 let arr = value.as_array()?;
200 if arr.len() != 4 {
201 return None;
202 }
203 Some([
204 arr[0].as_f64()?,
205 arr[1].as_f64()?,
206 arr[2].as_f64()?,
207 arr[3].as_f64()?,
208 ])
209}
210
/// Adapts the real `ZoteroRpc` client to the `RpcCaller` trait by
/// stringifying its error type.
impl RpcCaller for ZoteroRpc {
    fn call(&mut self, method: &str, params: Option<Value>) -> Result<Value, String> {
        // NOTE(review): `self.call` presumably resolves to an inherent
        // `ZoteroRpc::call` (inherent methods take precedence over trait
        // methods); if no such method exists this would recurse — confirm.
        self.call(method, params).map_err(|err| err.to_string())
    }
}
216
// Root clap parser for the `zotron` binary; all behaviour lives in the
// `Command` subcommand tree.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Parser)]
#[command(name = "zotron", about = "Rust client + CLI for the Zotron XPI")]
struct Cli {
    #[command(subcommand)]
    command: Command,
}
223
// Subcommands under `ocr`; dispatched by `run_ocr_command`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum OcrCommand {
    // Prints the built-in OCR provider specs as JSON; needs no RPC URL.
    Providers,
    // Sends one OCR request to a provider. Exactly one of `--input` /
    // `--file` must be given (validated in `run_ocr_run_command`).
    #[command(name = "run")]
    Run {
        #[arg(long)]
        provider: String,
        #[arg(long)]
        input: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long = "item-key")]
        item_key: Option<String>,
        #[arg(long = "attachment-key")]
        attachment_key: Option<String>,
        #[arg(long = "mime-type")]
        mime_type: Option<String>,
        // Overrides the provider's default request URL when set.
        #[arg(long)]
        endpoint: Option<String>,
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    Status {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "process")]
    Process {
        #[arg(long, default_value = "mineru")]
        provider: String,
        #[arg(long)]
        parent: String,
        #[arg(long)]
        attachment: Option<String>,
        #[arg(long = "source-url")]
        source_url: Option<String>,
        #[arg(long = "result-dir")]
        result_dir: Option<String>,
        #[arg(long = "result-zip")]
        result_zip: Option<String>,
        #[arg(long = "provider-endpoint")]
        provider_endpoint: Option<String>,
        #[arg(long = "api-key-env", default_value = "ZOTRON_MINERU_API_KEY")]
        api_key_env: String,
        #[arg(long = "poll-interval-seconds", default_value_t = 5)]
        poll_interval_seconds: u64,
        #[arg(long = "timeout-seconds", default_value_t = 900)]
        timeout_seconds: u64,
        #[arg(long = "chunk-chars", default_value_t = 1200)]
        chunk_chars: usize,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
298
299#[derive(Debug, Subcommand)]
300enum Command {
301 Ping {
303 #[arg(long, default_value = DEFAULT_RPC_URL)]
304 url: String,
305 },
306 Rpc {
308 method: String,
309 #[arg(default_value = "{}")]
310 params_json: String,
311 #[arg(long, default_value = DEFAULT_RPC_URL)]
312 url: String,
313 #[arg(long)]
314 paginate: bool,
315 #[arg(long, default_value_t = 100)]
316 page_size: usize,
317 },
318 Push {
320 json_file: String,
322 #[arg(long)]
324 pdf: Option<String>,
325 #[arg(long)]
327 collection: Option<String>,
328 #[arg(long = "on-duplicate", default_value = "skip")]
330 on_duplicate: String,
331 #[arg(long, default_value = DEFAULT_RPC_URL)]
332 url: String,
333 #[arg(long = "dry-run")]
335 dry_run: bool,
336 },
337 System {
339 #[command(subcommand)]
340 command: SystemCommand,
341 },
342 Search(SearchArgs),
344 Items {
346 #[command(subcommand)]
347 command: ItemsCommand,
348 },
349 Collections {
351 #[command(subcommand)]
352 command: CollectionsCommand,
353 },
354 Notes {
356 #[command(subcommand)]
357 command: NotesCommand,
358 },
359 Attachments {
361 #[command(subcommand)]
362 command: AttachmentsCommand,
363 },
364 Settings {
366 #[command(subcommand)]
367 command: SettingsCommand,
368 },
369 Tags {
371 #[command(subcommand)]
372 command: TagsCommand,
373 },
374 Export(ExportArgs),
376 Annotations {
378 #[command(subcommand)]
379 command: AnnotationsCommand,
380 },
381 Ocr {
383 #[command(subcommand)]
384 command: OcrCommand,
385 },
386 Rag {
388 #[command(subcommand)]
389 command: RagCommand,
390 },
391 #[command(name = "find-pdfs")]
393 FindPdfs {
394 #[arg(long)]
395 collection: String,
396 #[arg(long, default_value_t = 0)]
397 limit: usize,
398 #[arg(long, default_value = DEFAULT_RPC_URL)]
399 url: String,
400 },
401}
402
/// Plain options bundle for a RAG search.
///
/// The fields mirror those of `RagCommand::Search`, minus `url`.
struct RagSearchOptions {
    query: String,
    collection: Option<String>,
    keys: Vec<String>,
    zotero: bool,
    top_spans_per_item: u64,
    include_fulltext_spans: bool,
    top_k: u64,
    /// Output format name; the CLI restricts it to `"json"` or `"jsonl"`.
    output: String,
}
413
// Subcommands under `rag`.
//
// `Providers` and `Embed` carry no `url`; `rag_command_url` falls back to
// `DEFAULT_RPC_URL` for them.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum RagCommand {
    #[command(name = "providers")]
    Providers,
    #[command(name = "embed")]
    Embed {
        #[arg(long)]
        provider: String,
        #[arg(long)]
        input: String,
        #[arg(long)]
        endpoint: Option<String>,
        #[arg(long)]
        model: Option<String>,
        #[arg(long = "input-type")]
        input_type: Option<String>,
        #[arg(long = "api-key-env")]
        api_key_env: Option<String>,
    },
    Status {
        #[arg(long)]
        collection: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "search")]
    Search {
        query: String,
        #[arg(long)]
        collection: Option<String>,
        // `--key`, also accepted as `--keys`.
        #[arg(long = "key", alias = "keys")]
        keys: Vec<String>,
        #[arg(long)]
        zotero: bool,
        #[arg(long = "top-spans-per-item", default_value_t = 3)]
        top_spans_per_item: u64,
        #[arg(long = "include-fulltext-spans")]
        include_fulltext_spans: bool,
        // `--limit`, also accepted as `--top-k`.
        #[arg(long = "limit", alias = "top-k", default_value_t = 50)]
        top_k: u64,
        #[arg(long, default_value = "json", value_parser = ["json", "jsonl"])]
        output: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
470
// Subcommands under `system`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum SystemCommand {
    Version {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Libraries {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "library-stats")]
    LibraryStats {
        #[arg(long)]
        library: Option<i64>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Schema {
        // Exposed on the command line as `--type`.
        #[arg(long = "type")]
        item_type: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "current-collection")]
    CurrentCollection {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "list-methods")]
    ListMethods {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Describe {
        method: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
517
// Arguments of the `search` command.
//
// With no `management` subcommand, `command_url` uses this struct's `url`;
// otherwise it uses the `url` of the chosen subcommand.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, clap::Args)]
struct SearchArgs {
    query: Option<String>,
    #[arg(long)]
    fulltext: bool,
    #[arg(long)]
    author: Option<String>,
    #[arg(long)]
    after: Option<String>,
    #[arg(long)]
    before: Option<String>,
    #[arg(long)]
    journal: Option<String>,
    #[arg(long)]
    tag: Option<String>,
    #[arg(long)]
    doi: Option<String>,
    #[arg(long)]
    isbn: Option<String>,
    #[arg(long)]
    issn: Option<String>,
    #[arg(long)]
    collection: Option<String>,
    #[arg(long, default_value_t = 50)]
    limit: u64,
    #[arg(long, default_value_t = 0)]
    offset: u64,
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
    #[command(subcommand)]
    management: Option<SearchManagementCommand>,
}
561
// Optional subcommands nested under `search` for saved searches.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum SearchManagementCommand {
    #[command(name = "saved-searches")]
    SavedSearches {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "create-saved")]
    CreateSaved {
        name: String,
        // At least one `--condition` is required.
        #[arg(long = "condition", required = true)]
        condition: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "delete-saved")]
    DeleteSaved {
        search_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
591
// Subcommands under `items`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum ItemsCommand {
    Add {
        #[arg(long)]
        doi: Option<String>,
        #[arg(long)]
        isbn: Option<String>,
        #[arg(long = "from-url")]
        from_url: Option<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        collection: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Create {
        // Exposed on the command line as `--type`.
        #[arg(long = "type")]
        item_type: String,
        // Repeatable `--field` option.
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Update {
        key: String,
        // Repeatable `--field` option.
        #[arg(long = "field")]
        fields: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Trash {
        items: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Restore {
        item: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "merge-duplicates")]
    MergeDuplicates {
        keys: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "add-related")]
    AddRelated {
        key: String,
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "remove-related")]
    RemoveRelated {
        key: String,
        #[arg(long)]
        target: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        item: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    List {
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long)]
        sort: Option<String>,
        #[arg(long, default_value = "asc")]
        direction: String,
        #[arg(long)]
        trash: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "find-duplicates")]
    FindDuplicates {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Recent {
        #[arg(long, default_value_t = 20)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        // Exposed on the command line as `--type`; defaults to "added".
        #[arg(long = "type", default_value = "added")]
        recent_type: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Fulltext {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Related {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    #[command(name = "citation-key")]
    CitationKey {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
748
// Subcommands under `settings`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum SettingsCommand {
    Get {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Also reachable through the visible alias `get-all`.
    #[command(visible_alias = "get-all")]
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Set {
        pairs: Vec<String>,
        #[arg(long)]
        file: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
776
// Subcommands under `tags`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum TagsCommand {
    List {
        #[arg(long, default_value_t = 200)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Rename {
        old: String,
        new: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        tag: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Add {
        keys: Vec<String>,
        // At least one `--tag` is required.
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Remove {
        keys: Vec<String>,
        // At least one `--tag` is required.
        #[arg(long = "tag", required = true)]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
824
// Arguments of the `export` command.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, clap::Args)]
struct ExportArgs {
    keys: Vec<String>,
    #[arg(long, default_value = "bibtex")]
    format: String,
    #[arg(long)]
    collection: Option<String>,
    #[arg(long, default_value = "http://www.zotero.org/styles/apa")]
    style: String,
    #[arg(long)]
    html: bool,
    #[arg(long, default_value = DEFAULT_RPC_URL)]
    url: String,
}
844
// Subcommands under `annotations`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum AnnotationsCommand {
    List {
        parent: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Create {
        parent: String,
        // Exposed on the command line as `--type`.
        #[arg(long = "type")]
        annotation_type: Option<String>,
        #[arg(long)]
        position: Option<String>,
        #[arg(long)]
        quote: Option<String>,
        #[arg(long)]
        page: Option<u32>,
        #[arg(long = "sort-index")]
        sort_index: Option<String>,
        #[arg(long)]
        text: Option<String>,
        #[arg(long)]
        comment: Option<String>,
        #[arg(long, default_value = "#ffd400")]
        color: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        annotation_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
893
// Subcommands under `attachments`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum AttachmentsCommand {
    List {
        #[arg(long)]
        parent: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Fulltext {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Path {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Add {
        #[arg(long)]
        parent: String,
        #[arg(long)]
        path: Option<String>,
        #[arg(long = "from-url")]
        from_url: Option<String>,
        #[arg(long)]
        title: Option<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    #[command(name = "find-pdf")]
    FindPdf {
        #[arg(long)]
        parent: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
959
// Subcommands under `notes`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum NotesCommand {
    List {
        #[arg(long)]
        parent: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        note_key: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Create {
        #[arg(long)]
        parent: String,
        #[arg(long)]
        content: String,
        // Repeatable `--tag` option.
        #[arg(long = "tag")]
        tags: Vec<String>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Update {
        note_key: String,
        #[arg(long)]
        content: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Delete {
        note_key: String,
        #[arg(long)]
        dry_run: bool,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Search {
        query: String,
        #[arg(long, default_value_t = 50)]
        limit: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
}
1019
// Subcommands under `collections`; every variant carries the RPC `url`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, which would change the CLI output.
#[derive(Debug, Subcommand)]
enum CollectionsCommand {
    List {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Tree {
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Get {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    // Named `get-items`, also reachable through the visible alias `items`.
    #[command(name = "get-items", visible_alias = "items")]
    GetItems {
        name_or_id: String,
        #[arg(long)]
        limit: Option<u64>,
        #[arg(long, default_value_t = 0)]
        offset: u64,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Stats {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
    },
    Rename {
        old_name: String,
        new_name: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    Create {
        name: String,
        #[arg(long)]
        parent: Option<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    Delete {
        name_or_id: String,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    #[command(name = "add-items")]
    AddItems {
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
    #[command(name = "remove-items")]
    RemoveItems {
        collection: String,
        item_keys: Vec<String>,
        #[arg(long, default_value = DEFAULT_RPC_URL)]
        url: String,
        #[arg(long)]
        dry_run: bool,
    },
}
1103
/// Output style selector passed to `format_json`.
///
/// NOTE(review): the exact formatting of each style is decided by
/// `format_json`, which is not visible here — confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum JsonStyle {
    /// Used by `run_ocr_command` for regular command results.
    PythonCompact,
    /// Used by `run_ocr_command` for the provider listing.
    Pretty,
}
1111
/// Successful result of `parse_cli`.
enum ParseOutcome<T> {
    /// Arguments parsed into a command that should be executed.
    Command(T),
    /// clap asked to display help or version text; the payload is that text.
    Display(String),
}
1116
1117fn parse_cli<T>(
1118 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1119) -> Result<ParseOutcome<T>, String>
1120where
1121 T: Parser,
1122{
1123 match T::try_parse_from(args) {
1124 Ok(cli) => Ok(ParseOutcome::Command(cli)),
1125 Err(err)
1126 if matches!(
1127 err.kind(),
1128 ErrorKind::DisplayHelp | ErrorKind::DisplayVersion
1129 ) =>
1130 {
1131 Ok(ParseOutcome::Display(err.to_string()))
1132 }
1133 Err(err) => Err(err.to_string()),
1134 }
1135}
1136
1137pub fn format_error_json(message: &str) -> String {
1138 let message = message.trim_end();
1139 let (code, message) = split_error_code(message).unwrap_or(("RUNTIME_ERROR", message));
1140 serde_json::json!({"error": {"code": code, "message": message}}).to_string()
1141}
1142
/// Splits `"CODE: text"` into `("CODE", "text")`.
///
/// The part before the first `:` counts as a code only when it is non-empty
/// and consists solely of ASCII uppercase letters, ASCII digits, and `_`.
/// Leading whitespace of the remainder is stripped. Returns `None` when
/// there is no colon or the prefix does not look like a code.
fn split_error_code(message: &str) -> Option<(&str, &str)> {
    let (prefix, remainder) = message.split_once(':')?;
    if prefix.is_empty() {
        return None;
    }
    // Every allowed character is ASCII, so a byte-wise check is equivalent
    // to a char-wise one: any non-ASCII byte fails the test.
    let looks_like_code = prefix
        .bytes()
        .all(|byte| byte.is_ascii_uppercase() || byte.is_ascii_digit() || byte == b'_');
    if looks_like_code {
        Some((prefix, remainder.trim_start()))
    } else {
        None
    }
}
1155
1156pub fn run(
1157 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1158) -> Result<String, String> {
1159 if std::env::var("CLAUDECODE").as_deref() == Ok("1") {
1161 eprintln!(r#"<claude-code-hint v="1" type="plugin" value="zotron@dianzuan/zotron" />"#);
1162 }
1163
1164 let cli = match parse_cli::<Cli>(args)? {
1165 ParseOutcome::Command(cli) => cli,
1166 ParseOutcome::Display(output) => return Ok(output),
1167 };
1168 let url = command_url(&cli.command);
1169 let mut client = ZoteroRpc::new(url);
1170 run_command(cli.command, &mut client)
1171}
1172
1173pub fn run_with_client(
1174 args: impl IntoIterator<Item = impl Into<std::ffi::OsString> + Clone>,
1175 client: &mut impl RpcCaller,
1176) -> Result<String, String> {
1177 let cli = match parse_cli::<Cli>(args)? {
1178 ParseOutcome::Command(cli) => cli,
1179 ParseOutcome::Display(output) => return Ok(output),
1180 };
1181 run_command(cli.command, client)
1182}
1183
1184fn rag_command_url(command: &RagCommand) -> String {
1185 match command {
1186 RagCommand::Providers => DEFAULT_RPC_URL.to_string(),
1187 RagCommand::Embed { .. } => DEFAULT_RPC_URL.to_string(),
1188 RagCommand::Status { url, .. } => url.clone(),
1189 RagCommand::Search { url, .. } => url.clone(),
1190 }
1191}
1192
1193fn command_url(command: &Command) -> String {
1194 match command {
1195 Command::Ping { url }
1196 | Command::Rpc { url, .. }
1197 | Command::Push { url, .. }
1198 | Command::FindPdfs { url, .. } => url.clone(),
1199 Command::Ocr { command } => match command {
1200 OcrCommand::Providers => DEFAULT_RPC_URL.to_string(),
1201 OcrCommand::Run { .. } => DEFAULT_RPC_URL.to_string(),
1202 OcrCommand::Status { url, .. } => url.clone(),
1203 OcrCommand::Process { url, .. } => url.clone(),
1204 },
1205 Command::Rag { command } => rag_command_url(command),
1206 Command::System { command } => match command {
1207 SystemCommand::Version { url }
1208 | SystemCommand::Libraries { url }
1209 | SystemCommand::LibraryStats { url, .. }
1210 | SystemCommand::Schema { url, .. }
1211 | SystemCommand::CurrentCollection { url }
1212 | SystemCommand::ListMethods { url }
1213 | SystemCommand::Describe { url, .. } => url.clone(),
1214 },
1215 Command::Search(ref args) => match &args.management {
1216 Some(SearchManagementCommand::SavedSearches { url })
1217 | Some(SearchManagementCommand::CreateSaved { url, .. })
1218 | Some(SearchManagementCommand::DeleteSaved { url, .. }) => url.clone(),
1219 None => args.url.clone(),
1220 },
1221 Command::Items { command } => match command {
1222 ItemsCommand::Add { url, .. }
1223 | ItemsCommand::Create { url, .. }
1224 | ItemsCommand::Update { url, .. }
1225 | ItemsCommand::Delete { url, .. }
1226 | ItemsCommand::Trash { url, .. }
1227 | ItemsCommand::Restore { url, .. }
1228 | ItemsCommand::MergeDuplicates { url, .. }
1229 | ItemsCommand::AddRelated { url, .. }
1230 | ItemsCommand::RemoveRelated { url, .. }
1231 | ItemsCommand::Get { url, .. }
1232 | ItemsCommand::List { url, .. }
1233 | ItemsCommand::FindDuplicates { url }
1234 | ItemsCommand::Recent { url, .. }
1235 | ItemsCommand::Fulltext { url, .. }
1236 | ItemsCommand::Related { url, .. }
1237 | ItemsCommand::CitationKey { url, .. } => url.clone(),
1238 },
1239 Command::Collections { command } => match command {
1240 CollectionsCommand::List { url }
1241 | CollectionsCommand::Tree { url }
1242 | CollectionsCommand::Get { url, .. }
1243 | CollectionsCommand::GetItems { url, .. }
1244 | CollectionsCommand::Stats { url, .. }
1245 | CollectionsCommand::Rename { url, .. }
1246 | CollectionsCommand::Create { url, .. }
1247 | CollectionsCommand::Delete { url, .. }
1248 | CollectionsCommand::AddItems { url, .. }
1249 | CollectionsCommand::RemoveItems { url, .. } => url.clone(),
1250 },
1251 Command::Notes { command } => match command {
1252 NotesCommand::List { url, .. }
1253 | NotesCommand::Get { url, .. }
1254 | NotesCommand::Create { url, .. }
1255 | NotesCommand::Update { url, .. }
1256 | NotesCommand::Delete { url, .. }
1257 | NotesCommand::Search { url, .. } => url.clone(),
1258 },
1259 Command::Attachments { command } => match command {
1260 AttachmentsCommand::List { url, .. }
1261 | AttachmentsCommand::Get { url, .. }
1262 | AttachmentsCommand::Fulltext { url, .. }
1263 | AttachmentsCommand::Path { url, .. }
1264 | AttachmentsCommand::Add { url, .. }
1265 | AttachmentsCommand::Delete { url, .. }
1266 | AttachmentsCommand::FindPdf { url, .. } => url.clone(),
1267 },
1268 Command::Settings { command } => match command {
1269 SettingsCommand::Get { url, .. }
1270 | SettingsCommand::List { url }
1271 | SettingsCommand::Set { url, .. } => url.clone(),
1272 },
1273 Command::Tags { command } => match command {
1274 TagsCommand::List { url, .. }
1275 | TagsCommand::Rename { url, .. }
1276 | TagsCommand::Delete { url, .. }
1277 | TagsCommand::Add { url, .. }
1278 | TagsCommand::Remove { url, .. } => url.clone(),
1279 },
1280 Command::Export(ref args) => args.url.clone(),
1281 Command::Annotations { command } => match command {
1282 AnnotationsCommand::List { url, .. }
1283 | AnnotationsCommand::Create { url, .. }
1284 | AnnotationsCommand::Delete { url, .. } => url.clone(),
1285 },
1286 }
1287}
1288
1289fn run_ocr_command(command: OcrCommand, client: &mut impl RpcCaller) -> Result<String, String> {
1290 if let OcrCommand::Providers = &command {
1291 return format_json(
1292 &serde_json::json!({ "providers": ocr_provider_specs() }),
1293 JsonStyle::Pretty,
1294 );
1295 }
1296 let value = match command {
1297 OcrCommand::Providers => unreachable!(),
1298 OcrCommand::Run {
1299 provider,
1300 input,
1301 file,
1302 item_key,
1303 attachment_key,
1304 mime_type,
1305 endpoint,
1306 api_key_env,
1307 } => run_ocr_run_command(OcrRunOptions {
1308 provider,
1309 input,
1310 file,
1311 item_key,
1312 attachment_key,
1313 mime_type,
1314 endpoint,
1315 api_key_env,
1316 })?,
1317 OcrCommand::Status { collection, .. } => run_ocr_status_command(client, collection)?,
1318 OcrCommand::Process {
1319 provider,
1320 parent,
1321 attachment,
1322 source_url,
1323 result_dir,
1324 result_zip,
1325 provider_endpoint,
1326 api_key_env,
1327 poll_interval_seconds,
1328 timeout_seconds,
1329 chunk_chars,
1330 ..
1331 } => run_ocr_process_command(
1332 client,
1333 OcrProcessOptions {
1334 provider,
1335 parent,
1336 attachment,
1337 source_url,
1338 result_dir,
1339 result_zip,
1340 provider_endpoint,
1341 api_key_env,
1342 poll_interval_seconds,
1343 timeout_seconds,
1344 chunk_chars,
1345 },
1346 )?,
1347 };
1348 format_json(&value, JsonStyle::PythonCompact)
1349}
1350
/// Options bundle handed to `run_ocr_process_command`.
///
/// The fields mirror those of `OcrCommand::Process`, minus `url`.
struct OcrProcessOptions {
    provider: String,
    parent: String,
    attachment: Option<String>,
    source_url: Option<String>,
    result_dir: Option<String>,
    result_zip: Option<String>,
    provider_endpoint: Option<String>,
    /// Set from `--api-key-env` (CLI default `ZOTRON_MINERU_API_KEY`).
    api_key_env: String,
    poll_interval_seconds: u64,
    timeout_seconds: u64,
    chunk_chars: usize,
}
1364
/// Options bundle handed to `run_ocr_run_command`.
///
/// The fields mirror those of `OcrCommand::Run`.
struct OcrRunOptions {
    provider: String,
    /// Source of a JSON request; mutually exclusive with `file`.
    input: Option<String>,
    /// File to build the request from; mutually exclusive with `input`.
    file: Option<String>,
    item_key: Option<String>,
    attachment_key: Option<String>,
    mime_type: Option<String>,
    /// Overrides the provider's default request URL when set.
    endpoint: Option<String>,
    api_key_env: Option<String>,
}
1375
1376fn run_ocr_run_command(options: OcrRunOptions) -> Result<Value, String> {
1377 let input: OcrRequestInput = match (options.input, options.file) {
1378 (Some(input), None) => read_json_input(&input)?,
1379 (None, Some(file)) => ocr_input_from_file(
1380 file,
1381 options.item_key,
1382 options.attachment_key,
1383 options.mime_type,
1384 )?,
1385 (Some(_), Some(_)) => {
1386 return Err("INVALID_ARGS: use either --input or --file, not both".to_string())
1387 }
1388 (None, None) => return Err("INVALID_ARGS: provide --input JSON or --file".to_string()),
1389 };
1390 let request = build_ocr_provider_request(&options.provider, &input)?;
1391 let payload = if request.command.is_empty() {
1392 let method = request
1393 .method
1394 .ok_or_else(|| format!("OCR provider {} missing HTTP method", request.provider))?;
1395 let auth_scheme = raw_ocr_provider_spec(&options.provider)?.auth;
1396 let mut transport =
1397 provider_http_transport_with_auth(options.api_key_env.as_deref(), auth_scheme)?;
1398 transport.post_json(&ProviderHttpInvocation {
1399 provider: request.provider.to_string(),
1400 style: request.style.to_string(),
1401 method: method.to_string(),
1402 url: options
1403 .endpoint
1404 .or_else(|| request.url.map(ToString::to_string)),
1405 auth_header_name: request.auth_header.map(ToString::to_string),
1406 auth_header_value: None,
1407 body: request.body,
1408 })?
1409 } else {
1410 let mut command_runner = StdProviderCommandRunner;
1411 command_runner.run_json(&request.command)?
1412 };
1413 let blocks = match parse_ocr_provider_response(
1414 request.provider,
1415 &payload,
1416 &input.item_key,
1417 &input.attachment_key,
1418 ) {
1419 Ok(blocks) => blocks,
1420 Err(err) => {
1421 if let Some(task) = ocr_async_task_result(request.provider, &payload) {
1422 return Ok(task);
1423 }
1424 return Err(err);
1425 }
1426 };
1427
1428 Ok(serde_json::json!({
1429 "provider": request.provider,
1430 "blocks": blocks,
1431 }))
1432}
1433
1434fn run_ocr_process_command(
1435 client: &mut impl RpcCaller,
1436 mut options: OcrProcessOptions,
1437) -> Result<Value, String> {
1438 let spec = raw_ocr_provider_spec(&options.provider)?;
1439
1440 let attachment = match options.attachment.take() {
1441 Some(key) => key,
1442 None => resolve_first_pdf_attachment_key(client, &options.parent)?,
1443 };
1444 options.attachment = Some(attachment.clone());
1445
1446 let attachment_path = resolve_attachment_path(client, &attachment)?;
1447 let storage_dir = attachment_path
1448 .parent()
1449 .ok_or_else(|| {
1450 format!(
1451 "ATTACHMENT_PATH_INVALID: attachment path has no parent directory: {}",
1452 attachment_path.display()
1453 )
1454 })?
1455 .to_path_buf();
1456
1457 match spec.provider_key {
1458 "mineru" | "mineru-cli" => {
1459 if options.result_dir.is_some() && options.result_zip.is_some() {
1460 return Err("INVALID_ARGS: use either --result-dir or --result-zip, not both".to_string());
1461 }
1462 if options.source_url.is_some()
1463 && (options.result_dir.is_some() || options.result_zip.is_some())
1464 {
1465 return Err(
1466 "INVALID_ARGS: --source-url cannot be combined with --result-dir/--result-zip"
1467 .to_string(),
1468 );
1469 }
1470 let file_name = attachment_path
1471 .file_name()
1472 .and_then(|name| name.to_str())
1473 .unwrap_or("document.pdf")
1474 .to_string();
1475 let source = load_mineru_result_source(&options, &attachment_path, &file_name)?;
1476 let artifacts = persist_mineru_result_sidecars(
1477 &storage_dir, &options.parent, &attachment,
1478 &options.provider, &source, options.chunk_chars,
1479 )?;
1480 Ok(serde_json::json!({
1481 "provider": spec.provider_key,
1482 "status": "indexed",
1483 "item_key": options.parent,
1484 "attachment_key": attachment,
1485 "attachment_path": attachment_path,
1486 "storage_dir": storage_dir,
1487 "task_id": source.task_id,
1488 "state": source.state,
1489 "blocks": artifacts.block_count,
1490 "chunks": artifacts.chunk_count,
1491 "artifacts": artifacts.artifacts,
1492 }))
1493 }
1494 _ => {
1495 run_ocr_process_sync(
1496 client, &options, spec.provider_key,
1497 &attachment, &attachment_path, &storage_dir,
1498 )
1499 }
1500 }
1501}
1502
1503fn run_ocr_process_sync(
1504 client: &mut impl RpcCaller,
1505 options: &OcrProcessOptions,
1506 provider: &str,
1507 attachment_key: &str,
1508 attachment_path: &Path,
1509 storage_dir: &Path,
1510) -> Result<Value, String> {
1511 let api_url = if let Some(endpoint) = &options.provider_endpoint {
1512 endpoint.clone()
1513 } else {
1514 let settings = client.call("settings.getAll", None)?;
1515 settings.get("ocr.apiUrl")
1516 .and_then(Value::as_str)
1517 .unwrap_or("")
1518 .to_string()
1519 };
1520 if api_url.is_empty() {
1521 return Err(format!("MISSING_CONFIG: ocr.apiUrl not configured for provider {provider}"));
1522 }
1523
1524 let api_key = {
1525 let from_env = if !options.api_key_env.is_empty() {
1526 env::var(&options.api_key_env).ok().filter(|v| !v.is_empty())
1527 } else {
1528 None
1529 };
1530 from_env.unwrap_or_else(|| {
1531 client.call("settings.getRaw", Some(serde_json::json!({"key": "ocr.apiKey"})))
1532 .ok()
1533 .and_then(|raw| raw.get("ocr.apiKey").and_then(Value::as_str).map(String::from))
1534 .unwrap_or_default()
1535 })
1536 };
1537
1538 let pdf_bytes = fs::read(attachment_path)
1539 .map_err(|e| format!("READ_PDF_FAILED: {}: {e}", attachment_path.display()))?;
1540
1541 const MAX_PDF_SIZE: usize = 100 * 1024 * 1024; if pdf_bytes.len() > MAX_PDF_SIZE {
1543 return Err(format!(
1544 "PDF_TOO_LARGE: {} is {} MB, max {} MB",
1545 attachment_path.display(),
1546 pdf_bytes.len() / (1024 * 1024),
1547 MAX_PDF_SIZE / (1024 * 1024),
1548 ));
1549 }
1550
1551 let base64_pdf = format!("data:application/pdf;base64,{}", base64_encode(&pdf_bytes));
1552
1553 let input = OcrRequestInput {
1554 content_base64: base64_pdf,
1555 file_name: attachment_path
1556 .file_name()
1557 .and_then(|n| n.to_str())
1558 .unwrap_or("document.pdf")
1559 .to_string(),
1560 mime_type: "application/pdf".to_string(),
1561 item_key: options.parent.clone(),
1562 attachment_key: attachment_key.to_string(),
1563 source_url: None,
1564 local_path: Some(attachment_path.to_string_lossy().to_string()),
1565 output_dir: None,
1566 };
1567 let request = build_ocr_provider_request(provider, &input)?;
1568
1569 let payload = if request.command.is_empty() {
1570 let method = request
1571 .method
1572 .ok_or_else(|| format!("OCR provider {provider} missing HTTP method"))?;
1573 let spec = raw_ocr_provider_spec(provider)?;
1574
1575 let mut transport = if !api_key.is_empty() {
1576 match spec.auth {
1577 "bearer" => UreqProviderHttpTransport::with_bearer_token(&api_key),
1578 "token" => UreqProviderHttpTransport::with_api_key(format!("token {api_key}")),
1579 _ => UreqProviderHttpTransport::new(),
1580 }
1581 } else {
1582 UreqProviderHttpTransport::new()
1583 };
1584
1585 transport.post_json(&ProviderHttpInvocation {
1586 provider: request.provider.to_string(),
1587 style: request.style.to_string(),
1588 method: method.to_string(),
1589 url: Some(api_url),
1590 auth_header_name: request.auth_header.map(ToString::to_string),
1591 auth_header_value: None,
1592 body: request.body,
1593 })?
1594 } else {
1595 let mut runner = StdProviderCommandRunner;
1596 runner.run_json(&request.command)?
1597 };
1598
1599 let blocks = parse_ocr_provider_response(provider, &payload, &options.parent, attachment_key)?;
1600 let chunks = zotron_types::chunks_from_blocks(&blocks, options.chunk_chars);
1601
1602 let artifacts = vec![
1603 write_sidecar_json(
1604 storage_dir, &options.parent, attachment_key,
1605 MachineArtifactKind::OcrRaw, &payload,
1606 )?,
1607 write_sidecar_jsonl(
1608 storage_dir, &options.parent, attachment_key,
1609 MachineArtifactKind::Blocks, &blocks,
1610 )?,
1611 write_sidecar_jsonl(
1612 storage_dir, &options.parent, attachment_key,
1613 MachineArtifactKind::Chunks, &chunks,
1614 )?,
1615 ];
1616
1617 let embedding_count = embed_sidecar_chunks(client, storage_dir, &options.parent, attachment_key, &chunks);
1618
1619 Ok(serde_json::json!({
1620 "provider": provider,
1621 "status": "indexed",
1622 "item_key": options.parent,
1623 "attachment_key": attachment_key,
1624 "embeddings": embedding_count,
1625 "attachment_path": attachment_path,
1626 "storage_dir": storage_dir,
1627 "blocks": blocks.len(),
1628 "chunks": chunks.len(),
1629 "artifacts": artifacts,
1630 }))
1631}
1632
1633fn embed_sidecar_chunks(
1634 client: &mut impl RpcCaller,
1635 storage_dir: &Path,
1636 item_key: &str,
1637 _attachment_key: &str,
1638 chunks: &[zotron_types::StructureChunk],
1639) -> usize {
1640 let Ok((provider, model, api_url, api_key)) = fetch_embedding_settings(client) else {
1641 return 0;
1642 };
1643 if provider.is_empty() || (api_key.is_empty() && provider != "ollama") {
1644 return 0;
1645 }
1646 let emb_chunks: Vec<EmbeddingChunkInput> = chunks
1647 .iter()
1648 .map(|c| EmbeddingChunkInput {
1649 chunk_key: c.chunk_key.clone(),
1650 text: c.text.clone(),
1651 })
1652 .collect();
1653 if emb_chunks.is_empty() {
1654 return 0;
1655 }
1656 let batch_size = 20;
1658 let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
1659 for batch in emb_chunks.chunks(batch_size) {
1660 let input = EmbeddingRequestInput {
1661 item_key: item_key.to_string(),
1662 chunks: batch.to_vec(),
1663 model: if model.is_empty() { None } else { Some(model.clone()) },
1664 url: if api_url.is_empty() { None } else { Some(api_url.clone()) },
1665 input_type: Some("document".to_string()),
1666 };
1667 let Ok(request) = build_embedding_provider_request(&provider, &input) else {
1668 break;
1669 };
1670 let Some(url) = request.url.as_deref() else { break };
1671 let mut http = ureq::post(url).set("Content-Type", "application/json");
1672 if let Some(auth) = request.auth_header {
1673 if !api_key.is_empty() {
1674 http = http.set(auth, &format!("Bearer {api_key}"));
1675 }
1676 }
1677 let Ok(resp) = http.send_json(&request.body) else { break };
1678 let Ok(payload): Result<Value, _> = resp.into_json() else { break };
1679 let Ok(vectors) = parse_embedding_provider_response(&provider, &payload, item_key, batch)
1680 else {
1681 break;
1682 };
1683 all_vectors.extend(vectors);
1684 }
1685 let count = all_vectors.len();
1686 if count > 0 {
1687 let filename = embedding_vector_filename(&provider, &model);
1688 let vectors_dir = storage_dir.join(".zotron").join("embeddings");
1689 fs::create_dir_all(&vectors_dir).map_err(|e| {
1690 eprintln!("warning: cannot create embeddings dir {}: {e}", vectors_dir.display());
1691 e
1692 }).ok();
1693 let vectors_path = vectors_dir.join(&filename);
1694 let mut out = String::new();
1695 for v in &all_vectors {
1696 if let Ok(line) = serde_json::to_string(v) {
1697 out.push_str(&line);
1698 out.push('\n');
1699 }
1700 }
1701 if let Err(e) = fs::write(&vectors_path, &out) {
1702 eprintln!("warning: failed to persist embeddings to {}: {e}", vectors_path.display());
1703 }
1704 }
1705 count
1706}
1707
1708struct MineruResultSource {
1709 task_id: Option<String>,
1710 state: String,
1711 result_dir: PathBuf,
1712 raw_zip_bytes: Option<Vec<u8>>,
1713 task_status: Option<Value>,
1714 payload: Value,
1715 content_list_file: Option<PathBuf>,
1716 markdown: Option<String>,
1717}
1718
1719struct PersistedOcrArtifacts {
1720 block_count: usize,
1721 chunk_count: usize,
1722 artifacts: Vec<Value>,
1723}
1724
1725fn resolve_attachment_path(
1726 client: &mut impl RpcCaller,
1727 attachment_key: &str,
1728) -> Result<PathBuf, String> {
1729 let payload = client.call(
1730 "attachments.getPath",
1731 Some(serde_json::json!({"key": attachment_key})),
1732 )?;
1733 let raw_path = payload
1734 .get("path")
1735 .and_then(Value::as_str)
1736 .filter(|path| !path.trim().is_empty())
1737 .ok_or_else(|| {
1738 format!("ATTACHMENT_PATH_NOT_FOUND: attachment {attachment_key} has no local PDF path")
1739 })?;
1740 Ok(PathBuf::from(local_path_from_zotero_path(raw_path)))
1741}
1742
1743fn resolve_first_pdf_attachment_key(
1745 client: &mut impl RpcCaller,
1746 parent_key: &str,
1747) -> Result<String, String> {
1748 let response = client.call(
1749 "attachments.list",
1750 Some(serde_json::json!({"parentKey": parent_key})),
1751 )?;
1752 let attachments = response
1754 .get("items")
1755 .and_then(Value::as_array)
1756 .or_else(|| response.as_array())
1757 .ok_or_else(|| {
1758 format!("NO_PDF_ATTACHMENT: no attachments found for item {parent_key}")
1759 })?;
1760 for attachment in attachments {
1761 if is_pdf_attachment(attachment) {
1762 if let Some(key) = attachment.get("key").and_then(Value::as_str) {
1763 return Ok(key.to_string());
1764 }
1765 }
1766 }
1767 Err(format!(
1768 "NO_PDF_ATTACHMENT: no PDF attachment found for item {parent_key}"
1769 ))
1770}
1771
1772fn load_mineru_result_source(
1773 options: &OcrProcessOptions,
1774 attachment_path: &Path,
1775 file_name: &str,
1776) -> Result<MineruResultSource, String> {
1777 if let Some(result_dir) = options.result_dir.as_deref() {
1778 return mineru_result_source_from_dir(PathBuf::from(result_dir), None, None, None);
1779 }
1780 if let Some(result_zip) = options.result_zip.as_deref() {
1781 let zip_path = PathBuf::from(result_zip);
1782 let zip_bytes = fs::read(&zip_path)
1783 .map_err(|err| format!("read MinerU result zip {}: {err}", zip_path.display()))?;
1784 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1785 return mineru_result_source_from_dir(result_dir, Some(zip_bytes), None, None);
1786 }
1787
1788 let Some(source_url) = options
1789 .source_url
1790 .as_deref()
1791 .filter(|value| !value.trim().is_empty())
1792 else {
1793 return submit_mineru_local_file(options, attachment_path, file_name);
1794 };
1795 let input = OcrRequestInput {
1796 item_key: options.parent.clone(),
1797 attachment_key: options.attachment.clone().expect("attachment resolved"),
1798 file_name: file_name.to_string(),
1799 mime_type: "application/pdf".to_string(),
1800 content_base64: format!("url:{source_url}"),
1801 source_url: Some(source_url.to_string()),
1802 local_path: None,
1803 output_dir: None,
1804 };
1805 let task = submit_mineru_task(
1806 &options.provider,
1807 &input,
1808 options.provider_endpoint.clone(),
1809 &options.api_key_env,
1810 )?;
1811 let task_id = task
1812 .get("data")
1813 .and_then(|data| data.get("task_id"))
1814 .and_then(Value::as_str)
1815 .ok_or_else(|| "MinerU submit response missing data.task_id".to_string())?
1816 .to_string();
1817 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1818 let status = poll_mineru_task(
1819 options.provider_endpoint.as_deref(),
1820 &task_id,
1821 &auth_header,
1822 options.poll_interval_seconds,
1823 options.timeout_seconds,
1824 )?;
1825 let zip_url = status
1826 .pointer("/data/full_zip_url")
1827 .or_else(|| status.pointer("/data/result/full_zip_url"))
1828 .and_then(Value::as_str)
1829 .ok_or_else(|| "MinerU completed task missing data.full_zip_url".to_string())?;
1830 let zip_bytes = download_bytes(zip_url)?;
1831 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1832 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(task_id))
1833}
1834
1835fn submit_mineru_local_file(
1836 options: &OcrProcessOptions,
1837 attachment_path: &Path,
1838 file_name: &str,
1839) -> Result<MineruResultSource, String> {
1840 let auth_header = provider_auth_header_value(&options.api_key_env, "bearer")?;
1841 let upload_request = create_mineru_file_upload(
1842 options.provider_endpoint.as_deref(),
1843 file_name,
1844 options.attachment.as_deref().expect("attachment resolved"),
1845 &auth_header,
1846 )?;
1847 let upload_url = upload_request
1848 .pointer("/data/file_urls/0")
1849 .or_else(|| upload_request.pointer("/data/fileUrls/0"))
1850 .and_then(Value::as_str)
1851 .ok_or_else(|| "MinerU upload URL response missing data.file_urls[0]".to_string())?;
1852 let batch_id = upload_request
1853 .pointer("/data/batch_id")
1854 .or_else(|| upload_request.pointer("/data/batchId"))
1855 .and_then(Value::as_str)
1856 .ok_or_else(|| "MinerU upload URL response missing data.batch_id".to_string())?
1857 .to_string();
1858 let bytes = fs::read(attachment_path)
1859 .map_err(|err| format!("read attachment PDF {}: {err}", attachment_path.display()))?;
1860 put_bytes(upload_url, &bytes)?;
1861 let status = poll_mineru_batch(
1862 options.provider_endpoint.as_deref(),
1863 &batch_id,
1864 &auth_header,
1865 options.poll_interval_seconds,
1866 options.timeout_seconds,
1867 )?;
1868 let zip_url = mineru_batch_zip_url(&status)
1869 .ok_or_else(|| "MinerU completed batch missing full_zip_url".to_string())?;
1870 let zip_bytes = download_bytes(&zip_url)?;
1871 let result_dir = extract_zip_bytes_to_temp("zotron-mineru-result", &zip_bytes)?;
1872 mineru_result_source_from_dir(result_dir, Some(zip_bytes), Some(status), Some(batch_id))
1873}
1874
1875fn create_mineru_file_upload(
1876 endpoint: Option<&str>,
1877 file_name: &str,
1878 data_id: &str,
1879 auth_header: &str,
1880) -> Result<Value, String> {
1881 let url = mineru_file_urls_url(endpoint);
1882 let body = serde_json::json!({
1883 "files": [{"name": file_name, "data_id": data_id}],
1884 "model_version": "vlm",
1885 "is_ocr": false,
1886 "enable_formula": true,
1887 "enable_table": true,
1888 "language": "ch",
1889 "page_ranges": "1-200",
1890 });
1891 ureq::post(&url)
1892 .set("Authorization", auth_header)
1893 .send_json(body)
1894 .map_err(|err| format!("POST {url} failed: {err}"))?
1895 .into_json::<Value>()
1896 .map_err(|err| format!("POST {url} returned invalid JSON: {err}"))
1897}
1898
1899fn put_bytes(url: &str, bytes: &[u8]) -> Result<(), String> {
1900 ureq::put(url)
1901 .send_bytes(bytes)
1902 .map_err(|err| format!("PUT {url} failed: {err}"))?;
1903 Ok(())
1904}
1905
1906fn submit_mineru_task(
1907 provider: &str,
1908 input: &OcrRequestInput,
1909 endpoint: Option<String>,
1910 api_key_env: &str,
1911) -> Result<Value, String> {
1912 let request = build_ocr_provider_request(provider, input)?;
1913 let method = request
1914 .method
1915 .ok_or_else(|| "MinerU provider missing HTTP method".to_string())?;
1916 let mut transport = provider_http_transport_with_auth(Some(api_key_env), "bearer")?;
1917 transport.post_json(&ProviderHttpInvocation {
1918 provider: request.provider.to_string(),
1919 style: request.style.to_string(),
1920 method: method.to_string(),
1921 url: endpoint.or_else(|| request.url.map(ToString::to_string)),
1922 auth_header_name: request.auth_header.map(ToString::to_string),
1923 auth_header_value: None,
1924 body: request.body,
1925 })
1926}
1927
1928fn poll_mineru_task(
1929 endpoint: Option<&str>,
1930 task_id: &str,
1931 auth_header: &str,
1932 poll_interval_seconds: u64,
1933 timeout_seconds: u64,
1934) -> Result<Value, String> {
1935 let url = mineru_task_status_url(endpoint, task_id);
1936 let started = Instant::now();
1937 loop {
1938 let status = get_json_with_auth(&url, auth_header)?;
1939 let state = status
1940 .pointer("/data/state")
1941 .or_else(|| status.pointer("/data/status"))
1942 .and_then(Value::as_str)
1943 .unwrap_or("unknown");
1944 match state {
1945 "done" | "finished" | "success" => return Ok(status),
1946 "failed" | "error" => return Err(format!("MinerU task {task_id} failed: {status}")),
1947 _ => {
1948 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
1949 return Err(format!(
1950 "MinerU task {task_id} timed out after {timeout_seconds}s with state {state}"
1951 ));
1952 }
1953 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
1954 }
1955 }
1956 }
1957}
1958
/// Builds the status URL of a MinerU task.
///
/// `endpoint` may be the API root, the `.../extract` prefix or the full
/// `.../extract/task` URL, with or without trailing slashes; the result is
/// always `<root>/extract/task/<task_id>`. Without an endpoint the public
/// MinerU v4 API is used.
///
/// Endpoints ending in `/extract` are accepted here just as they are when the
/// API base is derived for the batch URLs, so the path segment is never
/// duplicated as `/extract/extract/task`.
fn mineru_task_status_url(endpoint: Option<&str>, task_id: &str) -> String {
    let base = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    if base.ends_with("/extract/task") {
        format!("{base}/{task_id}")
    } else if base.ends_with("/extract") {
        format!("{base}/task/{task_id}")
    } else {
        format!("{base}/extract/task/{task_id}")
    }
}
1969
1970fn mineru_file_urls_url(endpoint: Option<&str>) -> String {
1971 let base = mineru_api_base(endpoint);
1972 format!("{base}/file-urls/batch")
1973}
1974
1975fn mineru_batch_status_url(endpoint: Option<&str>, batch_id: &str) -> String {
1976 let base = mineru_api_base(endpoint);
1977 format!("{base}/extract-results/batch/{batch_id}")
1978}
1979
/// Derives the MinerU API root from a configured endpoint.
///
/// Trailing slashes are dropped, then a single trailing `/extract/task` or,
/// failing that, `/extract` is removed. Without an endpoint the public MinerU
/// v4 API is used.
fn mineru_api_base(endpoint: Option<&str>) -> String {
    let trimmed = endpoint
        .unwrap_or("https://mineru.net/api/v4/extract/task")
        .trim_end_matches('/');
    // Order matters: the longer suffix must be tried first.
    ["/extract/task", "/extract"]
        .iter()
        .find_map(|suffix| trimmed.strip_suffix(*suffix))
        .unwrap_or(trimmed)
        .to_string()
}
1992
1993fn poll_mineru_batch(
1994 endpoint: Option<&str>,
1995 batch_id: &str,
1996 auth_header: &str,
1997 poll_interval_seconds: u64,
1998 timeout_seconds: u64,
1999) -> Result<Value, String> {
2000 let url = mineru_batch_status_url(endpoint, batch_id);
2001 let started = Instant::now();
2002 loop {
2003 let status = get_json_with_auth(&url, auth_header)?;
2004 let state = mineru_batch_state(&status).unwrap_or("unknown");
2005 match state {
2006 "done" | "finished" | "success" => return Ok(status),
2007 "failed" | "error" => return Err(format!("MinerU batch {batch_id} failed: {status}")),
2008 _ => {
2009 if started.elapsed() >= Duration::from_secs(timeout_seconds) {
2010 return Err(format!(
2011 "MinerU batch {batch_id} timed out after {timeout_seconds}s with state {state}"
2012 ));
2013 }
2014 thread::sleep(Duration::from_secs(poll_interval_seconds.max(1)));
2015 }
2016 }
2017 }
2018}
2019
2020fn mineru_batch_state(status: &Value) -> Option<&str> {
2021 status
2022 .pointer("/data/extract_result/0/state")
2023 .or_else(|| status.pointer("/data/extractResult/0/state"))
2024 .or_else(|| status.pointer("/data/state"))
2025 .and_then(Value::as_str)
2026}
2027
2028fn mineru_batch_zip_url(status: &Value) -> Option<String> {
2029 status
2030 .pointer("/data/extract_result/0/full_zip_url")
2031 .or_else(|| status.pointer("/data/extractResult/0/full_zip_url"))
2032 .or_else(|| status.pointer("/data/full_zip_url"))
2033 .and_then(Value::as_str)
2034 .map(ToString::to_string)
2035}
2036
/// Builds an authorization header value from a credential stored in the environment.
///
/// The variable's value is trimmed. For the `bearer` scheme it is prefixed
/// with `Bearer ` and for the `token` scheme with `token `, unless it already
/// starts with that prefix; any other scheme returns the trimmed value as is.
///
/// # Errors
/// Fails when the variable cannot be read or is empty after trimming.
fn provider_auth_header_value(api_key_env: &str, auth_scheme: &str) -> Result<String, String> {
    let raw = match env::var(api_key_env) {
        Ok(value) => value,
        Err(_) => return Err(format!("missing provider credential env var {api_key_env}")),
    };
    let credential = raw.trim();
    if credential.is_empty() {
        return Err(format!(
            "provider credential env var {api_key_env} is empty"
        ));
    }
    let prefix = match auth_scheme {
        "bearer" => "Bearer ",
        "token" => "token ",
        _ => return Ok(credential.to_string()),
    };
    if credential.starts_with(prefix) {
        Ok(credential.to_string())
    } else {
        Ok(format!("{prefix}{credential}"))
    }
}
2054
2055fn get_json_with_auth(url: &str, auth_header: &str) -> Result<Value, String> {
2056 ureq::get(url)
2057 .set("Authorization", auth_header)
2058 .call()
2059 .map_err(|err| format!("GET {url} failed: {err}"))?
2060 .into_json::<Value>()
2061 .map_err(|err| format!("GET {url} returned invalid JSON: {err}"))
2062}
2063
2064fn download_bytes(url: &str) -> Result<Vec<u8>, String> {
2065 let response = ureq::get(url)
2066 .call()
2067 .map_err(|err| format!("download {url} failed: {err}"))?;
2068 let mut bytes = Vec::new();
2069 response
2070 .into_reader()
2071 .read_to_end(&mut bytes)
2072 .map_err(|err| format!("read download {url}: {err}"))?;
2073 Ok(bytes)
2074}
2075
2076fn extract_zip_bytes_to_temp(prefix: &str, zip_bytes: &[u8]) -> Result<PathBuf, String> {
2077 let dir = unique_temp_path(prefix);
2078 fs::create_dir_all(&dir).map_err(|err| format!("create temp dir {}: {err}", dir.display()))?;
2079 let zip_path = dir.with_extension("zip");
2080 fs::write(&zip_path, zip_bytes)
2081 .map_err(|err| format!("write temp zip {}: {err}", zip_path.display()))?;
2082 let output = ProcessCommand::new("unzip")
2083 .arg("-q")
2084 .arg("-o")
2085 .arg(&zip_path)
2086 .arg("-d")
2087 .arg(&dir)
2088 .output()
2089 .map_err(|err| format!("run unzip: {err}"))?;
2090 if !output.status.success() {
2091 return Err(format!(
2092 "unzip {} failed: {}",
2093 zip_path.display(),
2094 String::from_utf8_lossy(&output.stderr).trim()
2095 ));
2096 }
2097 Ok(dir)
2098}
2099
/// Builds a path of the form `<temp dir>/<prefix>-<pid>-<nanos since epoch>`.
///
/// Nothing is created on disk. The timestamp falls back to `0` when the
/// system clock is before the Unix epoch.
fn unique_temp_path(prefix: &str) -> PathBuf {
    let nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_nanos(),
        Err(_) => 0,
    };
    let pid = std::process::id();
    env::temp_dir().join(format!("{prefix}-{pid}-{nanos}"))
}
2107
2108fn mineru_result_source_from_dir(
2109 result_dir: PathBuf,
2110 raw_zip_bytes: Option<Vec<u8>>,
2111 task_status: Option<Value>,
2112 task_id: Option<String>,
2113) -> Result<MineruResultSource, String> {
2114 let (payload, content_list_file) = mineru_payload_from_result_dir(&result_dir)?;
2115 let markdown = find_first_file_by_name(&result_dir, "full.md")
2116 .map(|path| {
2117 fs::read_to_string(&path)
2118 .map_err(|err| format!("read native markdown {}: {err}", path.display()))
2119 })
2120 .transpose()?;
2121 Ok(MineruResultSource {
2122 task_id,
2123 state: "done".to_string(),
2124 result_dir,
2125 raw_zip_bytes,
2126 task_status,
2127 payload,
2128 content_list_file,
2129 markdown,
2130 })
2131}
2132
2133fn mineru_payload_from_result_dir(result_dir: &Path) -> Result<(Value, Option<PathBuf>), String> {
2134 let v2 = find_first_file_with_suffix(result_dir, "_content_list_v2.json");
2135 if let Some(path) = v2 {
2136 let value = read_json_file(&path)?;
2137 return Ok((serde_json::json!({"content_list_v2": value}), Some(path)));
2138 }
2139 let content_list = find_first_file_with_suffix(result_dir, "_content_list.json");
2140 if let Some(path) = content_list {
2141 let value = read_json_file(&path)?;
2142 return Ok((serde_json::json!({"content_list": value}), Some(path)));
2143 }
2144 let layout = find_first_file_by_name(result_dir, "layout.json");
2145 if let Some(path) = layout {
2146 return Ok((read_json_file(&path)?, Some(path)));
2147 }
2148 let markdown = find_first_file_by_name(result_dir, "full.md");
2149 if let Some(path) = markdown {
2150 let text = fs::read_to_string(&path)
2151 .map_err(|err| format!("read native markdown {}: {err}", path.display()))?;
2152 return Ok((serde_json::json!({"result": text}), Some(path)));
2153 }
2154 Err(format!(
2155 "MinerU result directory {} missing content_list_v2/content_list/layout/full.md",
2156 result_dir.display()
2157 ))
2158}
2159
2160fn read_json_file(path: &Path) -> Result<Value, String> {
2161 let raw = fs::read_to_string(path).map_err(|err| format!("read {}: {err}", path.display()))?;
2162 serde_json::from_str(&raw).map_err(|err| format!("parse JSON {}: {err}", path.display()))
2163}
2164
2165fn persist_mineru_result_sidecars(
2166 storage_dir: &Path,
2167 item_key: &str,
2168 attachment_key: &str,
2169 provider: &str,
2170 source: &MineruResultSource,
2171 chunk_chars: usize,
2172) -> Result<PersistedOcrArtifacts, String> {
2173 let blocks = parse_ocr_provider_response(provider, &source.payload, item_key, attachment_key)?;
2174 let chunks = zotron_types::chunks_from_blocks(&blocks, chunk_chars);
2175 let assets = copy_mineru_assets(&source.result_dir, storage_dir)?;
2176 let raw_bundle = serde_json::json!({
2177 "provider": provider,
2178 "item_key": item_key,
2179 "attachment_key": attachment_key,
2180 "task_id": source.task_id,
2181 "state": source.state,
2182 "task_status": source.task_status,
2183 "content_list_file": source.content_list_file,
2184 "payload": source.payload,
2185 });
2186
2187 let mut artifacts = Vec::new();
2188 artifacts.push(write_sidecar_json(
2189 storage_dir,
2190 item_key,
2191 attachment_key,
2192 MachineArtifactKind::OcrRaw,
2193 &raw_bundle,
2194 )?);
2195 artifacts.push(write_sidecar_jsonl(
2196 storage_dir,
2197 item_key,
2198 attachment_key,
2199 MachineArtifactKind::Blocks,
2200 &blocks,
2201 )?);
2202 artifacts.push(write_sidecar_jsonl(
2203 storage_dir,
2204 item_key,
2205 attachment_key,
2206 MachineArtifactKind::Chunks,
2207 &chunks,
2208 )?);
2209 if let Some(markdown) = source.markdown.as_deref() {
2210 artifacts.push(write_sidecar_bytes(
2211 storage_dir,
2212 item_key,
2213 attachment_key,
2214 MachineArtifactKind::OcrNativeMarkdown,
2215 markdown.as_bytes(),
2216 )?);
2217 }
2218 artifacts.push(write_sidecar_json(
2219 storage_dir,
2220 item_key,
2221 attachment_key,
2222 MachineArtifactKind::OcrNativeAssets,
2223 &assets,
2224 )?);
2225 if let Some(bytes) = source.raw_zip_bytes.as_deref() {
2226 artifacts.push(write_extra_sidecar_bytes(
2227 storage_dir,
2228 ".zotron/ocr/latest.raw.zip",
2229 bytes,
2230 )?);
2231 }
2232
2233 Ok(PersistedOcrArtifacts {
2234 block_count: blocks.len(),
2235 chunk_count: chunks.len(),
2236 artifacts,
2237 })
2238}
2239
2240fn write_sidecar_json(
2241 storage_dir: &Path,
2242 item_key: &str,
2243 attachment_key: &str,
2244 kind: MachineArtifactKind,
2245 value: &Value,
2246) -> Result<Value, String> {
2247 let bytes = serde_json::to_vec_pretty(value).map_err(|err| err.to_string())?;
2248 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, &bytes)
2249}
2250
2251fn write_sidecar_jsonl<T: serde::Serialize>(
2252 storage_dir: &Path,
2253 item_key: &str,
2254 attachment_key: &str,
2255 kind: MachineArtifactKind,
2256 values: &[T],
2257) -> Result<Value, String> {
2258 let mut out = String::new();
2259 for value in values {
2260 out.push_str(&serde_json::to_string(value).map_err(|err| err.to_string())?);
2261 out.push('\n');
2262 }
2263 write_sidecar_bytes(storage_dir, item_key, attachment_key, kind, out.as_bytes())
2264}
2265
2266fn write_sidecar_bytes(
2267 storage_dir: &Path,
2268 item_key: &str,
2269 attachment_key: &str,
2270 kind: MachineArtifactKind,
2271 bytes: &[u8],
2272) -> Result<Value, String> {
2273 let record = write_machine_artifact_sidecar(storage_dir, item_key, attachment_key, kind, bytes)
2274 .map_err(|err| format!("write sidecar {:?}: {err}", kind))?;
2275 Ok(serde_json::json!({
2276 "kind": kind,
2277 "relative_path": record.relative_path,
2278 "absolute_path": record.absolute_path,
2279 }))
2280}
2281
2282fn write_extra_sidecar_bytes(
2283 storage_dir: &Path,
2284 relative_path: &str,
2285 bytes: &[u8],
2286) -> Result<Value, String> {
2287 let absolute_path = storage_dir.join(relative_path);
2288 if let Some(parent) = absolute_path.parent() {
2289 fs::create_dir_all(parent).map_err(|err| format!("create {}: {err}", parent.display()))?;
2290 }
2291 fs::write(&absolute_path, bytes)
2292 .map_err(|err| format!("write sidecar {}: {err}", absolute_path.display()))?;
2293 Ok(serde_json::json!({
2294 "kind": "ocr_raw_zip",
2295 "relative_path": relative_path,
2296 "absolute_path": absolute_path,
2297 }))
2298}
2299
2300fn copy_mineru_assets(result_dir: &Path, storage_dir: &Path) -> Result<Value, String> {
2301 let mut images = Vec::new();
2302 for file in collect_files(result_dir)? {
2303 if !is_image_file(&file) {
2304 continue;
2305 }
2306 let relative = file.strip_prefix(result_dir).unwrap_or(&file).to_path_buf();
2307 let destination = storage_dir.join(".zotron").join("ocr").join(&relative);
2308 if let Some(parent) = destination.parent() {
2309 fs::create_dir_all(parent)
2310 .map_err(|err| format!("create {}: {err}", parent.display()))?;
2311 }
2312 fs::copy(&file, &destination).map_err(|err| {
2313 format!(
2314 "copy MinerU asset {} to {}: {err}",
2315 file.display(),
2316 destination.display()
2317 )
2318 })?;
2319 images.push(serde_json::json!({
2320 "source_relative": relative,
2321 "sidecar_relative": PathBuf::from(".zotron").join("ocr").join(&relative),
2322 "absolute_path": destination,
2323 }));
2324 }
2325 Ok(serde_json::json!({
2326 "provider": "mineru",
2327 "images": images,
2328 }))
2329}
2330
/// Reports whether the path's extension is `png`, `jpg`, `jpeg`, `webp` or
/// `gif`, compared ASCII case-insensitively. Paths without an extension, or
/// with a non-UTF-8 one, are not images.
fn is_image_file(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 5] = ["png", "jpg", "jpeg", "webp", "gif"];
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => IMAGE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
2341
2342fn find_first_file_with_suffix(root: &Path, suffix: &str) -> Option<PathBuf> {
2343 collect_files(root).ok()?.into_iter().find(|path| {
2344 path.file_name()
2345 .and_then(|name| name.to_str())
2346 .is_some_and(|name| name.ends_with(suffix))
2347 })
2348}
2349
2350fn find_first_file_by_name(root: &Path, name: &str) -> Option<PathBuf> {
2351 collect_files(root).ok()?.into_iter().find(|path| {
2352 path.file_name()
2353 .and_then(|file_name| file_name.to_str())
2354 .is_some_and(|file_name| file_name == name)
2355 })
2356}
2357
2358fn collect_files(root: &Path) -> Result<Vec<PathBuf>, String> {
2359 let mut files = Vec::new();
2360 collect_files_into(root, &mut files)?;
2361 files.sort();
2362 Ok(files)
2363}
2364
/// Appends every regular file below `root` to `files`, descending into
/// sub-directories depth-first.
///
/// Entries that are neither a directory nor a regular file according to
/// `DirEntry::file_type` are ignored.
///
/// # Errors
/// Returns a formatted message when a directory cannot be opened, an entry
/// cannot be read, or an entry's file type cannot be determined.
fn collect_files_into(root: &Path, files: &mut Vec<PathBuf>) -> Result<(), String> {
    let entries = match fs::read_dir(root) {
        Ok(entries) => entries,
        Err(err) => return Err(format!("read dir {}: {err}", root.display())),
    };

    for entry in entries {
        let entry = match entry {
            Ok(entry) => entry,
            Err(err) => return Err(format!("read dir entry {}: {err}", root.display())),
        };
        let path = entry.path();
        let kind = match entry.file_type() {
            Ok(kind) => kind,
            Err(err) => return Err(format!("stat {}: {err}", path.display())),
        };

        if kind.is_file() {
            files.push(path);
        } else if kind.is_dir() {
            collect_files_into(&path, files)?;
        }
    }
    Ok(())
}
2380
2381fn ocr_async_task_result(provider: &str, payload: &Value) -> Option<Value> {
2382 let data = payload.get("data")?;
2383 let task_id = data.get("task_id").and_then(Value::as_str)?;
2384 Some(serde_json::json!({
2385 "provider": provider,
2386 "status": "submitted",
2387 "task_id": task_id,
2388 "state": data.get("state").and_then(Value::as_str).unwrap_or("submitted"),
2389 "result_url": data.get("full_zip_url").or_else(|| data.get("markdown_url")).cloned(),
2390 "raw": payload,
2391 }))
2392}
2393
2394fn ocr_input_from_file(
2395 file: String,
2396 item_key: Option<String>,
2397 attachment_key: Option<String>,
2398 mime_type: Option<String>,
2399) -> Result<OcrRequestInput, String> {
2400 let item_key = item_key
2401 .ok_or_else(|| "INVALID_ARGS: --item-key is required when using --file".to_string())?;
2402 let attachment_key = attachment_key.ok_or_else(|| {
2403 "INVALID_ARGS: --attachment-key is required when using --file".to_string()
2404 })?;
2405 let path = PathBuf::from(&file);
2406 let bytes = fs::read(&path).map_err(|err| format!("read {file}: {err}"))?;
2407 let file_name = path
2408 .file_name()
2409 .and_then(|name| name.to_str())
2410 .unwrap_or("document.pdf")
2411 .to_string();
2412 let mime_type = mime_type.unwrap_or_else(|| guess_mime_type(&path).to_string());
2413 Ok(OcrRequestInput {
2414 item_key,
2415 attachment_key,
2416 file_name,
2417 mime_type,
2418 content_base64: base64_encode(&bytes),
2419 source_url: None,
2420 local_path: Some(file),
2421 output_dir: None,
2422 })
2423}
2424
/// Guesses a MIME type from the extension of `path` (ASCII case-insensitive).
///
/// Recognizes `png`, `jpg`/`jpeg`, `webp` and `gif`; everything else,
/// including a missing or non-UTF-8 extension, is treated as
/// `application/pdf`.
fn guess_mime_type(path: &Path) -> &'static str {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();

    match extension.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "webp" => "image/webp",
        // `is_image_file` treats gif as an image, so it must not be sent as a PDF.
        "gif" => "image/gif",
        _ => "application/pdf",
    }
}
2439
/// Encodes `bytes` with the standard base64 alphabet (`A-Z a-z 0-9 + /`),
/// padding the final group with `=` to a multiple of four characters.
fn base64_encode(bytes: &[u8]) -> String {
    const ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const PAD: char = '=';

    let mut encoded = String::with_capacity(bytes.len().div_ceil(3) * 4);
    for group in bytes.chunks(3) {
        // Pack up to three bytes into the top 24 bits; `symbols` is how many
        // of the four output characters carry data rather than padding.
        let (packed, symbols): (u32, u32) = match *group {
            [a] => (u32::from(a) << 16, 2),
            [a, b] => ((u32::from(a) << 16) | (u32::from(b) << 8), 3),
            [a, b, c] => (
                (u32::from(a) << 16) | (u32::from(b) << 8) | u32::from(c),
                4,
            ),
            _ => unreachable!("chunks(3) yields between one and three bytes"),
        };

        for position in 0..4u32 {
            if position < symbols {
                let sextet = (packed >> (18 - 6 * position)) & 0x3f;
                encoded.push(ALPHABET[sextet as usize] as char);
            } else {
                encoded.push(PAD);
            }
        }
    }
    encoded
}
2462
2463fn run_ocr_status_command(
2464 client: &mut impl RpcCaller,
2465 collection: String,
2466) -> Result<Value, String> {
2467 let collection_key = find_collection_in_tree(client, &collection)?
2468 .and_then(|node| node.get("key").cloned())
2469 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
2470 let raw = paginate_rpc(
2471 client,
2472 "collections.getItems",
2473 serde_json::json!({"key": collection_key}),
2474 500,
2475 )?;
2476 let items = raw
2477 .get("items")
2478 .and_then(Value::as_array)
2479 .or_else(|| raw.as_array())
2480 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
2481 .clone();
2482
2483 let mut has_ocr = 0usize;
2484 for item in &items {
2485 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
2486 if has_ocr_artifact(client, &item_key)? || has_ocr_note(client, &item_key)? {
2487 has_ocr += 1;
2488 }
2489 }
2490
2491 Ok(serde_json::json!({
2492 "collection": collection,
2493 "total": items.len(),
2494 "has_ocr": has_ocr,
2495 "missing_ocr": items.len() - has_ocr,
2496 }))
2497}
2498
2499fn has_ocr_artifact(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2500 if let Some(item_key) = item_key.as_str() {
2501 if machine_artifact_exists_for_item(
2504 machine_artifact_store_root(),
2505 item_key,
2506 MachineArtifactKind::Chunks,
2507 ) {
2508 return Ok(true);
2509 }
2510 }
2511
2512 let attachments = client.call(
2513 "attachments.list",
2514 Some(serde_json::json!({"parentKey": item_key.clone()})),
2515 )?;
2516 Ok(attachments.as_array().is_some_and(|attachments| {
2517 attachments.iter().any(|attachment| {
2518 let has_sidecar_chunks = attachment
2519 .get("path")
2520 .and_then(Value::as_str)
2521 .map(local_path_from_zotero_path)
2522 .as_deref()
2523 .map(Path::new)
2524 .and_then(Path::parent)
2525 .is_some_and(|dir| {
2526 machine_artifact_exists_in_sidecar(dir, MachineArtifactKind::Chunks)
2527 });
2528 if has_sidecar_chunks {
2529 return true;
2530 }
2531
2532 attachment
2534 .get("title")
2535 .and_then(Value::as_str)
2536 .is_some_and(|title| title.ends_with("zotron-chunks.jsonl"))
2537 })
2538 }))
2539}
2540
2541fn local_path_from_zotero_path(path: &str) -> String {
2542 if is_wsl() && path.as_bytes().get(1) == Some(&b':') {
2543 return ProcessCommand::new("wslpath")
2544 .arg("-u")
2545 .arg(path)
2546 .output()
2547 .ok()
2548 .filter(|output| output.status.success())
2549 .and_then(|output| String::from_utf8(output.stdout).ok())
2550 .map(|converted| converted.trim().to_string())
2551 .filter(|converted| !converted.is_empty())
2552 .unwrap_or_else(|| path.to_string());
2553 }
2554 path.to_string()
2555}
2556
2557fn has_ocr_note(client: &mut impl RpcCaller, item_key: &Value) -> Result<bool, String> {
2558 let notes = client.call(
2559 "notes.list",
2560 Some(serde_json::json!({"parentKey": item_key.clone()})),
2561 )?;
2562 Ok(notes.as_array().is_some_and(|notes| {
2563 notes.iter().any(|note| {
2564 note.get("tags")
2565 .and_then(Value::as_array)
2566 .is_some_and(|tags| tags.iter().any(tag_is_ocr))
2567 })
2568 }))
2569}
2570
2571fn tag_is_ocr(tag: &Value) -> bool {
2572 tag.as_str() == Some("ocr")
2573 || tag
2574 .get("tag")
2575 .and_then(Value::as_str)
2576 .is_some_and(|tag| tag == "ocr")
2577}
2578
2579fn find_collection_in_tree(
2580 client: &mut impl RpcCaller,
2581 collection: &str,
2582) -> Result<Option<Value>, String> {
2583 let tree = client.call("collections.tree", None)?;
2584 let nodes = tree
2585 .as_array()
2586 .ok_or_else(|| "collections.tree returned non-array result".to_string())?;
2587 Ok(search_collection_tree(nodes, collection).cloned())
2588}
2589
2590fn search_collection_tree<'a>(nodes: &'a [Value], collection: &str) -> Option<&'a Value> {
2591 for node in nodes {
2592 if node.get("key").and_then(Value::as_str) == Some(collection)
2593 || node.get("name").and_then(Value::as_str) == Some(collection)
2594 {
2595 return Some(node);
2596 }
2597 if let Some(children) = node.get("children").and_then(Value::as_array) {
2598 if let Some(found) = search_collection_tree(children, collection) {
2599 return Some(found);
2600 }
2601 }
2602 }
2603 None
2604}
2605
2606fn run_command(command: Command, client: &mut impl RpcCaller) -> Result<String, String> {
2607 if let Command::Export(args) = command {
2608 return run_export(args, client);
2609 }
2610
2611 let (value, style) = match command {
2612 Command::Ping { .. } => (
2613 call_json(client, "system.ping", None)?,
2614 JsonStyle::PythonCompact,
2615 ),
2616 Command::Rpc {
2617 method,
2618 params_json,
2619 paginate,
2620 page_size,
2621 ..
2622 } => {
2623 let params = serde_json::from_str::<Value>(¶ms_json)
2624 .map_err(|err| format!("INVALID_JSON: params must be a JSON object: {err}"))?;
2625 if !params.is_object() {
2626 return Err("INVALID_JSON: params must be a JSON object".to_string());
2627 }
2628 if paginate {
2629 (
2630 paginate_rpc(client, &method, params, page_size)?,
2631 JsonStyle::Pretty,
2632 )
2633 } else {
2634 (call_json(client, &method, Some(params))?, JsonStyle::Pretty)
2635 }
2636 }
2637 Command::Push {
2638 json_file,
2639 pdf,
2640 collection,
2641 on_duplicate,
2642 dry_run,
2643 ..
2644 } => return run_push_command(json_file, pdf, collection, on_duplicate, dry_run, client),
2645 Command::System { command } => run_system_command(command, client)?,
2646 Command::Search(args) => {
2647 if let Some(mgmt) = args.management {
2648 run_search_management_command(mgmt, client)?
2649 } else {
2650 run_search(args, client)?
2651 }
2652 }
2653 Command::Items { command } => run_items_command(command, client)?,
2654 Command::Collections { command } => run_collections_command(command, client)?,
2655 Command::Notes { command } => run_notes_command(command, client)?,
2656 Command::Attachments { command } => run_attachments_command(command, client)?,
2657 Command::Settings { command } => run_settings_command(command, client)?,
2658 Command::Tags { command } => run_tags_command(command, client)?,
2659 Command::Annotations { command } => run_annotations_command(command, client)?,
2660 Command::Ocr { command } => {
2661 return run_ocr_command(command, client);
2662 }
2663 Command::Rag { command } => {
2664 return run_rag_command(command, client);
2665 }
2666 Command::Export(_) => unreachable!("export commands return raw output above"),
2667 Command::FindPdfs {
2668 collection, limit, ..
2669 } => run_find_pdfs_command(client, collection, limit)?,
2670 };
2671
2672 format_json(&value, style)
2673}
2674
2675fn run_rag_command(command: RagCommand, client: &mut impl RpcCaller) -> Result<String, String> {
2676 match command {
2677 RagCommand::Providers => format_json(
2678 &serde_json::json!({
2679 "providers": [
2680 embedding_provider_spec("volcengine")?,
2681 embedding_provider_spec("alibaba")?,
2682 embedding_provider_spec("custom")?,
2683 ],
2684 }),
2685 JsonStyle::Pretty,
2686 ),
2687 RagCommand::Embed {
2688 provider,
2689 input,
2690 endpoint,
2691 model,
2692 input_type,
2693 api_key_env,
2694 } => {
2695 let value = run_embedding_provider_json_command(
2696 provider,
2697 input,
2698 endpoint,
2699 model,
2700 input_type,
2701 api_key_env,
2702 )?;
2703 format_json(&value, JsonStyle::PythonCompact)
2704 }
2705 RagCommand::Status { collection, .. } => {
2706 let value = rag_status_value(client, &collection)?;
2707 format_json(&value, JsonStyle::PythonCompact)
2708 }
2709 RagCommand::Search {
2710 query,
2711 collection,
2712 keys,
2713 zotero,
2714 top_spans_per_item,
2715 include_fulltext_spans,
2716 top_k,
2717 output,
2718 ..
2719 } => run_rag_search_command(
2720 client,
2721 RagSearchOptions {
2722 query,
2723 collection,
2724 keys,
2725 zotero,
2726 top_spans_per_item,
2727 include_fulltext_spans,
2728 top_k,
2729 output,
2730 },
2731 ),
2732 }
2733}
2734
2735fn run_embedding_provider_json_command(
2736 provider: String,
2737 input: String,
2738 endpoint: Option<String>,
2739 model: Option<String>,
2740 input_type: Option<String>,
2741 api_key_env: Option<String>,
2742) -> Result<Value, String> {
2743 let mut input: EmbeddingRequestInput = read_json_input(&input)?;
2744 if endpoint.is_some() {
2745 input.url = endpoint;
2746 }
2747 if model.is_some() {
2748 input.model = model;
2749 }
2750 if input_type.is_some() {
2751 input.input_type = input_type;
2752 }
2753 let mut transport = provider_http_transport(api_key_env.as_deref())?;
2754 let vectors = execute_embedding_provider_request(&provider, &input, &mut transport)?;
2755
2756 Ok(serde_json::json!({
2757 "provider": provider,
2758 "vectors": vectors,
2759 }))
2760}
2761
2762fn provider_http_transport(api_key_env: Option<&str>) -> Result<UreqProviderHttpTransport, String> {
2763 provider_http_transport_with_auth(api_key_env, "bearer")
2764}
2765
2766fn provider_http_transport_with_auth(
2767 api_key_env: Option<&str>,
2768 auth_scheme: &str,
2769) -> Result<UreqProviderHttpTransport, String> {
2770 let Some(env_name) = api_key_env else {
2771 return Ok(UreqProviderHttpTransport::new());
2772 };
2773 let token = env::var(env_name)
2774 .map_err(|_| format!("missing provider credential env var {env_name}"))?;
2775 if token.trim().is_empty() {
2776 return Err(format!("provider credential env var {env_name} is empty"));
2777 }
2778 let token = token.trim();
2779 match auth_scheme {
2780 "token" if token.starts_with("token ") => {
2781 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2782 }
2783 "token" => Ok(UreqProviderHttpTransport::with_api_key(format!(
2784 "token {token}"
2785 ))),
2786 "bearer" if token.starts_with("Bearer ") => {
2787 Ok(UreqProviderHttpTransport::with_api_key(token.to_string()))
2788 }
2789 "bearer" => Ok(UreqProviderHttpTransport::with_bearer_token(token)),
2790 "none" => Ok(UreqProviderHttpTransport::new()),
2791 other => Err(format!("unsupported provider auth scheme {other}")),
2792 }
2793}
2794
2795fn read_json_input<T: serde::de::DeserializeOwned>(path: &str) -> Result<T, String> {
2796 let payload = if path == "-" {
2797 let mut input = String::new();
2798 io::stdin()
2799 .read_to_string(&mut input)
2800 .map_err(|err| format!("read stdin: {err}"))?;
2801 input
2802 } else {
2803 fs::read_to_string(path).map_err(|err| format!("read {path}: {err}"))?
2804 };
2805 serde_json::from_str::<T>(&payload)
2806 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))
2807}
2808
2809fn fetch_embedding_settings(
2810 client: &mut impl RpcCaller,
2811) -> Result<(String, String, String, String), String> {
2812 let settings = client.call("settings.getAll", None)?;
2813 let provider = settings
2814 .get("embedding.provider")
2815 .and_then(Value::as_str)
2816 .unwrap_or("ollama")
2817 .to_string();
2818 let model = settings
2819 .get("embedding.model")
2820 .and_then(Value::as_str)
2821 .unwrap_or("")
2822 .to_string();
2823 let api_url = settings
2824 .get("embedding.apiUrl")
2825 .and_then(Value::as_str)
2826 .unwrap_or("")
2827 .to_string();
2828 let raw = client.call("settings.getRaw", Some(serde_json::json!({"key": "embedding.apiKey"})))?;
2829 let api_key = raw
2830 .get("embedding.apiKey")
2831 .and_then(Value::as_str)
2832 .unwrap_or("")
2833 .to_string();
2834 Ok((provider, model, api_url, api_key))
2835}
2836
2837fn fetch_retrieval_mode(client: &mut impl RpcCaller) -> String {
2838 client
2839 .call(
2840 "settings.get",
2841 Some(serde_json::json!({"key": "rag.retrievalMode"})),
2842 )
2843 .ok()
2844 .and_then(|v| {
2845 v.get("rag.retrievalMode")
2846 .and_then(Value::as_str)
2847 .map(String::from)
2848 })
2849 .unwrap_or_else(|| "hybrid".to_string())
2850}
2851
2852fn resolve_sidecar_paths(
2853 client: &mut impl RpcCaller,
2854 collection: Option<&str>,
2855 keys: &[String],
2856) -> Result<Vec<(String, String, PathBuf)>, String> {
2857 let items = if !keys.is_empty() {
2858 let mut items = Vec::new();
2859 for key in keys {
2860 let item = client.call("items.get", Some(serde_json::json!({"key": key})))?;
2861 items.push(item);
2862 }
2863 items
2864 } else if let Some(col) = collection {
2865 let col_key = resolve_collection(client, col)?;
2866 let response = client.call(
2867 "collections.getItems",
2868 Some(serde_json::json!({"key": col_key})),
2869 )?;
2870 collection_items(&response)
2871 } else {
2872 return Err("INVALID_ARGS: --collection or --key required".into());
2873 };
2874
2875 let mut results = Vec::new();
2876 for item in &items {
2877 let item_key = item.get("key").and_then(Value::as_str).unwrap_or_default();
2878 let attachments = client.call(
2879 "attachments.list",
2880 Some(serde_json::json!({"parentKey": item_key})),
2881 )?;
2882 let att_list = attachments
2883 .get("items")
2884 .and_then(Value::as_array)
2885 .or_else(|| attachments.as_array())
2886 .cloned()
2887 .unwrap_or_default();
2888 for att in &att_list {
2889 let content_type = att
2890 .get("contentType")
2891 .and_then(Value::as_str)
2892 .unwrap_or("");
2893 if content_type != "application/pdf" {
2894 continue;
2895 }
2896 let att_key = att.get("key").and_then(Value::as_str).unwrap_or_default();
2897 let path = att.get("path").and_then(Value::as_str).unwrap_or_default();
2898 if path.is_empty() {
2899 continue;
2900 }
2901 let local_path = local_path_from_zotero_path(path);
2902 let pdf_path = PathBuf::from(&local_path);
2903 if let Some(parent) = pdf_path.parent() {
2904 let sidecar_root = parent.join(".zotron");
2905 if sidecar_root.exists() {
2906 results.push((item_key.to_string(), att_key.to_string(), sidecar_root));
2907 }
2908 }
2909 }
2910 }
2911 Ok(results)
2912}
2913
2914fn load_sidecar_chunks(sidecar_root: &Path) -> Vec<StructureChunk> {
2915 let chunks_path = sidecar_root.join("chunks").join("chunks.v1.jsonl");
2916 let Ok(content) = fs::read_to_string(&chunks_path) else {
2917 return Vec::new();
2918 };
2919 content
2920 .lines()
2921 .filter(|line| !line.trim().is_empty())
2922 .filter_map(|line| serde_json::from_str::<StructureChunk>(line).ok())
2923 .collect()
2924}
2925
/// Derives the vector file name for a provider/model pair.
///
/// Both parts are trimmed, lowercased and have `/` replaced by `-`, then
/// joined as `<provider>--<model>.jsonl`. When both parts end up empty the
/// name is `vectors.jsonl`.
fn embedding_vector_filename(provider: &str, model: &str) -> String {
    /// Normalizes one name component.
    fn slug(raw: &str) -> String {
        raw.trim().to_lowercase().replace('/', "-")
    }

    let provider_slug = slug(provider);
    let model_slug = slug(model);
    match (provider_slug.is_empty(), model_slug.is_empty()) {
        (true, true) => "vectors.jsonl".to_string(),
        _ => format!("{provider_slug}--{model_slug}.jsonl"),
    }
}
2934
2935fn load_sidecar_vectors(sidecar_root: &Path, provider: &str, model: &str) -> Vec<EmbeddingVector> {
2936 let embeddings_dir = sidecar_root.join("embeddings");
2937 let target = embedding_vector_filename(provider, model);
2938 let target_path = embeddings_dir.join(&target);
2939 if let Ok(content) = fs::read_to_string(&target_path) {
2940 let vecs: Vec<EmbeddingVector> = content
2941 .lines()
2942 .filter(|line| !line.trim().is_empty())
2943 .filter_map(|line| serde_json::from_str(line).ok())
2944 .collect();
2945 if !vecs.is_empty() {
2946 return vecs;
2947 }
2948 }
2949 for legacy in &["vectors.v1.jsonl", "vectors.jsonl"] {
2951 let path = embeddings_dir.join(legacy);
2952 if let Ok(content) = fs::read_to_string(&path) {
2953 let vecs: Vec<EmbeddingVector> = content
2954 .lines()
2955 .filter(|line| !line.trim().is_empty())
2956 .filter_map(|line| serde_json::from_str::<EmbeddingVector>(line).ok())
2957 .filter(|v| v.source_provider == provider || provider.is_empty())
2958 .collect();
2959 if !vecs.is_empty() {
2960 return vecs;
2961 }
2962 }
2963 }
2964 Vec::new()
2965}
2966
2967fn embed_query_text(
2968 query: &str,
2969 provider: &str,
2970 model: &str,
2971 api_url: &str,
2972 api_key: &str,
2973) -> Result<Vec<f64>, String> {
2974 let input = EmbeddingRequestInput {
2975 item_key: "query".to_string(),
2976 chunks: vec![EmbeddingChunkInput {
2977 chunk_key: "q0".to_string(),
2978 text: query.to_string(),
2979 }],
2980 model: if model.is_empty() {
2981 None
2982 } else {
2983 Some(model.to_string())
2984 },
2985 url: if api_url.is_empty() {
2986 None
2987 } else {
2988 Some(api_url.to_string())
2989 },
2990 input_type: Some("query".to_string()),
2991 };
2992 let request = build_embedding_provider_request(provider, &input)?;
2993 let url = request
2994 .url
2995 .as_deref()
2996 .ok_or("no embedding URL configured")?;
2997 let mut http = ureq::post(url).set("Content-Type", "application/json");
2998 if let Some(auth) = request.auth_header {
2999 if !api_key.is_empty() {
3000 http = http.set(auth, &format!("Bearer {api_key}"));
3001 }
3002 }
3003 let resp = http
3004 .send_json(&request.body)
3005 .map_err(|e| format!("embedding request failed: {e}"))?;
3006 let payload: Value = resp
3007 .into_json()
3008 .map_err(|e| format!("embedding response parse: {e}"))?;
3009 let vectors =
3010 parse_embedding_provider_response(provider, &payload, "query", &input.chunks)?;
3011 vectors
3012 .into_iter()
3013 .next()
3014 .map(|v| v.vector)
3015 .ok_or_else(|| "no embedding vector returned".to_string())
3016}
3017
3018fn run_rag_search_xpi_fallback(
3019 client: &mut impl RpcCaller,
3020 options: &RagSearchOptions,
3021) -> Result<String, String> {
3022 let mut params = serde_json::json!({
3023 "query": options.query,
3024 "limit": options.top_k,
3025 "top_spans_per_item": options.top_spans_per_item,
3026 "include_fulltext_spans": options.include_fulltext_spans,
3027 });
3028 if let Some(map) = params.as_object_mut() {
3029 if let Some(col) = &options.collection {
3030 map.insert("collection".into(), Value::String(col.clone()));
3031 }
3032 if !options.keys.is_empty() {
3033 map.insert(
3034 "keys".into(),
3035 Value::Array(options.keys.iter().map(|k| Value::String(k.clone())).collect()),
3036 );
3037 }
3038 }
3039 let payload = client.call("rag.searchHits", Some(params))?;
3040 let hits = payload
3041 .get("hits")
3042 .and_then(Value::as_array)
3043 .cloned()
3044 .unwrap_or_default();
3045 if options.output == "jsonl" {
3046 let mut out = String::new();
3047 for hit in &hits {
3048 out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
3049 out.push('\n');
3050 }
3051 Ok(out)
3052 } else {
3053 let total = hits.len() as u64;
3054 format_json(
3055 &normalize_list_envelope(
3056 serde_json::json!({"items": hits, "total": total}),
3057 "items",
3058 Some(options.top_k),
3059 0,
3060 ),
3061 JsonStyle::Pretty,
3062 )
3063 }
3064}
3065
/// Runs a RAG search, preferring local sidecar data and falling back to the
/// `rag.searchHits` RPC path.
///
/// Flow:
/// 1. With `options.zotero` set, validate the scope and go straight to the
///    RPC fallback.
/// 2. Otherwise resolve the `.zotron` sidecars for the scope. A
///    `COLLECTION_NOT_FOUND` error is returned as-is; any other error, or an
///    empty sidecar list, uses the RPC fallback.
/// 3. Load chunks and stored vectors from every sidecar; with no chunks at
///    all, use the RPC fallback.
/// 4. Rank with BM25 (unless the mode is `dense`) and/or cosine similarity
///    against an embedded query (unless the mode is `lexical` or no vectors
///    were loaded), merging both lists with `rrf_merge` when both are
///    non-empty.
/// 5. Cap the number of hits per item, attach item metadata fetched through
///    `items.get`, and render as JSONL or as a pretty list envelope.
///
/// # Errors
/// `INVALID_ARGS` when neither a collection nor keys are given; otherwise
/// propagates errors from settings lookup, serialization and formatting.
fn run_rag_search_command(
    client: &mut impl RpcCaller,
    options: RagSearchOptions,
) -> Result<String, String> {
    if options.zotero {
        // Explicit request to search through Zotero itself.
        if options.collection.is_none() && options.keys.is_empty() {
            return Err(
                "INVALID_ARGS: --collection or --key is required".to_string(),
            );
        }
        return run_rag_search_xpi_fallback(client, &options);
    }

    if options.collection.is_none() && options.keys.is_empty() {
        return Err("INVALID_ARGS: --collection or --key required".to_string());
    }

    let sidecars = resolve_sidecar_paths(
        client,
        options.collection.as_deref(),
        &options.keys,
    );

    // Only a non-empty sidecar list continues on the local path.
    let sidecars = match sidecars {
        Ok(ref s) if !s.is_empty() => s,
        Err(ref e) if e.contains("COLLECTION_NOT_FOUND") => return Err(e.clone()),
        _ => return run_rag_search_xpi_fallback(client, &options),
    };

    let (emb_provider, emb_model, emb_url, emb_key) = fetch_embedding_settings(client)?;
    let mut all_chunks: Vec<StructureChunk> = Vec::new();
    let mut all_vectors: Vec<EmbeddingVector> = Vec::new();
    for (_item_key, _att_key, sidecar_root) in sidecars {
        all_chunks.extend(load_sidecar_chunks(sidecar_root));
        all_vectors.extend(load_sidecar_vectors(sidecar_root, &emb_provider, &emb_model));
    }

    if all_chunks.is_empty() {
        return run_rag_search_xpi_fallback(client, &options);
    }

    // Defaults to "hybrid" when the setting cannot be read.
    let mode = fetch_retrieval_mode(client);

    // Lexical ranking is skipped only in pure dense mode.
    let bm25_ranked = if mode != "dense" {
        bm25_score_chunks(&all_chunks, &options.query, 1.2, 0.75)
    } else {
        Vec::new()
    };

    // Dense ranking needs stored vectors and a successfully embedded query.
    let dense_ranked = if mode != "lexical" && !all_vectors.is_empty() {
        match embed_query_text(&options.query, &emb_provider, &emb_model, &emb_url, &emb_key) {
            Ok(query_vec) => {
                let vec_map: std::collections::HashMap<&str, &[f64]> = all_vectors
                    .iter()
                    .map(|v| (v.chunk_key.as_str(), v.vector.as_slice()))
                    .collect();
                // Pairs of (index into `all_chunks`, similarity); non-positive
                // similarities are dropped.
                let mut scores: Vec<(usize, f64)> = all_chunks
                    .iter()
                    .enumerate()
                    .filter_map(|(i, chunk)| {
                        vec_map.get(chunk.chunk_key.as_str()).map(|stored| {
                            (i, cosine_similarity(&query_vec, stored))
                        })
                    })
                    .filter(|(_, s)| *s > 0.0)
                    .collect();
                // Highest similarity first; incomparable values (NaN) compare as equal.
                scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
                scores
            }
            // An embedding failure is not fatal: the dense list is simply empty.
            Err(_) => Vec::new(),
        }
    } else {
        Vec::new()
    };

    let limit = options.top_k as usize;
    // NOTE(review): the `top_k` limit is applied here, before the per-item cap
    // below, so fewer than `top_k` hits may be returned. This assumes
    // `rrf_merge` truncates to `limit`; confirm against its implementation.
    let ranked = if !bm25_ranked.is_empty() && !dense_ranked.is_empty() {
        rrf_merge(&bm25_ranked, &dense_ranked, 60.0, limit)
    } else if !bm25_ranked.is_empty() {
        bm25_ranked.into_iter().take(limit).collect()
    } else {
        dense_ranked.into_iter().take(limit).collect()
    };

    // Keep at most `top_spans_per_item` hits per item, in rank order.
    let mut per_item_count: std::collections::HashMap<&str, u64> =
        std::collections::HashMap::new();
    let mut selected: Vec<(usize, f64)> = Vec::new();
    for (idx, score) in &ranked {
        let item_key = all_chunks[*idx].item_key.as_str();
        let count = per_item_count.entry(item_key).or_insert(0);
        if *count < options.top_spans_per_item {
            *count += 1;
            selected.push((*idx, *score));
        }
    }

    // Item metadata is fetched once per item; a failed lookup is cached as
    // `Null`, which produces empty title/authors/year fields.
    let mut meta_cache: std::collections::HashMap<String, Value> =
        std::collections::HashMap::new();
    let mut hits: Vec<Value> = Vec::new();
    for (idx, score) in &selected {
        let chunk = &all_chunks[*idx];
        let meta = if let Some(cached) = meta_cache.get(&chunk.item_key) {
            cached.clone()
        } else {
            let fetched = client
                .call(
                    "items.get",
                    Some(serde_json::json!({"key": chunk.item_key})),
                )
                .unwrap_or(Value::Null);
            meta_cache.insert(chunk.item_key.clone(), fetched.clone());
            fetched
        };
        let title = meta
            .get("title")
            .and_then(Value::as_str)
            .unwrap_or("")
            .to_string();
        // Each creator is rendered as last name immediately followed by first
        // name (no separator); creators with neither are skipped.
        let authors = meta
            .get("creators")
            .and_then(Value::as_array)
            .map(|creators| {
                creators
                    .iter()
                    .filter_map(|c| {
                        let last = c.get("lastName").and_then(Value::as_str).unwrap_or("");
                        let first = c.get("firstName").and_then(Value::as_str).unwrap_or("");
                        if last.is_empty() && first.is_empty() {
                            None
                        } else {
                            Some(format!("{last}{first}"))
                        }
                    })
                    .collect::<Vec<_>>()
                    .join(", ")
            })
            .unwrap_or_default();
        // The item's `date` string is passed through unchanged as `year`.
        let year = meta.get("date").and_then(Value::as_str).unwrap_or("");
        let mut hit = serde_json::json!({
            "item_key": chunk.item_key,
            "chunk_key": chunk.chunk_key,
            "title": title,
            "authors": authors,
            "year": year,
            "text": chunk.text,
            "page_range": chunk.page_range,
            "section_path": chunk.section_path,
            "score": score,
        });
        if options.include_fulltext_spans {
            // `hit` was just built as a JSON object, so the unwrap cannot fail.
            hit.as_object_mut().unwrap().insert(
                "attachment_key".to_string(),
                Value::String(chunk.attachment_key.clone()),
            );
        }
        hits.push(hit);
    }

    if options.output == "jsonl" {
        // One compact JSON object per line.
        let mut out = String::new();
        for hit in &hits {
            out.push_str(&serde_json::to_string(hit).map_err(|e| e.to_string())?);
            out.push('\n');
        }
        Ok(out)
    } else {
        let total = hits.len() as u64;
        format_json(
            &normalize_list_envelope(
                serde_json::json!({"items": hits, "total": total}),
                "items",
                Some(options.top_k),
                0,
            ),
            JsonStyle::Pretty,
        )
    }
}
3257
3258fn rag_status_value(client: &mut impl RpcCaller, collection: &str) -> Result<Value, String> {
3259 let raw_store_path = rag_store_path(collection);
3260 if raw_store_path.exists() {
3261 return rag_status_from_store(collection, &raw_store_path);
3262 }
3263
3264 let mut store_candidates = Vec::new();
3265 let collection_match = find_collection_in_tree(client, collection)?;
3266 if let Some(collection_node) = collection_match.as_ref() {
3267 if let Some(name) = collection_node.get("name").and_then(Value::as_str) {
3268 store_candidates.push(rag_store_path(name));
3269 }
3270 if let Some(key) = collection_node.get("key").and_then(Value::as_str) {
3271 store_candidates.push(rag_store_path(key));
3272 }
3273 }
3274 for store_path in unique_paths(store_candidates) {
3275 if store_path.exists() {
3276 return rag_status_from_store(collection, &store_path);
3277 }
3278 }
3279
3280 rag_status_from_zotero_sidecars(client, collection, collection_match)
3281}
3282
/// Removes duplicate paths while keeping the first occurrence of each, in the
/// original order.
fn unique_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
    paths
        .into_iter()
        .fold(Vec::<PathBuf>::new(), |mut kept, candidate| {
            if !kept.contains(&candidate) {
                kept.push(candidate);
            }
            kept
        })
}
3292
3293fn rag_status_from_store(collection: &str, store_path: &Path) -> Result<Value, String> {
3294 let raw = fs::read_to_string(store_path)
3295 .map_err(|err| format!("read RAG store {}: {err}", store_path.display()))?;
3296 let store: Value = serde_json::from_str(&raw)
3297 .map_err(|err| format!("parse RAG store {}: {err}", store_path.display()))?;
3298 let chunks = store
3299 .get("chunks")
3300 .and_then(Value::as_array)
3301 .cloned()
3302 .unwrap_or_default();
3303 let mut item_keys = Vec::<Value>::new();
3304 for chunk in &chunks {
3305 let Some(item_key) = chunk.get("item_key") else {
3306 continue;
3307 };
3308 if !item_keys.iter().any(|seen| seen == item_key) {
3309 item_keys.push(item_key.clone());
3310 }
3311 }
3312 Ok(serde_json::json!({
3313 "status": "indexed",
3314 "collection": store.get("collection").and_then(Value::as_str).unwrap_or(collection),
3315 "collection_key": store.get("collection_key").cloned().unwrap_or(Value::Null),
3316 "model": store.get("model").cloned().unwrap_or(Value::String("unknown".to_string())),
3317 "total_chunks": chunks.len(),
3318 "total_items": item_keys.len(),
3319 "store_path": store_path.to_string_lossy(),
3320 }))
3321}
3322
3323fn rag_status_from_zotero_sidecars(
3324 client: &mut impl RpcCaller,
3325 collection: &str,
3326 collection_match: Option<Value>,
3327) -> Result<Value, String> {
3328 let collection_key = collection_match
3329 .as_ref()
3330 .and_then(|node| node.get("key").cloned())
3331 .ok_or_else(|| format!("COLLECTION_NOT_FOUND: Collection not found: {collection:?}"))?;
3332 let raw = paginate_rpc(
3333 client,
3334 "collections.getItems",
3335 serde_json::json!({"key": collection_key}),
3336 500,
3337 )?;
3338 let items = raw
3339 .get("items")
3340 .and_then(Value::as_array)
3341 .or_else(|| raw.as_array())
3342 .ok_or_else(|| "collections.getItems returned non-array/non-items result".to_string())?
3343 .clone();
3344
3345 let mut indexed_items = 0usize;
3346 let mut total_chunks = 0usize;
3347 for item in &items {
3348 let item_key = item.get("key").cloned().unwrap_or(Value::Null);
3349 let chunk_count = sidecar_chunk_count_for_item(client, &item_key)?;
3350 if chunk_count > 0 {
3351 indexed_items += 1;
3352 total_chunks += chunk_count;
3353 }
3354 }
3355
3356 if indexed_items == 0 {
3357 return Ok(serde_json::json!({
3358 "status": "not indexed",
3359 "collection": collection,
3360 "total_items": items.len(),
3361 "indexed_items": 0,
3362 }));
3363 }
3364
3365 Ok(serde_json::json!({
3366 "status": "indexed",
3367 "collection": collection,
3368 "total_chunks": total_chunks,
3369 "total_items": indexed_items,
3370 "collection_items": items.len(),
3371 "source": "zotero-sidecar",
3372 }))
3373}
3374
3375fn sidecar_chunk_count_for_item(
3376 client: &mut impl RpcCaller,
3377 item_key: &Value,
3378) -> Result<usize, String> {
3379 let attachments = client.call(
3380 "attachments.list",
3381 Some(serde_json::json!({"parentKey": item_key.clone()})),
3382 )?;
3383 let Some(attachments) = attachments.as_array() else {
3384 return Ok(0);
3385 };
3386
3387 let mut count = 0usize;
3388 for attachment in attachments {
3389 let Some(path) = attachment.get("path").and_then(Value::as_str) else {
3390 continue;
3391 };
3392 let local = local_path_from_zotero_path(path);
3393 let Some(dir) = Path::new(&local).parent() else {
3394 continue;
3395 };
3396 let Ok(bytes) = read_machine_artifact_sidecar(dir, MachineArtifactKind::Chunks) else {
3397 continue;
3398 };
3399 let text = String::from_utf8_lossy(&bytes);
3400 count += text.lines().filter(|line| !line.trim().is_empty()).count();
3401 }
3402 Ok(count)
3403}
3404
3405fn rag_store_path(collection: &str) -> PathBuf {
3406 rag_store_root().join(format!("{collection}.json"))
3407}
3408
3409fn rag_store_root() -> PathBuf {
3410 let xdg_data_home = env::var_os("XDG_DATA_HOME")
3411 .filter(|path| !path.is_empty())
3412 .map(PathBuf::from);
3413 let appdata = env::var_os("APPDATA")
3414 .filter(|path| !path.is_empty())
3415 .map(PathBuf::from);
3416 let userprofile = env::var_os("USERPROFILE")
3417 .filter(|path| !path.is_empty())
3418 .map(PathBuf::from);
3419 let home = env::var_os("HOME")
3420 .filter(|path| !path.is_empty())
3421 .map(PathBuf::from);
3422
3423 rag_store_root_for_platform(
3424 ArtifactStorePlatform::current(),
3425 xdg_data_home.as_deref(),
3426 appdata.as_deref(),
3427 userprofile.as_deref(),
3428 home.as_deref(),
3429 )
3430}
3431
3432fn rag_store_root_for_platform(
3433 platform: ArtifactStorePlatform,
3434 xdg_data_home: Option<&Path>,
3435 appdata: Option<&Path>,
3436 userprofile: Option<&Path>,
3437 home: Option<&Path>,
3438) -> PathBuf {
3439 match platform {
3440 ArtifactStorePlatform::Windows => {
3441 if let Some(path) = appdata {
3442 return path.join("Zotron").join("rag");
3443 }
3444 if let Some(path) = userprofile {
3445 return path
3446 .join("AppData")
3447 .join("Roaming")
3448 .join("Zotron")
3449 .join("rag");
3450 }
3451 if let Some(path) = home {
3452 return path
3453 .join("AppData")
3454 .join("Roaming")
3455 .join("Zotron")
3456 .join("rag");
3457 }
3458 PathBuf::from(".zotron").join("rag")
3459 }
3460 ArtifactStorePlatform::Macos => {
3461 if let Some(path) = home {
3462 return path
3463 .join("Library")
3464 .join("Application Support")
3465 .join("Zotron")
3466 .join("rag");
3467 }
3468 if let Some(path) = xdg_data_home {
3469 return path.join("zotron").join("rag");
3470 }
3471 PathBuf::from(".zotron").join("rag")
3472 }
3473 ArtifactStorePlatform::Linux | ArtifactStorePlatform::Other => xdg_data_home
3474 .map(|path| path.join("zotron").join("rag"))
3475 .or_else(|| {
3476 home.map(|path| path.join(".local").join("share").join("zotron").join("rag"))
3477 })
3478 .unwrap_or_else(|| PathBuf::from(".zotron").join("rag")),
3479 }
3480}
3481
3482fn run_push_command(
3483 json_file: String,
3484 pdf: Option<String>,
3485 collection: Option<String>,
3486 on_duplicate: String,
3487 dry_run: bool,
3488 client: &mut impl RpcCaller,
3489) -> Result<String, String> {
3490 if !matches!(on_duplicate.as_str(), "skip" | "update" | "create") {
3491 return Err(format!(
3492 "INVALID_ARGS: --on-duplicate must be skip|update|create, got {on_duplicate:?}"
3493 ));
3494 }
3495
3496 let payload = if json_file == "-" {
3497 let mut input = String::new();
3498 io::stdin()
3499 .read_to_string(&mut input)
3500 .map_err(|err| format!("read stdin: {err}"))?;
3501 input
3502 } else {
3503 fs::read_to_string(&json_file).map_err(|err| format!("read {json_file}: {err}"))?
3504 };
3505 let item_json = serde_json::from_str::<Value>(&payload)
3506 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
3507
3508 match item_json.get("itemType").and_then(Value::as_str) {
3510 Some(s) if !s.is_empty() => {}
3511 _ => return Err("INVALID_ARGS: input JSON must include a non-empty \"itemType\" field".to_string()),
3512 }
3513
3514 if dry_run {
3515 let collection_key = collection
3516 .as_deref()
3517 .map(|name| resolve_collection(client, name))
3518 .transpose()?;
3519 return format_json(
3520 &serde_json::json!({
3521 "ok": true,
3522 "dryRun": true,
3523 "wouldPush": {
3524 "title": item_json.get("title").cloned().unwrap_or(Value::Null),
3525 "itemType": item_json.get("itemType").cloned().unwrap_or(Value::Null),
3526 "collectionKey": collection_key,
3527 "pdfPath": pdf,
3528 "onDuplicate": on_duplicate,
3529 }
3530 }),
3531 JsonStyle::PythonCompact,
3532 );
3533 }
3534
3535 let result = push_item(
3536 client,
3537 &item_json,
3538 pdf.as_deref(),
3539 collection.as_deref(),
3540 &on_duplicate,
3541 )?;
3542 format_json(&result, JsonStyle::PythonCompact)
3543}
3544
/// Pushes one item to Zotero, honouring the duplicate policy, and optionally
/// attaches a PDF.
///
/// * `item_json` – the item to push; it is converted with `to_xpi_payload`.
/// * `pdf_path` – optional local PDF; it is validated with
///   `validate_pdf_magic` before any RPC call is made.
/// * `collection` – collection name to resolve; when `None` the currently
///   selected collection is used.
/// * `on_duplicate` – `"skip"` and `"update"` change how an existing duplicate
///   is handled; any other value falls through to creating a new item.
///
/// Returns a `push_result` value whose status is `skipped_duplicate`,
/// `updated` or `created`. Errors from validation or any RPC call are
/// propagated as strings.
fn push_item(
    client: &mut impl RpcCaller,
    item_json: &Value,
    pdf_path: Option<&str>,
    collection: Option<&str>,
    on_duplicate: &str,
) -> Result<Value, String> {
    // Validate the PDF up front so a bad file fails before anything is written.
    let pdf_size = if let Some(path) = pdf_path {
        validate_pdf_magic(path)?
    } else {
        0
    };

    let collection_key = match collection {
        Some(name) => resolve_collection(client, name)?,
        None => resolve_current_collection(client)?,
    };

    let dup_id = find_duplicate(client, item_json)?;
    // "skip": leave the existing item's fields alone, but still file it in the
    // target collection and attach the PDF if it has none yet.
    if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "skip") {
        if !is_library_root(&collection_key) {
            client.call(
                "collections.addItems",
                Some(serde_json::json!({"key": collection_key, "keys": [dup_id]})),
            )?;
        }
        let mut pdf_attached = false;
        if let Some(path) = pdf_path {
            if !item_has_pdf_attachment(client, dup_id)? {
                attach_pdf(client, dup_id, path)?;
                pdf_attached = true;
            }
        }
        return Ok(push_result(
            "skipped_duplicate",
            Some(dup_id.to_string()),
            pdf_attached,
            if pdf_attached { pdf_size } else { 0 },
            Value::Null,
        ));
    }

    let xpi_payload = to_xpi_payload(item_json, Some(&collection_key));
    let (item_key, status) =
        if let Some(dup_id) = dup_id.as_deref().filter(|_| on_duplicate == "update") {
            // "update": send only fields/creators/tags for the existing item.
            let mut params = serde_json::Map::new();
            params.insert("key".to_string(), Value::String(dup_id.to_string()));
            params.insert(
                "fields".to_string(),
                xpi_payload
                    .get("fields")
                    .cloned()
                    .unwrap_or_else(|| serde_json::json!({})),
            );
            if let Some(creators) = xpi_payload.get("creators") {
                params.insert("creators".to_string(), creators.clone());
            }
            if let Some(tags) = xpi_payload.get("tags") {
                params.insert("tags".to_string(), tags.clone());
            }
            client.call("items.update", Some(Value::Object(params)))?;
            (dup_id.to_string(), "updated")
        } else {
            // No duplicate, or the policy is neither "skip" nor "update": create.
            let created = client.call("items.create", Some(xpi_payload))?;
            let key = created
                .get("key")
                .and_then(Value::as_str)
                .ok_or_else(|| format!("items.create returned unexpected shape: {created:?}"))?;
            (key.to_string(), "created")
        };

    // A newly created item always gets the PDF; an updated item only when it
    // has no PDF attachment yet.
    let mut pdf_attached = false;
    if let Some(path) = pdf_path {
        if status != "updated" || !item_has_pdf_attachment(client, &item_key)? {
            attach_pdf(client, &item_key, path)?;
            pdf_attached = true;
        }
    }

    // The create payload already carries the collection (see `to_xpi_payload`);
    // an updated item has to be added to it explicitly.
    if status == "updated" && !is_library_root(&collection_key) {
        client.call(
            "collections.addItems",
            Some(serde_json::json!({"key": collection_key, "keys": [item_key]})),
        )?;
    }

    Ok(push_result(
        status,
        Some(item_key),
        pdf_attached,
        if pdf_attached { pdf_size } else { 0 },
        Value::Null,
    ))
}
3639
/// Checks that `path` is a readable file starting with the `%PDF-` magic bytes.
///
/// Only the first five bytes are read; the size is taken from the file's
/// metadata, so large PDFs are not loaded into memory just to be validated.
///
/// # Returns
/// The size of the file in bytes.
///
/// # Errors
/// * `INVALID_PDF: cannot read …` when the file cannot be opened, read, or
///   its metadata cannot be queried.
/// * `INVALID_PDF: … does not start with %PDF- magic bytes` when the header
///   does not match (including files shorter than the header).
fn validate_pdf_magic(path: &str) -> Result<u64, String> {
    const PDF_MAGIC: &[u8] = b"%PDF-";

    let unreadable = |e: io::Error| format!("INVALID_PDF: cannot read {path}: {e}");

    let mut file = fs::File::open(path).map_err(&unreadable)?;

    // `take` + `read_to_end` tolerates files shorter than the magic, which
    // then fail the comparison below instead of surfacing as an I/O error.
    let mut header = Vec::with_capacity(PDF_MAGIC.len());
    file.by_ref()
        .take(PDF_MAGIC.len() as u64)
        .read_to_end(&mut header)
        .map_err(&unreadable)?;
    if header != PDF_MAGIC {
        return Err(format!(
            "INVALID_PDF: {path} does not start with %PDF- magic bytes"
        ));
    }

    let size = file.metadata().map_err(&unreadable)?.len();
    Ok(size)
}
3650
3651fn resolve_current_collection(client: &mut impl RpcCaller) -> Result<Value, String> {
3652 let selected = client.call("system.currentCollection", None)?;
3653 Ok(selected
3654 .get("key")
3655 .cloned()
3656 .unwrap_or_else(|| Value::Number(0.into())))
3657}
3658
3659fn find_duplicate(
3660 client: &mut impl RpcCaller,
3661 item_json: &Value,
3662) -> Result<Option<String>, String> {
3663 if let Some(doi) = item_json
3664 .get("DOI")
3665 .and_then(Value::as_str)
3666 .filter(|doi| !doi.is_empty())
3667 {
3668 let hits = client.call("search.byIdentifier", Some(serde_json::json!({"doi": doi})))?;
3669 if let Some(key) = first_hit_key(&hits) {
3670 return Ok(Some(key));
3671 }
3672 }
3673
3674 if let Some(title) = item_json
3675 .get("title")
3676 .and_then(Value::as_str)
3677 .filter(|title| title.len() >= 10)
3678 {
3679 let hits = client.call(
3680 "search.quick",
3681 Some(serde_json::json!({"query": title, "limit": 20})),
3682 )?;
3683 if let Some(items) = response_items(&hits) {
3684 for item in items {
3685 if item.get("title").and_then(Value::as_str) == Some(title) {
3686 if let Some(key) = item.get("key").and_then(Value::as_str) {
3687 return Ok(Some(key.to_string()));
3688 }
3689 }
3690 }
3691 }
3692 }
3693
3694 Ok(None)
3695}
3696
3697fn first_hit_key(response: &Value) -> Option<String> {
3698 response_items(response)?
3699 .first()?
3700 .get("key")?
3701 .as_str()
3702 .map(ToString::to_string)
3703}
3704
3705fn response_items(response: &Value) -> Option<&Vec<Value>> {
3706 response
3707 .get("items")
3708 .and_then(Value::as_array)
3709 .or_else(|| response.as_array())
3710}
3711
3712fn to_xpi_payload(item_json: &Value, collection_key: Option<&Value>) -> Value {
3713 const NON_FIELD_KEYS: &[&str] = &[
3714 "itemType",
3715 "creators",
3716 "tags",
3717 "collections",
3718 "attachments",
3719 "relations",
3720 "notes",
3721 "id",
3722 "key",
3723 "version",
3724 ];
3725
3726 let mut fields = serde_json::Map::new();
3727 if let Some(item) = item_json.as_object() {
3728 for (key, value) in item {
3729 if !NON_FIELD_KEYS.contains(&key.as_str()) && !value.is_null() && value != "" {
3730 fields.insert(key.clone(), value.clone());
3731 }
3732 }
3733 }
3734
3735 let mut payload = serde_json::Map::new();
3736 payload.insert(
3737 "itemType".to_string(),
3738 item_json
3739 .get("itemType")
3740 .cloned()
3741 .unwrap_or_else(|| Value::String("journalArticle".to_string())),
3742 );
3743 payload.insert("fields".to_string(), Value::Object(fields));
3744
3745 if let Some(creators) = item_json.get("creators").and_then(Value::as_array) {
3746 if !creators.is_empty() {
3747 payload.insert(
3748 "creators".to_string(),
3749 Value::Array(
3750 creators
3751 .iter()
3752 .map(|creator| {
3753 serde_json::json!({
3754 "firstName": creator.get("firstName").and_then(Value::as_str).unwrap_or(""),
3755 "lastName": creator.get("lastName").and_then(Value::as_str).unwrap_or(""),
3756 "creatorType": creator.get("creatorType").and_then(Value::as_str).unwrap_or("author"),
3757 })
3758 })
3759 .collect(),
3760 ),
3761 );
3762 }
3763 }
3764
3765 if let Some(tags) = item_json.get("tags").and_then(Value::as_array) {
3766 if !tags.is_empty() {
3767 payload.insert(
3768 "tags".to_string(),
3769 Value::Array(
3770 tags.iter()
3771 .map(|tag| tag.get("tag").cloned().unwrap_or_else(|| tag.clone()))
3772 .collect(),
3773 ),
3774 );
3775 }
3776 }
3777
3778 if let Some(collection_key) = collection_key.filter(|key| !is_library_root(key)) {
3779 payload.insert(
3780 "collections".to_string(),
3781 Value::Array(vec![collection_key.clone()]),
3782 );
3783 }
3784
3785 Value::Object(payload)
3786}
3787
3788fn item_has_pdf_attachment(client: &mut impl RpcCaller, item_key: &str) -> Result<bool, String> {
3789 let attachments = client.call(
3790 "attachments.list",
3791 Some(serde_json::json!({"parentKey": item_key})),
3792 )?;
3793 Ok(has_pdf_attachment(&attachments))
3794}
3795
3796fn attach_pdf(client: &mut impl RpcCaller, item_key: &str, path: &str) -> Result<(), String> {
3797 client.call(
3798 "attachments.add",
3799 Some(serde_json::json!({
3800 "parentKey": item_key,
3801 "path": zotero_path(path),
3802 "title": "Full Text PDF",
3803 })),
3804 )?;
3805 Ok(())
3806}
3807
3808fn zotero_path(path: &str) -> String {
3809 let path = Path::new(path)
3810 .canonicalize()
3811 .unwrap_or_else(|_| Path::new(path).to_path_buf())
3812 .to_string_lossy()
3813 .into_owned();
3814 if is_wsl() {
3815 return ProcessCommand::new("wslpath")
3816 .arg("-w")
3817 .arg(&path)
3818 .output()
3819 .ok()
3820 .filter(|output| output.status.success())
3821 .and_then(|output| String::from_utf8(output.stdout).ok())
3822 .map(|converted| converted.trim().to_string())
3823 .filter(|converted| !converted.is_empty())
3824 .unwrap_or(path);
3825 }
3826 path
3827}
3828
/// Reports whether the process appears to run under WSL: either
/// `WSL_DISTRO_NAME` is set, or `/proc/sys/kernel/osrelease` is readable and
/// mentions "microsoft" (case-insensitively).
fn is_wsl() -> bool {
    env::var_os("WSL_DISTRO_NAME").is_some()
        || fs::read_to_string("/proc/sys/kernel/osrelease")
            .is_ok_and(|release| release.to_ascii_lowercase().contains("microsoft"))
}
3837
3838fn is_library_root(value: &Value) -> bool {
3839 value.as_i64() == Some(0) || value.as_u64() == Some(0)
3840}
3841
3842fn push_result(
3843 status: &str,
3844 zotero_item_key: Option<String>,
3845 pdf_attached: bool,
3846 pdf_size_bytes: u64,
3847 error: Value,
3848) -> Value {
3849 serde_json::json!({
3850 "status": status,
3851 "zotero_item_key": zotero_item_key,
3852 "pdf_attached": pdf_attached,
3853 "pdf_size_bytes": pdf_size_bytes,
3854 "error": error,
3855 })
3856}
3857
/// Implements the `search` command by dispatching to one RPC search method.
///
/// The first matching mode wins, in this order:
/// 1. any of `--doi` / `--isbn` / `--issn` → `search.byIdentifier`;
/// 2. `--fulltext` → `search.fulltext` (requires a query; optional collection);
/// 3. any of author / after / before / journal / tag → `search.advanced`
///    with all conditions AND-ed together;
/// 4. otherwise a quick search, which requires a query: within a collection
///    the collection's items are fetched and filtered locally, else
///    `search.quick` is called and evidence artifacts are filtered out.
///
/// Every result is passed through `normalize_list_envelope` and returned with
/// `JsonStyle::Pretty`. Missing required arguments yield `INVALID_ARGS` errors.
fn run_search(
    args: SearchArgs,
    client: &mut impl RpcCaller,
) -> Result<(Value, JsonStyle), String> {
    let SearchArgs {
        query, fulltext, author, after, before, journal, tag,
        doi, isbn, issn, collection, limit, offset, ..
    } = args;

    // Mode 1: identifier lookup; all other options are ignored.
    let has_identifier = doi.is_some() || isbn.is_some() || issn.is_some();
    if has_identifier {
        let mut params = serde_json::Map::new();
        if let Some(doi) = doi { params.insert("doi".into(), Value::String(doi)); }
        if let Some(isbn) = isbn { params.insert("isbn".into(), Value::String(isbn)); }
        if let Some(issn) = issn { params.insert("issn".into(), Value::String(issn)); }
        let value = client.call("search.byIdentifier", Some(Value::Object(params)))?;
        return Ok((normalize_list_envelope(value, "items", None, 0), JsonStyle::Pretty));
    }

    // Mode 2: full-text search; `offset` is not forwarded here.
    if fulltext {
        let query = query.ok_or("INVALID_ARGS: --fulltext requires a search query")?;
        let mut params = serde_json::json!({"query": query, "limit": limit});
        if let (Some(col), Some(map)) = (collection, params.as_object_mut()) {
            map.insert("collection".into(), resolve_collection(client, &col)?);
        }
        let value = client.call("search.fulltext", Some(params))?;
        return Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty));
    }

    // Mode 3: advanced search; the free-text query, if any, becomes one more condition.
    let has_filters = author.is_some() || after.is_some() || before.is_some()
        || journal.is_some() || tag.is_some();
    if has_filters {
        let mut conditions: Vec<Value> = Vec::new();
        if let Some(query) = &query {
            conditions.push(serde_json::json!({
                "field": "quicksearch-titleCreatorYear",
                "operator": "contains",
                "value": query,
            }));
        }
        if let Some(author) = author {
            conditions.push(serde_json::json!({
                "field": "creator", "operator": "contains", "value": author,
            }));
        }
        if let Some(after) = after {
            conditions.push(serde_json::json!({
                "field": "date", "operator": "isAfter", "value": after,
            }));
        }
        if let Some(before) = before {
            conditions.push(serde_json::json!({
                "field": "date", "operator": "isBefore", "value": before,
            }));
        }
        if let Some(journal) = journal {
            conditions.push(serde_json::json!({
                "field": "publicationTitle", "operator": "contains", "value": journal,
            }));
        }
        if let Some(tag) = tag {
            conditions.push(serde_json::json!({
                "field": "tag", "operator": "is", "value": tag,
            }));
        }
        let value = client.call(
            "search.advanced",
            Some(serde_json::json!({
                "conditions": conditions,
                "operator": "and",
                "limit": limit,
                "offset": offset,
            })),
        )?;
        return Ok((normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty));
    }

    // Mode 4: quick search, optionally restricted to one collection.
    let query = query.ok_or(
        "INVALID_ARGS: provide a search query, or use --doi/--isbn/--issn for identifier lookup"
    )?;
    let value = if let Some(col) = collection {
        // Collection-scoped: fetch the collection's items and match them locally.
        let key = resolve_collection(client, &col)?;
        let response = client.call(
            "collections.getItems",
            Some(serde_json::json!({"key": key})),
        )?;
        collection_quick_search_response(&response, &query, limit)
    } else {
        filter_search_artifacts(client.call(
            "search.quick",
            Some(serde_json::json!({"query": query, "limit": limit})),
        )?)
    };
    Ok((normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty))
}
3953
3954fn run_search_management_command(
3955 command: SearchManagementCommand,
3956 client: &mut impl RpcCaller,
3957) -> Result<(Value, JsonStyle), String> {
3958 match command {
3959 SearchManagementCommand::SavedSearches { .. } => Ok((
3960 normalize_list_envelope(client.call("search.savedSearches", None)?, "items", None, 0),
3961 JsonStyle::Pretty,
3962 )),
3963 SearchManagementCommand::CreateSaved {
3964 name, condition, dry_run, ..
3965 } => {
3966 let conditions = condition
3967 .iter()
3968 .map(|raw| parse_search_condition(raw))
3969 .collect::<Result<Vec<_>, _>>()?;
3970 let params = serde_json::json!({"name": name, "conditions": conditions});
3971 if dry_run {
3972 Ok((dry_run_value("search.createSavedSearch", params), JsonStyle::PythonCompact))
3973 } else {
3974 Ok((client.call("search.createSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3975 }
3976 }
3977 SearchManagementCommand::DeleteSaved {
3978 search_key, dry_run, ..
3979 } => {
3980 let params = serde_json::json!({"key": search_key});
3981 if dry_run {
3982 Ok((dry_run_value("search.deleteSavedSearch", params), JsonStyle::PythonCompact))
3983 } else {
3984 Ok((client.call("search.deleteSavedSearch", Some(params))?, JsonStyle::PythonCompact))
3985 }
3986 }
3987 }
3988}
3989
3990fn filter_search_artifacts(mut value: Value) -> Value {
3991 let Some(items) = value.get_mut("items").and_then(Value::as_array_mut) else {
3992 return value;
3993 };
3994 items.retain(|item| match item.get("title").and_then(Value::as_str) {
3995 Some(title) => !is_zotron_evidence_artifact(title),
3996 None => true,
3997 });
3998 let total_items = items.len() as u64;
3999 if let Some(total) = value.get_mut("total") {
4000 *total = Value::from(total_items);
4001 }
4002 value
4003}
4004
4005fn collection_quick_search_response(response: &Value, query: &str, limit: u64) -> Value {
4006 let mut matched = collection_items(response)
4007 .into_iter()
4008 .filter(|item| !item_is_evidence_artifact(item))
4009 .filter(|item| quick_item_matches(item, query))
4010 .collect::<Vec<_>>();
4011 let total = matched.len() as u64;
4012 let limit = usize::try_from(limit).unwrap_or(usize::MAX);
4013 if matched.len() > limit {
4014 matched.truncate(limit);
4015 }
4016 serde_json::json!({"items": matched, "total": total})
4017}
4018
4019fn item_is_evidence_artifact(item: &Value) -> bool {
4020 item.get("title")
4021 .and_then(Value::as_str)
4022 .is_some_and(is_zotron_evidence_artifact)
4023}
4024
4025fn quick_item_matches(item: &Value, query: &str) -> bool {
4026 let terms = query
4027 .split_whitespace()
4028 .map(|term| term.to_lowercase())
4029 .filter(|term| !term.is_empty())
4030 .collect::<Vec<_>>();
4031 if terms.is_empty() {
4032 return true;
4033 }
4034 let mut haystack = String::new();
4035 append_search_text(item, &mut haystack);
4036 let haystack = haystack.to_lowercase();
4037 terms.iter().all(|term| haystack.contains(term))
4038}
4039
4040fn append_search_text(value: &Value, out: &mut String) {
4041 match value {
4042 Value::String(text) => {
4043 out.push(' ');
4044 out.push_str(text);
4045 }
4046 Value::Number(number) => {
4047 out.push(' ');
4048 out.push_str(&number.to_string());
4049 }
4050 Value::Bool(value) => {
4051 out.push(' ');
4052 out.push_str(if *value { "true" } else { "false" });
4053 }
4054 Value::Array(items) => {
4055 for item in items {
4056 append_search_text(item, out);
4057 }
4058 }
4059 Value::Object(map) => {
4060 for item in map.values() {
4061 append_search_text(item, out);
4062 }
4063 }
4064 Value::Null => {}
4065 }
4066}
4067
4068fn parse_search_condition(raw: &str) -> Result<Value, String> {
4069 let mut parts = raw.split_whitespace();
4070 let field = parts.next();
4071 let operator = parts.next();
4072 let value = parts.collect::<Vec<_>>().join(" ");
4073 match (field, operator, value.is_empty()) {
4074 (Some(field), Some(operator), false) => Ok(serde_json::json!({
4075 "field": field,
4076 "operator": operator,
4077 "value": value,
4078 })),
4079 _ => Err(format!(
4080 "INVALID_ARGS: --condition must be 'field operator value', got: {raw:?}"
4081 )),
4082 }
4083}
4084
4085fn normalize_list_envelope(value: Value, list_key: &str, limit: Option<u64>, offset: u64) -> Value {
4086 if let Value::Array(arr) = value {
4087 let total = arr.len() as u64;
4088 let mut obj = serde_json::Map::new();
4089 obj.insert(list_key.to_string(), Value::Array(arr));
4090 obj.insert("total".to_string(), Value::from(total));
4091 if let Some(limit) = limit {
4092 obj.insert("limit".to_string(), Value::from(limit));
4093 }
4094 obj.insert("offset".to_string(), Value::from(offset));
4095 obj.insert("hasMore".to_string(), Value::Bool(false));
4096 return Value::Object(obj);
4097 }
4098
4099 let mut obj = match value {
4100 Value::Object(obj) if obj.contains_key(list_key) => obj,
4101 other => return other,
4102 };
4103
4104 let items_len = obj
4105 .get(list_key)
4106 .and_then(Value::as_array)
4107 .map_or(0, |a| a.len()) as u64;
4108 let total = obj
4109 .get("total")
4110 .and_then(Value::as_u64)
4111 .unwrap_or(items_len);
4112
4113 obj.insert("total".to_string(), Value::from(total));
4114 if let Some(limit) = limit {
4115 obj.insert("limit".to_string(), Value::from(limit));
4116 }
4117 obj.insert("offset".to_string(), Value::from(offset));
4118 obj.insert(
4119 "hasMore".to_string(),
4120 Value::Bool(offset + items_len < total),
4121 );
4122
4123 Value::Object(obj)
4124}
4125
/// Maximum number of rows `paginate_rpc` accumulates; once reached, the
/// result is truncated to this length and no further pages are requested.
const RPC_PAGINATION_SAFETY_CAP: usize = 10_000;
/// Object keys that `extract_page` probes, in this order, to find the page
/// array when a response is an object rather than a bare array.
const RPC_PAGE_LIST_KEYS: [&str; 4] = ["items", "tags", "results", "data"];
4128
4129fn paginate_rpc(
4130 client: &mut impl RpcCaller,
4131 method: &str,
4132 params: Value,
4133 page_size: usize,
4134) -> Result<Value, String> {
4135 let base = params
4136 .as_object()
4137 .ok_or_else(|| "params must be a JSON object".to_string())?;
4138 let mut out = Vec::new();
4139 let mut prev_page: Option<Vec<Value>> = None;
4140 let mut offset = 0usize;
4141
4142 loop {
4143 let mut page_params = base.clone();
4144 page_params.insert("offset".to_string(), Value::Number(offset.into()));
4145 page_params.insert("limit".to_string(), Value::Number(page_size.into()));
4146 let response = client.call(method, Some(Value::Object(page_params)))?;
4147
4148 let page = match extract_page(&response) {
4149 Some(page) => page,
4150 None if out.is_empty() => return Ok(response),
4151 None if response.is_object() => {
4152 return Err(format!(
4153 "paginate: {method:?} returned a non-paginated dict after {} accumulated rows; aborting",
4154 out.len()
4155 ));
4156 }
4157 None => {
4158 return Err(format!(
4159 "paginate: {method:?} returned non-list/non-dict shape after {} accumulated rows; aborting",
4160 out.len()
4161 ));
4162 }
4163 };
4164
4165 if prev_page.as_ref() == Some(&page) {
4166 return Err(format!(
4167 "paginate: {method:?} returned identical pages — method likely ignores offset; aborting after {} rows",
4168 out.len()
4169 ));
4170 }
4171
4172 let page_len = page.len();
4173 out.extend(page.clone());
4174 if page_len < page_size {
4175 return Ok(Value::Array(out));
4176 }
4177 if out.len() >= RPC_PAGINATION_SAFETY_CAP {
4178 out.truncate(RPC_PAGINATION_SAFETY_CAP);
4179 return Ok(Value::Array(out));
4180 }
4181 prev_page = Some(page);
4182 offset += page_size;
4183 }
4184}
4185
4186fn extract_page(response: &Value) -> Option<Vec<Value>> {
4187 if let Some(page) = response.as_array() {
4188 return Some(page.clone());
4189 }
4190 let object = response.as_object()?;
4191 for key in RPC_PAGE_LIST_KEYS {
4192 if let Some(page) = object.get(key).and_then(Value::as_array) {
4193 return Some(page.clone());
4194 }
4195 }
4196 None
4197}
4198
4199fn run_find_pdfs_command(
4200 client: &mut impl RpcCaller,
4201 collection: String,
4202 limit: usize,
4203) -> Result<(Value, JsonStyle), String> {
4204 let collection_key = resolve_collection(client, &collection)?;
4205 let response = client.call(
4206 "collections.getItems",
4207 Some(serde_json::json!({"key": collection_key})),
4208 )?;
4209 let items = collection_items(&response);
4210
4211 let mut missing = Vec::new();
4212 for item in &items {
4213 let Some(item_key) = item.get("key").and_then(Value::as_str) else {
4214 continue;
4215 };
4216 let attachments = client.call(
4217 "attachments.list",
4218 Some(serde_json::json!({"parentKey": item_key})),
4219 )?;
4220 if !has_pdf_attachment(&attachments) {
4221 missing.push(item.clone());
4222 }
4223 if limit > 0 && missing.len() >= limit {
4224 break;
4225 }
4226 }
4227
4228 let mut results = Vec::new();
4229 for item in &missing {
4230 let item_key = item
4231 .get("key")
4232 .and_then(Value::as_str)
4233 .ok_or_else(|| "missing item lacks key".to_string())?;
4234 let response = client.call(
4235 "attachments.findPDF",
4236 Some(serde_json::json!({"parentKey": item_key})),
4237 )?;
4238 let attachment = response.get("attachment").filter(|value| !value.is_null());
4239 results.push(serde_json::json!({
4240 "item_key": item_key,
4241 "title": item.get("title").cloned().unwrap_or(Value::Null),
4242 "found": attachment.is_some(),
4243 "attachment_key": attachment
4244 .and_then(|attachment| attachment.get("key"))
4245 .cloned()
4246 .unwrap_or(Value::Null),
4247 }));
4248 }
4249
4250 Ok((
4251 serde_json::json!({
4252 "scanned": items.len(),
4253 "attempted": missing.len(),
4254 "results": results,
4255 }),
4256 JsonStyle::Pretty,
4257 ))
4258}
4259
4260fn collection_items(response: &Value) -> Vec<Value> {
4261 if let Some(items) = response.get("items").and_then(Value::as_array) {
4262 return items.clone();
4263 }
4264 response.as_array().cloned().unwrap_or_default()
4265}
4266
4267fn has_pdf_attachment(attachments: &Value) -> bool {
4268 attachments
4269 .as_array()
4270 .is_some_and(|attachments| attachments.iter().any(is_pdf_attachment))
4271}
4272
4273fn is_pdf_attachment(attachment: &Value) -> bool {
4274 let content_type = attachment
4275 .get("contentType")
4276 .and_then(Value::as_str)
4277 .unwrap_or_default()
4278 .to_lowercase();
4279 let path = attachment
4280 .get("path")
4281 .and_then(Value::as_str)
4282 .unwrap_or_default()
4283 .to_lowercase();
4284 matches!(
4285 content_type.as_str(),
4286 "application/pdf" | "application/x-pdf"
4287 ) || path.ends_with(".pdf")
4288}
4289
4290fn call_json(
4291 client: &mut impl RpcCaller,
4292 method: &str,
4293 params: Option<Value>,
4294) -> Result<Value, String> {
4295 client.call(method, params)
4296}
4297
4298fn run_system_command(
4299 command: SystemCommand,
4300 client: &mut impl RpcCaller,
4301) -> Result<(Value, JsonStyle), String> {
4302 let value = match command {
4303 SystemCommand::Version { .. } => client.call("system.version", None)?,
4304 SystemCommand::Libraries { .. } => client.call("system.libraries", None)?,
4305 SystemCommand::LibraryStats { library, .. } => {
4306 let params = library.map(|id| serde_json::json!({"id": id}));
4307 client.call("system.libraryStats", params)?
4308 }
4309 SystemCommand::Schema { item_type, .. } => {
4310 if let Some(item_type) = item_type {
4311 let fields = client.call("system.itemFields", Some(serde_json::json!({"itemType": item_type})))?;
4312 let creators = client.call("system.creatorTypes", Some(serde_json::json!({"itemType": item_type})))?;
4313 let field_names: Vec<Value> = fields.as_array().unwrap_or(&vec![])
4314 .iter()
4315 .filter_map(|f| f.get("field").cloned())
4316 .collect();
4317 let creator_names: Vec<Value> = creators.as_array().unwrap_or(&vec![])
4318 .iter()
4319 .filter_map(|c| c.get("creatorType").cloned())
4320 .collect();
4321 serde_json::json!({
4322 "itemType": item_type,
4323 "fields": field_names,
4324 "creatorTypes": creator_names,
4325 })
4326 } else {
4327 let types = client.call("system.itemTypes", None)?;
4328 let type_names: Vec<Value> = types.as_array().unwrap_or(&vec![])
4329 .iter()
4330 .filter_map(|t| t.get("itemType").cloned())
4331 .collect();
4332 Value::Array(type_names)
4333 }
4334 }
4335 SystemCommand::CurrentCollection { .. } => client.call("system.currentCollection", None)?,
4336 SystemCommand::ListMethods { .. } => client.call("system.listMethods", None)?,
4337 SystemCommand::Describe { method, .. } => {
4338 let params = method.map(|method| serde_json::json!({"method": method}));
4339 client.call("system.describe", params)?
4340 }
4341 };
4342 Ok((value, JsonStyle::Pretty))
4343}
4344
4345fn run_items_command(
4346 command: ItemsCommand,
4347 client: &mut impl RpcCaller,
4348) -> Result<(Value, JsonStyle), String> {
4349 let (value, style) = match command {
4350 ItemsCommand::Add {
4351 doi,
4352 isbn,
4353 from_url,
4354 file,
4355 collection,
4356 dry_run,
4357 ..
4358 } => {
4359 if let Some(doi) = doi {
4360 run_add_identifier_command(client, "items.addByDOI", "doi", doi, collection, dry_run)?
4361 } else if let Some(isbn) = isbn {
4362 run_add_identifier_command(client, "items.addByISBN", "isbn", isbn, collection, dry_run)?
4363 } else if let Some(from_url) = from_url {
4364 run_add_identifier_command(client, "items.addByURL", "url", from_url, collection, dry_run)?
4365 } else if let Some(file) = file {
4366 let mut params = serde_json::json!({"path": zotero_path(&file)});
4367 maybe_insert_collection(client, &mut params, collection)?;
4368 run_mutation_command(client, "items.addFromFile", params, dry_run)?
4369 } else {
4370 return Err("INVALID_ARGS: provide one of --doi, --isbn, --from-url, or --file".into());
4371 }
4372 }
4373 ItemsCommand::Create {
4374 item_type,
4375 fields,
4376 dry_run,
4377 ..
4378 } => {
4379 let parsed_fields = parse_field_options(&fields)?;
4380 let mut params = serde_json::json!({"itemType": item_type});
4381 if !parsed_fields.is_empty() {
4382 if let Some(map) = params.as_object_mut() {
4383 map.insert("fields".to_string(), Value::Object(parsed_fields));
4384 }
4385 }
4386 run_mutation_command(client, "items.create", params, dry_run)?
4387 }
4388 ItemsCommand::Update {
4389 key,
4390 fields,
4391 dry_run,
4392 ..
4393 } => {
4394 let parsed_fields = parse_field_options(&fields)?;
4395 let mut params = serde_json::json!({"key": key});
4396 if !parsed_fields.is_empty() {
4397 if let Some(map) = params.as_object_mut() {
4398 map.insert("fields".to_string(), Value::Object(parsed_fields));
4399 }
4400 }
4401 run_mutation_command(client, "items.update", params, dry_run)?
4402 }
4403 ItemsCommand::Delete { key, dry_run, .. } => run_mutation_command(
4404 client,
4405 "items.delete",
4406 serde_json::json!({"key": key}),
4407 dry_run,
4408 )?,
4409 ItemsCommand::Trash {
4410 items, dry_run, ..
4411 } => {
4412 if items.len() == 1 {
4413 run_mutation_command(
4414 client,
4415 "items.trash",
4416 serde_json::json!({"key": items[0]}),
4417 dry_run,
4418 )?
4419 } else {
4420 run_mutation_command(
4421 client,
4422 "items.batchTrash",
4423 serde_json::json!({"keys": items}),
4424 dry_run,
4425 )?
4426 }
4427 }
4428 ItemsCommand::Restore { item, dry_run, .. } => run_mutation_command(
4429 client,
4430 "items.restore",
4431 serde_json::json!({"key": item}),
4432 dry_run,
4433 )?,
4434 ItemsCommand::MergeDuplicates { keys, dry_run, .. } => {
4435 if keys.len() < 2 {
4436 return Err("INVALID_ARGS: need at least 2 keys to merge".to_string());
4437 }
4438 run_mutation_command(
4439 client,
4440 "items.mergeDuplicates",
4441 serde_json::json!({"keys": keys}),
4442 dry_run,
4443 )?
4444 }
4445 ItemsCommand::AddRelated {
4446 key,
4447 target,
4448 dry_run,
4449 ..
4450 } => run_mutation_command(
4451 client,
4452 "items.addRelated",
4453 serde_json::json!({"key": key, "targetKey": target}),
4454 dry_run,
4455 )?,
4456 ItemsCommand::RemoveRelated {
4457 key,
4458 target,
4459 dry_run,
4460 ..
4461 } => run_mutation_command(
4462 client,
4463 "items.removeRelated",
4464 serde_json::json!({"key": key, "targetKey": target}),
4465 dry_run,
4466 )?,
4467 ItemsCommand::Get { item, .. } => (
4468 client.call("items.get", Some(serde_json::json!({"key": item})))?,
4469 JsonStyle::Pretty,
4470 ),
4471 ItemsCommand::List {
4472 limit,
4473 offset,
4474 sort,
4475 direction,
4476 trash,
4477 ..
4478 } => {
4479 if trash {
4480 let value = client.call(
4481 "items.getTrash",
4482 Some(serde_json::json!({"limit": limit, "offset": offset})),
4483 )?;
4484 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4485 } else {
4486 let mut params = serde_json::json!({
4487 "limit": limit,
4488 "offset": offset,
4489 "direction": direction,
4490 });
4491 if let (Some(sort), Some(map)) = (sort, params.as_object_mut()) {
4492 map.insert("sort".to_string(), Value::String(sort));
4493 }
4494 let value = client.call("items.list", Some(params))?;
4495 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4496 }
4497 }
4498 ItemsCommand::FindDuplicates { .. } => (
4499 client.call("items.findDuplicates", None)?,
4500 JsonStyle::Pretty,
4501 ),
4502 ItemsCommand::Recent {
4503 limit,
4504 offset,
4505 recent_type,
4506 ..
4507 } => {
4508 if recent_type != "added" && recent_type != "modified" {
4509 return Err(format!(
4510 "--type must be added or modified, got {recent_type:?}"
4511 ));
4512 }
4513 let value = client.call(
4514 "items.getRecent",
4515 Some(
4516 serde_json::json!({"limit": limit, "offset": offset, "type": recent_type}),
4517 ),
4518 )?;
4519 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
4520 }
4521 ItemsCommand::Fulltext { key, .. } => (
4522 client.call("items.getFullText", Some(serde_json::json!({"key": key})))?,
4523 JsonStyle::Pretty,
4524 ),
4525 ItemsCommand::Related { key, .. } => (
4526 normalize_list_envelope(
4527 client.call("items.getRelated", Some(serde_json::json!({"key": key})))?,
4528 "items",
4529 None,
4530 0,
4531 ),
4532 JsonStyle::Pretty,
4533 ),
4534 ItemsCommand::CitationKey { key, .. } => (
4535 client.call("items.citationKey", Some(serde_json::json!({"key": key})))?,
4536 JsonStyle::Pretty,
4537 ),
4538 };
4539 Ok((value, style))
4540}
4541
4542fn run_add_identifier_command(
4543 client: &mut impl RpcCaller,
4544 method: &str,
4545 param_name: &str,
4546 param_value: String,
4547 collection: Option<String>,
4548 dry_run: bool,
4549) -> Result<(Value, JsonStyle), String> {
4550 let mut params = Value::Object(serde_json::Map::from_iter([(
4551 param_name.to_string(),
4552 Value::String(param_value),
4553 )]));
4554 maybe_insert_collection(client, &mut params, collection)?;
4555 run_mutation_command(client, method, params, dry_run)
4556}
4557
4558fn run_mutation_command(
4559 client: &mut impl RpcCaller,
4560 method: &str,
4561 params: Value,
4562 dry_run: bool,
4563) -> Result<(Value, JsonStyle), String> {
4564 let value = if dry_run {
4565 serde_json::json!({
4566 "ok": true,
4567 "dryRun": true,
4568 "wouldCall": method,
4569 "wouldCallParams": params,
4570 })
4571 } else {
4572 client.call(method, Some(params))?
4573 };
4574 Ok((value, JsonStyle::PythonCompact))
4575}
4576
4577fn parse_field_options(fields: &[String]) -> Result<serde_json::Map<String, Value>, String> {
4578 let mut parsed = serde_json::Map::new();
4579 for field in fields {
4580 let (key, value) = field
4581 .split_once('=')
4582 .ok_or_else(|| format!("INVALID_ARGS: --field must be key=value, got: {field:?}"))?;
4583 parsed.insert(key.to_string(), Value::String(value.to_string()));
4584 }
4585 Ok(parsed)
4586}
4587
4588fn maybe_insert_collection(
4589 client: &mut impl RpcCaller,
4590 params: &mut Value,
4591 collection: Option<String>,
4592) -> Result<(), String> {
4593 let Some(collection) = collection else {
4594 return Ok(());
4595 };
4596 let collection = resolve_collection(client, &collection)?;
4597 let include = match &collection {
4598 Value::Null => false,
4599 Value::Number(number) => number.as_i64() != Some(0),
4600 _ => true,
4601 };
4602 if include {
4603 params
4604 .as_object_mut()
4605 .expect("mutation params are always objects")
4606 .insert("collection".to_string(), collection);
4607 }
4608 Ok(())
4609}
4610
4611fn run_settings_command(
4612 command: SettingsCommand,
4613 client: &mut impl RpcCaller,
4614) -> Result<(Value, JsonStyle), String> {
4615 let (value, style) = match command {
4616 SettingsCommand::Get { key, .. } => (
4617 client.call("settings.get", Some(serde_json::json!({"key": key})))?,
4618 JsonStyle::Pretty,
4619 ),
4620 SettingsCommand::List { .. } => (client.call("settings.getAll", None)?, JsonStyle::Pretty),
4621 SettingsCommand::Set {
4622 pairs,
4623 file,
4624 dry_run,
4625 ..
4626 } => {
4627 if let Some(file) = file {
4628 let raw = fs::read_to_string(&file)
4630 .map_err(|err| format!("INVALID_JSON: Could not read JSON: {err}"))?;
4631 let settings: Value = serde_json::from_str(&raw)
4632 .map_err(|err| format!("INVALID_JSON: Could not parse JSON: {err}"))?;
4633 if dry_run {
4634 (
4635 dry_run_value("settings.setAll", settings),
4636 JsonStyle::PythonCompact,
4637 )
4638 } else {
4639 (
4640 client.call("settings.setAll", Some(settings))?,
4641 JsonStyle::PythonCompact,
4642 )
4643 }
4644 } else if pairs.len() == 2 {
4645 let key = &pairs[0];
4647 let value = &pairs[1];
4648 let parsed_value = serde_json::from_str::<Value>(value)
4649 .unwrap_or(Value::String(value.clone()));
4650 let params = serde_json::json!({"key": key, "value": parsed_value});
4651 if dry_run {
4652 (
4653 dry_run_value("settings.set", params),
4654 JsonStyle::PythonCompact,
4655 )
4656 } else {
4657 (
4658 client.call("settings.set", Some(params))?,
4659 JsonStyle::PythonCompact,
4660 )
4661 }
4662 } else if pairs.len() > 2 && pairs.len() % 2 == 0 {
4663 let mut map = serde_json::Map::new();
4665 for chunk in pairs.chunks(2) {
4666 let parsed = serde_json::from_str::<Value>(&chunk[1])
4667 .unwrap_or(Value::String(chunk[1].clone()));
4668 map.insert(chunk[0].clone(), parsed);
4669 }
4670 let settings = Value::Object(map);
4671 if dry_run {
4672 (
4673 dry_run_value("settings.setAll", settings),
4674 JsonStyle::PythonCompact,
4675 )
4676 } else {
4677 (
4678 client.call("settings.setAll", Some(settings))?,
4679 JsonStyle::PythonCompact,
4680 )
4681 }
4682 } else {
4683 return Err(
4684 "INVALID_ARGS: provide key value pairs (even number of args) or --file".into(),
4685 );
4686 }
4687 }
4688 };
4689 Ok((value, style))
4690}
4691
4692fn run_tags_command(
4693 command: TagsCommand,
4694 client: &mut impl RpcCaller,
4695) -> Result<(Value, JsonStyle), String> {
4696 let (value, style) = match command {
4697 TagsCommand::List { limit, .. } => {
4698 let value = client.call("tags.list", Some(serde_json::json!({"limit": limit})))?;
4699 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
4700 }
4701 TagsCommand::Rename {
4702 old, new, dry_run, ..
4703 } => run_tag_mutation(
4704 client,
4705 "tags.rename",
4706 serde_json::json!({"oldName": old, "newName": new}),
4707 dry_run,
4708 )?,
4709 TagsCommand::Delete { tag, dry_run, .. } => run_tag_mutation(
4710 client,
4711 "tags.delete",
4712 serde_json::json!({"tag": tag}),
4713 dry_run,
4714 )?,
4715 TagsCommand::Add {
4716 keys, tags, dry_run, ..
4717 } => {
4718 if keys.len() == 1 {
4719 run_tag_mutation(
4720 client,
4721 "tags.add",
4722 serde_json::json!({"key": keys[0], "tags": tags}),
4723 dry_run,
4724 )?
4725 } else {
4726 run_tag_mutation(
4727 client,
4728 "tags.batchUpdate",
4729 serde_json::json!({"keys": keys, "add": tags}),
4730 dry_run,
4731 )?
4732 }
4733 }
4734 TagsCommand::Remove {
4735 keys, tags, dry_run, ..
4736 } => {
4737 if keys.len() == 1 {
4738 run_tag_mutation(
4739 client,
4740 "tags.remove",
4741 serde_json::json!({"key": keys[0], "tags": tags}),
4742 dry_run,
4743 )?
4744 } else {
4745 run_tag_mutation(
4746 client,
4747 "tags.batchUpdate",
4748 serde_json::json!({"keys": keys, "remove": tags}),
4749 dry_run,
4750 )?
4751 }
4752 }
4753 };
4754 Ok((value, style))
4755}
4756
4757fn run_tag_mutation(
4758 client: &mut impl RpcCaller,
4759 method: &str,
4760 params: Value,
4761 dry_run: bool,
4762) -> Result<(Value, JsonStyle), String> {
4763 if dry_run {
4764 Ok((dry_run_value(method, params), JsonStyle::PythonCompact))
4765 } else {
4766 Ok((client.call(method, Some(params))?, JsonStyle::PythonCompact))
4767 }
4768}
4769
4770fn dry_run_value(method: &str, params: Value) -> Value {
4771 serde_json::json!({
4772 "ok": true,
4773 "dryRun": true,
4774 "wouldCall": method,
4775 "wouldCallParams": params,
4776 })
4777}
4778
4779fn run_annotations_command(
4780 command: AnnotationsCommand,
4781 client: &mut impl RpcCaller,
4782) -> Result<(Value, JsonStyle), String> {
4783 let (value, style) = match command {
4784 AnnotationsCommand::List { parent, .. } => {
4785 let value = client.call(
4786 "annotations.list",
4787 Some(serde_json::json!({"parentKey": parent})),
4788 )?;
4789 let total = value
4790 .get("items")
4791 .and_then(Value::as_array)
4792 .map_or(0, |a| a.len()) as u64;
4793 (normalize_list_envelope(value, "items", Some(total), 0), JsonStyle::Pretty)
4794 }
4795 AnnotationsCommand::Create {
4796 parent,
4797 annotation_type,
4798 position,
4799 quote,
4800 page,
4801 sort_index,
4802 text,
4803 comment,
4804 color,
4805 dry_run,
4806 ..
4807 } => {
4808 let annotation_type = annotation_type.unwrap_or_else(|| "highlight".to_string());
4809 if !matches!(
4810 annotation_type.as_str(),
4811 "highlight" | "note" | "underline" | "image" | "ink"
4812 ) {
4813 return Err(format!(
4814 "INVALID_ARGS: --type must be highlight|note|underline|image|ink, got {annotation_type:?}"
4815 ));
4816 }
4817 let mut params = serde_json::Map::new();
4818 params.insert("parentKey".to_string(), Value::String(parent));
4819 params.insert("type".to_string(), Value::String(annotation_type.clone()));
4820 params.insert("color".to_string(), Value::String(color));
4821
4822 if let Some(ref quote_text) = quote {
4823 if !matches!(annotation_type.as_str(), "highlight" | "underline") {
4824 return Err(format!(
4825 "INVALID_ARGS: --quote is only valid for highlight|underline, got {annotation_type:?}"
4826 ));
4827 }
4828 params.insert("quote".to_string(), Value::String(quote_text.clone()));
4829 if let Some(page_idx) = page {
4830 params.insert(
4831 "pageIndex".to_string(),
4832 Value::Number(page_idx.into()),
4833 );
4834 }
4835 if let Some(raw) = position {
4837 let pos = serde_json::from_str::<Value>(&raw)
4838 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))?;
4839 validate_annotation_position(annotation_type.as_str(), &pos)?;
4840 params.insert("position".to_string(), pos);
4841 }
4842 } else {
4843 let position = position
4844 .ok_or_else(|| "INVALID_ARGS: --position JSON is required (or use --quote)".to_string())
4845 .and_then(|raw| {
4846 serde_json::from_str::<Value>(&raw)
4847 .map_err(|err| format!("INVALID_JSON: Could not parse --position: {err}"))
4848 })?;
4849 validate_annotation_position(annotation_type.as_str(), &position)?;
4850 params.insert("position".to_string(), position);
4851 }
4852
4853 if let Some(sort_index) = sort_index {
4854 params.insert(
4855 "sortIndex".to_string(),
4856 parse_annotation_sort_index(sort_index)?,
4857 );
4858 }
4859 if let Some(text) = text {
4860 params.insert("text".to_string(), Value::String(text));
4861 }
4862 if let Some(comment) = comment {
4863 params.insert("comment".to_string(), Value::String(comment));
4864 }
4865 run_mutating_command(client, "annotations.create", Value::Object(params), dry_run)?
4866 }
4867 AnnotationsCommand::Delete {
4868 annotation_key,
4869 dry_run,
4870 ..
4871 } => run_mutating_command(
4872 client,
4873 "annotations.delete",
4874 serde_json::json!({"key": annotation_key}),
4875 dry_run,
4876 )?,
4877 };
4878 Ok((value, style))
4879}
4880
4881fn validate_annotation_position(annotation_type: &str, position: &Value) -> Result<(), String> {
4882 position
4883 .get("pageIndex")
4884 .and_then(Value::as_i64)
4885 .filter(|value| *value >= 0)
4886 .ok_or_else(|| {
4887 "INVALID_ARGS: --position must include a non-negative integer pageIndex".to_string()
4888 })?;
4889
4890 if annotation_type == "ink" {
4891 let has_paths = position
4892 .get("paths")
4893 .and_then(Value::as_array)
4894 .is_some_and(|paths| !paths.is_empty());
4895 if !has_paths {
4896 return Err("INVALID_ARGS: ink --position must include non-empty paths".to_string());
4897 }
4898 return Ok(());
4899 }
4900
4901 let valid_rects = position
4902 .get("rects")
4903 .and_then(Value::as_array)
4904 .is_some_and(|rects| !rects.is_empty() && rects.iter().all(is_annotation_rect));
4905 if !valid_rects {
4906 return Err(
4907 "INVALID_ARGS: --position must include non-empty rects of [x1, y1, x2, y2]".to_string(),
4908 );
4909 }
4910 Ok(())
4911}
4912
4913fn is_annotation_rect(value: &Value) -> bool {
4914 value.as_array().is_some_and(|coords| {
4915 coords.len() == 4
4916 && coords
4917 .iter()
4918 .all(|coord| coord.as_f64().is_some_and(f64::is_finite))
4919 })
4920}
4921
4922fn parse_annotation_sort_index(raw: String) -> Result<Value, String> {
4923 let parsed = serde_json::from_str::<Value>(&raw).unwrap_or_else(|_| Value::String(raw));
4924 let valid = match &parsed {
4925 Value::Number(number) => number.as_f64().is_some_and(f64::is_finite),
4926 Value::String(value) => {
4927 is_zotero_pdf_sort_index(value.trim())
4928 || (!value.trim().is_empty()
4929 && value.trim().parse::<f64>().is_ok_and(f64::is_finite))
4930 }
4931 _ => false,
4932 };
4933 if valid {
4934 Ok(parsed)
4935 } else {
4936 Err(format!(
4937 "INVALID_ARGS: --sort-index must be a finite number or numeric string, got {parsed}"
4938 ))
4939 }
4940}
4941
/// Returns `true` when `value` has the form `DDDDD|DDDDDD|DDDDD`.
///
/// That is: exactly three `|`-separated segments of 5, 6 and 5 ASCII digits.
fn is_zotero_pdf_sort_index(value: &str) -> bool {
    const SEGMENT_WIDTHS: [usize; 3] = [5, 6, 5];

    let segments: Vec<&str> = value.split('|').collect();
    if segments.len() != SEGMENT_WIDTHS.len() {
        return false;
    }
    segments
        .iter()
        .zip(SEGMENT_WIDTHS.iter())
        .all(|(segment, width)| {
            segment.len() == *width && segment.bytes().all(|byte| byte.is_ascii_digit())
        })
}
4955
4956fn run_attachments_command(
4957 command: AttachmentsCommand,
4958 client: &mut impl RpcCaller,
4959) -> Result<(Value, JsonStyle), String> {
4960 let value = match command {
4961 AttachmentsCommand::List {
4962 parent,
4963 limit,
4964 offset,
4965 ..
4966 } => normalize_list_envelope(
4967 client.call(
4968 "attachments.list",
4969 Some(serde_json::json!({"parentKey": parent})),
4970 )?,
4971 "items",
4972 Some(limit),
4973 offset,
4974 ),
4975 AttachmentsCommand::Get { key, .. } => {
4976 client.call("attachments.get", Some(serde_json::json!({"key": key})))?
4977 }
4978 AttachmentsCommand::Fulltext { key, .. } => client.call(
4979 "attachments.getFulltext",
4980 Some(serde_json::json!({"key": key})),
4981 )?,
4982 AttachmentsCommand::Path { key, .. } => localize_attachment_path_response(
4983 client.call("attachments.getPath", Some(serde_json::json!({"key": key})))?,
4984 ),
4985 AttachmentsCommand::Add {
4986 parent,
4987 path,
4988 from_url,
4989 title,
4990 dry_run,
4991 ..
4992 } => {
4993 match (path, from_url) {
4994 (Some(p), None) => {
4995 let mut params = serde_json::json!({"parentKey": parent, "path": zotero_path(&p)});
4996 insert_optional_string(&mut params, "title", title);
4997 if dry_run {
4998 return Ok((
4999 dry_run_value("attachments.add", params),
5000 JsonStyle::PythonCompact,
5001 ));
5002 }
5003 return Ok((
5004 client.call("attachments.add", Some(params))?,
5005 JsonStyle::PythonCompact,
5006 ));
5007 }
5008 (None, Some(u)) => {
5009 let mut params = serde_json::json!({"parentKey": parent, "url": u});
5010 insert_optional_string(&mut params, "title", title);
5011 if dry_run {
5012 return Ok((
5013 dry_run_value("attachments.addByURL", params),
5014 JsonStyle::PythonCompact,
5015 ));
5016 }
5017 return Ok((
5018 client.call("attachments.addByURL", Some(params))?,
5019 JsonStyle::PythonCompact,
5020 ));
5021 }
5022 (Some(_), Some(_)) => {
5023 return Err("INVALID_ARGS: --path and --from-url are mutually exclusive".to_string());
5024 }
5025 (None, None) => {
5026 return Err("INVALID_ARGS: either --path or --from-url is required".to_string());
5027 }
5028 }
5029 }
5030 AttachmentsCommand::Delete { key, dry_run, .. } => {
5031 let params = serde_json::json!({"key": key});
5032 if dry_run {
5033 return Ok((
5034 dry_run_value("attachments.delete", params),
5035 JsonStyle::PythonCompact,
5036 ));
5037 }
5038 return Ok((
5039 client.call("attachments.delete", Some(params))?,
5040 JsonStyle::PythonCompact,
5041 ));
5042 }
5043 AttachmentsCommand::FindPdf { parent, .. } => client.call(
5044 "attachments.findPDF",
5045 Some(serde_json::json!({"parentKey": parent})),
5046 )?,
5047 };
5048 Ok((value, JsonStyle::Pretty))
5049}
5050
5051fn localize_attachment_path_response(mut value: Value) -> Value {
5052 if let Some(path) = value.get("path").and_then(Value::as_str) {
5053 let local = local_path_from_zotero_path(path);
5054 if let Some(map) = value.as_object_mut() {
5055 map.insert("path".to_string(), Value::String(local));
5056 }
5057 }
5058 value
5059}
5060
5061fn run_notes_command(
5062 command: NotesCommand,
5063 client: &mut impl RpcCaller,
5064) -> Result<(Value, JsonStyle), String> {
5065 let (value, style) = match command {
5066 NotesCommand::List {
5067 parent,
5068 limit,
5069 offset,
5070 ..
5071 } => {
5072 let value = client.call(
5073 "notes.list",
5074 Some(serde_json::json!({"parentKey": parent})),
5075 )?;
5076 (normalize_list_envelope(value, "items", Some(limit), offset), JsonStyle::Pretty)
5077 }
5078 NotesCommand::Get { note_key, .. } => {
5079 let value = client.call("notes.get", Some(serde_json::json!({"key": note_key})))?;
5080 (value, JsonStyle::Pretty)
5081 }
5082 NotesCommand::Create {
5083 parent,
5084 content,
5085 tags,
5086 dry_run,
5087 ..
5088 } => {
5089 let mut params = serde_json::Map::new();
5090 params.insert("parentKey".to_string(), Value::String(parent));
5091 params.insert("content".to_string(), Value::String(content));
5092 if !tags.is_empty() {
5093 params.insert(
5094 "tags".to_string(),
5095 Value::Array(tags.into_iter().map(Value::String).collect()),
5096 );
5097 }
5098 run_mutating_command(client, "notes.create", Value::Object(params), dry_run)?
5099 }
5100 NotesCommand::Update {
5101 note_key,
5102 content,
5103 dry_run,
5104 ..
5105 } => run_mutating_command(
5106 client,
5107 "notes.update",
5108 serde_json::json!({"key": note_key, "content": content}),
5109 dry_run,
5110 )?,
5111 NotesCommand::Delete {
5112 note_key, dry_run, ..
5113 } => {
5114 run_mutating_command(
5116 client,
5117 "items.delete",
5118 serde_json::json!({"key": note_key}),
5119 dry_run,
5120 )?
5121 }
5122 NotesCommand::Search { query, limit, .. } => {
5123 let value = client.call(
5124 "notes.search",
5125 Some(serde_json::json!({"query": query, "limit": limit})),
5126 )?;
5127 (normalize_list_envelope(value, "items", Some(limit), 0), JsonStyle::Pretty)
5128 }
5129 };
5130 Ok((value, style))
5131}
5132
5133fn run_mutating_command(
5134 client: &mut impl RpcCaller,
5135 method: &str,
5136 params: Value,
5137 dry_run: bool,
5138) -> Result<(Value, JsonStyle), String> {
5139 if dry_run {
5140 Ok((
5141 serde_json::json!({
5142 "ok": true,
5143 "dryRun": true,
5144 "wouldCall": method,
5145 "wouldCallParams": params,
5146 }),
5147 JsonStyle::PythonCompact,
5148 ))
5149 } else {
5150 client
5151 .call(method, Some(params))
5152 .map(|value| (value, JsonStyle::PythonCompact))
5153 }
5154}
5155
5156fn run_collections_command(
5157 command: CollectionsCommand,
5158 client: &mut impl RpcCaller,
5159) -> Result<(Value, JsonStyle), String> {
5160 let value = match command {
5161 CollectionsCommand::List { .. } => normalize_list_envelope(
5162 client.call("collections.list", None)?,
5163 "items",
5164 None,
5165 0,
5166 ),
5167 CollectionsCommand::Tree { .. } => client.call("collections.tree", None)?,
5168 CollectionsCommand::Get { name_or_id, .. } => {
5169 let key = resolve_collection(client, &name_or_id)?;
5170 client.call("collections.get", Some(serde_json::json!({"key": key})))?
5171 }
5172 CollectionsCommand::GetItems {
5173 name_or_id,
5174 limit,
5175 offset,
5176 ..
5177 } => {
5178 let key = resolve_collection(client, &name_or_id)?;
5179 let mut params = serde_json::json!({"key": key});
5180 if let Some(map) = params.as_object_mut() {
5181 if let Some(limit) = limit {
5182 map.insert("limit".to_string(), Value::Number(limit.into()));
5183 }
5184 if offset > 0 {
5185 map.insert("offset".to_string(), Value::Number(offset.into()));
5186 }
5187 }
5188 normalize_list_envelope(
5189 client.call("collections.getItems", Some(params))?,
5190 "items",
5191 limit,
5192 offset,
5193 )
5194 }
5195 CollectionsCommand::Stats { name_or_id, .. } => {
5196 let key = resolve_collection(client, &name_or_id)?;
5197 client.call("collections.stats", Some(serde_json::json!({"key": key})))?
5198 }
5199 CollectionsCommand::Rename {
5200 old_name,
5201 new_name,
5202 dry_run,
5203 ..
5204 } => {
5205 let key = resolve_mutable_collection(client, &old_name, "rename")?;
5206 let params = serde_json::json!({"key": key, "name": new_name});
5207 if dry_run {
5208 return Ok((
5209 dry_run_value("collections.rename", params),
5210 JsonStyle::PythonCompact,
5211 ));
5212 }
5213 return Ok((
5214 client.call("collections.rename", Some(params))?,
5215 JsonStyle::PythonCompact,
5216 ));
5217 }
5218 CollectionsCommand::Create {
5219 name,
5220 parent,
5221 dry_run,
5222 ..
5223 } => {
5224 let mut params = serde_json::json!({"name": name});
5225 if let Some(parent) = parent {
5226 let parent_key = resolve_mutable_collection(client, &parent, "use as parent")?;
5227 if let Some(map) = params.as_object_mut() {
5228 map.insert("parentKey".to_string(), parent_key);
5229 }
5230 }
5231 if dry_run {
5232 return Ok((
5233 dry_run_value("collections.create", params),
5234 JsonStyle::PythonCompact,
5235 ));
5236 }
5237 return Ok((
5238 client.call("collections.create", Some(params))?,
5239 JsonStyle::PythonCompact,
5240 ));
5241 }
5242 CollectionsCommand::Delete {
5243 name_or_id,
5244 dry_run,
5245 ..
5246 } => {
5247 let key = resolve_mutable_collection(client, &name_or_id, "delete")?;
5248 let params = serde_json::json!({"key": key});
5249 if dry_run {
5250 return Ok((
5251 dry_run_value("collections.delete", params),
5252 JsonStyle::PythonCompact,
5253 ));
5254 }
5255 return Ok((
5256 client.call("collections.delete", Some(params))?,
5257 JsonStyle::PythonCompact,
5258 ));
5259 }
5260 CollectionsCommand::AddItems {
5261 collection,
5262 item_keys,
5263 dry_run,
5264 ..
5265 } => {
5266 let key = resolve_mutable_collection(client, &collection, "add to")?;
5267 let params = serde_json::json!({"key": key, "keys": item_keys});
5268 if dry_run {
5269 return Ok((
5270 dry_run_value("collections.addItems", params),
5271 JsonStyle::PythonCompact,
5272 ));
5273 }
5274 return Ok((
5275 client.call("collections.addItems", Some(params))?,
5276 JsonStyle::PythonCompact,
5277 ));
5278 }
5279 CollectionsCommand::RemoveItems {
5280 collection,
5281 item_keys,
5282 dry_run,
5283 ..
5284 } => {
5285 let key = resolve_mutable_collection(client, &collection, "operate on")?;
5286 let params = serde_json::json!({"key": key, "keys": item_keys});
5287 if dry_run {
5288 return Ok((
5289 dry_run_value("collections.removeItems", params),
5290 JsonStyle::PythonCompact,
5291 ));
5292 }
5293 return Ok((
5294 client.call("collections.removeItems", Some(params))?,
5295 JsonStyle::PythonCompact,
5296 ));
5297 }
5298 };
5299 Ok((value, JsonStyle::Pretty))
5300}
5301
5302fn resolve_export_keys(
5303 client: &mut impl RpcCaller,
5304 mut keys: Vec<String>,
5305 collection: Option<String>,
5306) -> Result<Vec<String>, String> {
5307 if let Some(name) = collection {
5308 let col_key = resolve_collection(client, &name)?;
5309 let response = client.call(
5310 "collections.getItems",
5311 Some(serde_json::json!({"key": col_key})),
5312 )?;
5313 let items = collection_items(&response);
5314 for item in items {
5315 if let Some(key) = item.get("key").and_then(Value::as_str) {
5316 if !keys.contains(&key.to_string()) {
5317 keys.push(key.to_string());
5318 }
5319 }
5320 }
5321 }
5322 if keys.is_empty() {
5323 return Err("No item keys provided. Pass positional keys and/or --collection.".to_string());
5324 }
5325 Ok(keys)
5326}
5327
5328fn run_export(args: ExportArgs, client: &mut impl RpcCaller) -> Result<String, String> {
5329 let keys = resolve_export_keys(client, args.keys, args.collection)?;
5330 match args.format.as_str() {
5331 "bibtex" => run_export_content_command(client, "export.bibtex", keys),
5332 "ris" => run_export_content_command(client, "export.ris", keys),
5333 "csl-json" => {
5334 let response =
5335 client.call("export.cslJson", Some(serde_json::json!({"keys": keys})))?;
5336 if let Some(content) = response.get("content") {
5337 format_json(content, JsonStyle::Pretty)
5338 } else {
5339 format_json(&response, JsonStyle::PythonCompact)
5340 }
5341 }
5342 "bibliography" => {
5343 let response = client.call(
5344 "export.bibliography",
5345 Some(serde_json::json!({"keys": keys, "style": args.style})),
5346 )?;
5347 if let Some(object) = response.as_object() {
5348 let field = if args.html { "html" } else { "text" };
5349 if object.contains_key("html") || object.contains_key("text") {
5350 return raw_value_output(
5351 object.get(field).unwrap_or(&Value::String(String::new())),
5352 );
5353 }
5354 }
5355 format_json(&response, JsonStyle::PythonCompact)
5356 }
5357 other => Err(format!(
5358 "INVALID_ARGS: unknown format {other:?}, expected bibtex/ris/csl-json/bibliography"
5359 )),
5360 }
5361}
5362
5363fn run_export_content_command(
5364 client: &mut impl RpcCaller,
5365 method: &str,
5366 keys: Vec<String>,
5367) -> Result<String, String> {
5368 let response = client.call(method, Some(serde_json::json!({"keys": keys})))?;
5369 if let Some(content) = response.get("content") {
5370 raw_value_output(content)
5371 } else {
5372 format_json(&response, JsonStyle::PythonCompact)
5373 }
5374}
5375
5376fn raw_value_output(value: &Value) -> Result<String, String> {
5377 let mut out = match value {
5378 Value::Null => String::new(),
5379 Value::String(content) => content.clone(),
5380 other => to_python_repr(other),
5381 };
5382 out.push('\n');
5383 Ok(out)
5384}
5385
5386fn to_python_repr(value: &Value) -> String {
5387 match value {
5388 Value::Null => "None".to_string(),
5389 Value::Bool(value) => {
5390 if *value {
5391 "True".to_string()
5392 } else {
5393 "False".to_string()
5394 }
5395 }
5396 Value::Number(value) => value.to_string(),
5397 Value::String(value) => format!("'{}'", value.replace('\\', "\\\\").replace('\'', "\\'")),
5398 Value::Array(values) => {
5399 let inner = values
5400 .iter()
5401 .map(to_python_repr)
5402 .collect::<Vec<_>>()
5403 .join(", ");
5404 format!("[{inner}]")
5405 }
5406 Value::Object(entries) => {
5407 let inner = entries
5408 .iter()
5409 .map(|(key, value)| {
5410 format!("'{}': {}", key.replace('\'', "\\'"), to_python_repr(value))
5411 })
5412 .collect::<Vec<_>>()
5413 .join(", ");
5414 format!("{{{inner}}}")
5415 }
5416 }
5417}
5418
5419fn resolve_collection(client: &mut impl RpcCaller, name_or_id: &str) -> Result<Value, String> {
5420 let trimmed = name_or_id.trim();
5421 if let Ok(id) = trimmed.parse::<i64>() {
5422 return Ok(Value::Number(id.into()));
5423 }
5424
5425 let collections = client.call("collections.list", None)?;
5426 let items = collections
5427 .get("items")
5428 .and_then(Value::as_array)
5429 .or_else(|| collections.as_array())
5430 .ok_or_else(|| "collections.list returned non-array result".to_string())?;
5431
5432 if let Some(collection) = items
5433 .iter()
5434 .find(|collection| collection.get("key").and_then(Value::as_str) == Some(trimmed))
5435 {
5436 return collection_key(collection);
5437 }
5438
5439 let exact = items
5440 .iter()
5441 .filter(|collection| collection.get("name").and_then(Value::as_str) == Some(trimmed))
5442 .collect::<Vec<_>>();
5443 if exact.len() == 1 {
5444 return collection_key(exact[0]);
5445 }
5446
5447 let needle = normalize_collection_name(trimmed);
5448 let fuzzy = items
5449 .iter()
5450 .filter(|collection| {
5451 collection
5452 .get("name")
5453 .and_then(Value::as_str)
5454 .map(normalize_collection_name)
5455 .is_some_and(|name| name.contains(&needle))
5456 })
5457 .collect::<Vec<_>>();
5458
5459 match fuzzy.len() {
5460 1 => collection_key(fuzzy[0]),
5461 0 => Err(format!(
5462 "COLLECTION_NOT_FOUND: No collection named {trimmed:?}"
5463 )),
5464 _ => Err(format!(
5465 "COLLECTION_AMBIGUOUS: Multiple collections match {trimmed:?}"
5466 )),
5467 }
5468}
5469
5470fn collection_key(collection: &Value) -> Result<Value, String> {
5471 collection
5472 .get("key")
5473 .cloned()
5474 .ok_or_else(|| "collection result is missing key".to_string())
5475}
5476
5477fn resolve_mutable_collection(
5478 client: &mut impl RpcCaller,
5479 name_or_id: &str,
5480 operation: &str,
5481) -> Result<Value, String> {
5482 let key = resolve_collection(client, name_or_id)?;
5483 if key.as_i64() == Some(0) {
5484 return Err(format!(
5485 "COLLECTION_NOT_FOUND: {name_or_id:?} resolved to library root (cannot {operation})"
5486 ));
5487 }
5488 Ok(key)
5489}
5490
5491fn insert_optional_string(value: &mut Value, key: &str, maybe: Option<String>) {
5492 if let (Some(map), Some(content)) = (value.as_object_mut(), maybe) {
5493 map.insert(key.to_string(), Value::String(content));
5494 }
5495}
5496
/// Normalizes a collection name for fuzzy comparison.
///
/// Leading and trailing whitespace is dropped, every internal whitespace run
/// becomes a single space, and the result is lowercased.
fn normalize_collection_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    // Lowercase the joined text as a whole, not word by word.
    collapsed.to_lowercase()
}
5503
5504fn format_json(value: &Value, style: JsonStyle) -> Result<String, String> {
5505 let mut out = match style {
5506 JsonStyle::PythonCompact => to_python_compact_json(value),
5507 JsonStyle::Pretty => serde_json::to_string_pretty(value).map_err(|err| err.to_string())?,
5508 };
5509 out.push('\n');
5510 Ok(out)
5511}
5512
5513fn to_python_compact_json(value: &Value) -> String {
5514 match value {
5515 Value::Null => "null".to_string(),
5516 Value::Bool(value) => value.to_string(),
5517 Value::Number(value) => value.to_string(),
5518 Value::String(value) => {
5519 serde_json::to_string(value).expect("string serialization cannot fail")
5520 }
5521 Value::Array(values) => {
5522 let inner = values
5523 .iter()
5524 .map(to_python_compact_json)
5525 .collect::<Vec<_>>()
5526 .join(", ");
5527 format!("[{inner}]")
5528 }
5529 Value::Object(entries) => {
5530 let inner = entries
5531 .iter()
5532 .map(|(key, value)| {
5533 let key = serde_json::to_string(key).expect("string serialization cannot fail");
5534 format!("{key}: {}", to_python_compact_json(value))
5535 })
5536 .collect::<Vec<_>>()
5537 .join(", ");
5538 format!("{{{inner}}}")
5539 }
5540 }
5541}