icechunk-python 0.1.0-alpha.1

Transactional storage engine for Zarr designed for use on cloud object storage
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
# module
from collections.abc import AsyncGenerator, Iterable
from typing import Any, Self

from zarr.abc.store import AccessMode, ByteRangeRequest, Store
from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.common import AccessModeLiteral, BytesLike
from zarr.core.sync import SyncMixin

from ._icechunk_python import (
    PyIcechunkStore,
    S3Credentials,
    SnapshotMetadata,
    StorageConfig,
    StoreConfig,
    VirtualRefConfig,
    __version__,
    pyicechunk_store_create,
    pyicechunk_store_exists,
    pyicechunk_store_from_bytes,
    pyicechunk_store_open_existing,
)

# Names exported by `from <this package> import *`: the Python-level
# IcechunkStore defined below plus the configuration/metadata types and
# version string imported from the `_icechunk_python` extension module.
__all__ = [
    "__version__",
    "IcechunkStore",
    "StorageConfig",
    "S3Credentials",
    "SnapshotMetadata",
    "StoreConfig",
    "VirtualRefConfig",
]


class IcechunkStore(Store, SyncMixin):
    """Zarr ``Store`` that delegates every operation to a ``PyIcechunkStore``.

    Instances are normally obtained through the :meth:`open`, :meth:`create`
    or :meth:`open_existing` class methods rather than through the constructor.
    """

    # Handle to the underlying store exposed by the `_icechunk_python` extension.
    _store: PyIcechunkStore

    @classmethod
    async def open(cls, *args: Any, **kwargs: Any) -> Self:
        """Open an existing store or create a new one, depending on ``mode``.

        Parameters
        ----------
        *args
            Forwarded to :meth:`create` / :meth:`open_existing` after
            ``storage`` and ``mode``.
        **kwargs
            ``storage`` (required) is the storage configuration; ``mode``
            (optional, defaults to ``"r"``) is the access mode literal. Any
            remaining keyword arguments are forwarded to :meth:`create` /
            :meth:`open_existing`.

        Returns
        -------
        A store that is already marked as open.

        Raises
        ------
        ValueError
            If ``storage`` is missing, or if an overwriting mode is requested
            for a location where a store already exists.
        """
        # FIXME: Better handle the open method based on the access mode the user passed in along with the kwargs
        # https://github.com/zarr-developers/zarr-python/blob/c878da2a900fc621ff23cc6d84d45cd3cb26cbed/src/zarr/abc/store.py#L24-L30
        mode = kwargs.pop("mode", "r")
        access_mode = AccessMode.from_literal(mode)

        if "storage" not in kwargs:
            raise ValueError(
                "Storage configuration is required. Pass a Storage object to construct an IcechunkStore"
            )
        storage = kwargs.pop("storage")

        store_exists = await pyicechunk_store_exists(storage)

        if access_mode.overwrite and store_exists:
            raise ValueError(
                "Store already exists and overwrite is not allowed for IcechunkStore"
            )

        # A new store is only created when the mode permits it and nothing
        # exists yet; every other combination opens the existing store.
        may_create = access_mode.overwrite or access_mode.create or access_mode.update
        if may_create and not store_exists:
            store = await cls.create(storage, mode, *args, **kwargs)
        else:
            store = await cls.open_existing(storage, mode, *args, **kwargs)

        # We don't want to call _open() because icechunk handles the opening, etc.
        # If we have gotten this far we can mark it as open.
        store._is_open = True

        return store

    def __init__(
        self,
        store: PyIcechunkStore,
        mode: AccessModeLiteral = "r",
        *args: Any,
        **kwargs: Any,
    ):
        """Create a new IcechunkStore. This should not be called directly, instead use the create or open_existing class methods.

        Raises
        ------
        ValueError
            If ``store`` is ``None``.
        """
        super().__init__(mode, *args, **kwargs)
        if store is None:
            raise ValueError(
                "An IcechunkStore should not be created with the default constructor, instead use either the create or open_existing class methods."
            )
        self._store = store

    @classmethod
    async def open_existing(
        cls,
        storage: StorageConfig,
        mode: AccessModeLiteral = "r",
        config: StoreConfig | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> Self:
        """Open an existing IcechunkStore from the given storage.

        If there is no store at the given location, an error will be raised.

        It is recommended to use the cached storage option for better performance. If cached=True,
        this will be configured automatically with the provided storage_config as the underlying
        storage backend.

        If opened with AccessModeLiteral "r", the store will be read-only. Otherwise the store will be writable.

        Extra positional and keyword arguments are forwarded to the constructor.
        """
        config = config or StoreConfig()
        read_only = mode == "r"
        store = await pyicechunk_store_open_existing(
            storage, read_only=read_only, config=config
        )
        # Forward the extras themselves; passing `args=args, kwargs=kwargs`
        # would hand the constructor two keyword arguments literally named
        # "args" and "kwargs".
        return cls(store, mode, *args, **kwargs)

    @classmethod
    async def create(
        cls,
        storage: StorageConfig,
        mode: AccessModeLiteral = "w",
        config: StoreConfig | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> Self:
        """Create a new IcechunkStore with the given storage configuration.

        If a store already exists at the given location, an error will be raised.

        It is recommended to use the cached storage option for better performance. If cached=True,
        this will be configured automatically with the provided storage_config as the underlying
        storage backend.

        Extra positional and keyword arguments are forwarded to the constructor.
        """
        config = config or StoreConfig()
        store = await pyicechunk_store_create(storage, config=config)
        # See open_existing: forward the extras rather than wrapping them.
        return cls(store, mode, *args, **kwargs)

    def with_mode(self, mode: AccessModeLiteral) -> Self:
        """
        Return a new store of the same type pointing to the same location with a new mode.

        The returned Store is not automatically opened. Call :meth:`Store.open` before
        using.

        Parameters
        ----------
        mode: AccessModeLiteral
            The new mode to use.

        Returns
        -------
        store:
            A new store of the same type with the new mode.

        """
        read_only = mode == "r"
        new_store = self._store.with_mode(read_only)
        return self.__class__(new_store, mode=mode)

    def __eq__(self, value: object) -> bool:
        """Two stores are equal when they share a class and their underlying stores compare equal."""
        if not isinstance(value, self.__class__):
            return False
        return self._store == value._store

    def __getstate__(self) -> object:
        """Return the picklable state: the serialized underlying store and the access mode."""
        store_repr = self._store.as_bytes()
        return {"store": store_repr, "mode": self.mode}

    def __setstate__(self, state: Any) -> None:
        """Rebuild the store from the state produced by :meth:`__getstate__`."""
        store_repr = state["store"]
        mode = state["mode"]
        # `self.mode` (what __getstate__ saved) is the base class's access-mode
        # object rather than a plain literal, so comparing it to "r" directly
        # would never match. Normalise to an AccessMode first; plain literals
        # are still accepted.
        # NOTE(review): relies on zarr's AccessMode exposing `readonly` — confirm.
        access_mode = (
            mode if isinstance(mode, AccessMode) else AccessMode.from_literal(mode)
        )
        is_read_only = access_mode.readonly
        self._store = pyicechunk_store_from_bytes(store_repr, is_read_only)
        # __init__ is bypassed when unpickling, so restore the base-class state here.
        # NOTE(review): assumes the zarr Store base keeps its mode in `_mode` — confirm.
        self._mode = access_mode
        self._is_open = True

    @property
    def snapshot_id(self) -> str:
        """Return the current snapshot id."""
        return self._store.snapshot_id

    def change_set_bytes(self) -> bytes:
        """Return the underlying store's change set as bytes."""
        return self._store.change_set_bytes()

    @property
    def branch(self) -> str | None:
        """Return the current branch name."""
        return self._store.branch

    async def checkout(
        self,
        snapshot_id: str | None = None,
        branch: str | None = None,
        tag: str | None = None,
    ) -> None:
        """Checkout a branch, tag, or specific snapshot.

        Exactly one of ``snapshot_id``, ``branch`` or ``tag`` must be given.

        Raises
        ------
        ValueError
            If none, or more than one, of the three arguments is specified.
        """
        specified = sum(ref is not None for ref in (snapshot_id, branch, tag))
        if specified > 1:
            raise ValueError(
                "only one of snapshot_id, branch, or tag may be specified"
            )
        if specified == 0:
            raise ValueError("a snapshot_id, branch, or tag must be specified")

        if snapshot_id is not None:
            return await self._store.checkout_snapshot(snapshot_id)
        if branch is not None:
            return await self._store.checkout_branch(branch)
        return await self._store.checkout_tag(tag)

    async def commit(self, message: str) -> str:
        """Commit any uncommitted changes to the store.

        This will create a new snapshot on the current branch and return
        the snapshot id.
        """
        return await self._store.commit(message)

    async def distributed_commit(
        self, message: str, other_change_set_bytes: list[bytes]
    ) -> str:
        """Commit with ``message``, forwarding ``other_change_set_bytes`` to the underlying store.

        NOTE(review): ``other_change_set_bytes`` is presumably the output of
        :meth:`change_set_bytes` from other store instances — confirm.
        """
        return await self._store.distributed_commit(message, other_change_set_bytes)

    @property
    def has_uncommitted_changes(self) -> bool:
        """Return True if there are uncommitted changes to the store"""
        return self._store.has_uncommitted_changes

    async def reset(self) -> None:
        """Discard any uncommitted changes and reset to the previous snapshot state."""
        return await self._store.reset()

    async def new_branch(self, branch_name: str) -> str:
        """Create a new branch from the current snapshot. This requires having no uncommitted changes."""
        return await self._store.new_branch(branch_name)

    async def tag(self, tag_name: str, snapshot_id: str) -> None:
        """Tag an existing snapshot with a given name."""
        return await self._store.tag(tag_name, snapshot_id=snapshot_id)

    def ancestry(self) -> AsyncGenerator[SnapshotMetadata, None]:
        """Get the list of parents of the current version.

        Returns
        -------
        AsyncGenerator[SnapshotMetadata, None]
        """
        return self._store.ancestry()

    async def empty(self) -> bool:
        """Check if the store is empty."""
        return await self._store.empty()

    async def clear(self) -> None:
        """Clear the store."""
        return await self._store.clear()

    async def get(
        self,
        key: str,
        prototype: BufferPrototype,
        byte_range: tuple[int | None, int | None] | None = None,
    ) -> Buffer | None:
        """Retrieve the value associated with a given key.

        Parameters
        ----------
        key : str
        prototype : BufferPrototype
            Used to build the returned buffer from the raw bytes.
        byte_range : tuple[int, Optional[int]], optional

        Returns
        -------
        Buffer, or None when the underlying store raises ``ValueError`` for the key.
        """
        try:
            result = await self._store.get(key, byte_range)
        except ValueError:
            # Zarr python expects None to be returned if the key does not exist
            # but an IcechunkStore returns an error if the key does not exist
            return None

        return prototype.buffer.from_bytes(result)

    async def get_partial_values(
        self,
        prototype: BufferPrototype,
        key_ranges: Iterable[tuple[str, ByteRangeRequest]],
    ) -> list[Buffer | None]:
        """Retrieve possibly partial values from given key_ranges.

        Parameters
        ----------
        prototype : BufferPrototype
            Used to build the returned buffers from the raw bytes.
        key_ranges : Iterable[tuple[str, tuple[int | None, int | None]]]
            Ordered set of key, range pairs, a key may occur multiple times with different ranges

        Returns
        -------
        list of values, in the order of the key_ranges, may contain null/none for missing keys
        """
        # NOTE: pyo3 has no implicit conversion from an Iterable to a rust iterable. So we convert it
        # to a list here first. Possible opportunity for optimization.
        result = await self._store.get_partial_values(list(key_ranges))
        # Missing keys are passed through as None instead of being handed to from_bytes.
        return [
            None if r is None else prototype.buffer.from_bytes(r) for r in result
        ]

    async def exists(self, key: str) -> bool:
        """Check if a key exists in the store.

        Parameters
        ----------
        key : str

        Returns
        -------
        bool
        """
        return await self._store.exists(key)

    @property
    def supports_writes(self) -> bool:
        """Does the store support writes?"""
        return self._store.supports_writes

    async def set(self, key: str, value: Buffer) -> None:
        """Store a (key, value) pair.

        Parameters
        ----------
        key : str
        value : Buffer
        """
        return await self._store.set(key, value.to_bytes())

    async def set_if_not_exists(self, key: str, value: Buffer) -> None:
        """
        Store a key to ``value`` if the key is not already present.

        Parameters
        -----------
        key : str
        value : Buffer
        """
        return await self._store.set_if_not_exists(key, value.to_bytes())

    async def set_virtual_ref(
        self, key: str, location: str, *, offset: int, length: int
    ) -> None:
        """Store a virtual reference to a chunk.

        Parameters
        ----------
        key : str
            The chunk to store the reference under. This is the fully qualified zarr key eg: 'array/c/0/0/0'
        location : str
            The location of the chunk in storage. This is absolute path to the chunk in storage eg: 's3://bucket/path/to/file.nc'
        offset : int
            The offset in bytes from the start of the file location in storage the chunk starts at
        length : int
            The length of the chunk in bytes, measured from the given offset
        """
        return await self._store.set_virtual_ref(key, location, offset, length)

    async def delete(self, key: str) -> None:
        """Remove a key from the store

        Parameters
        ----------
        key : str
        """
        return await self._store.delete(key)

    @property
    def supports_partial_writes(self) -> bool:
        """Does the store support partial writes?"""
        return self._store.supports_partial_writes

    async def set_partial_values(
        self, key_start_values: Iterable[tuple[str, int, BytesLike]]
    ) -> None:
        """Store values at a given key, starting at byte range_start.

        Parameters
        ----------
        key_start_values : list[tuple[str, int, BytesLike]]
            set of key, range_start, values triples, a key may occur multiple times with different
            range_starts, range_starts (considering the length of the respective values) must not
            specify overlapping ranges for the same key
        """
        # NOTE: pyo3 has no implicit conversion from an Iterable to a rust iterable. So we convert it
        # to a list here first. Possible opportunity for optimization.
        return await self._store.set_partial_values(list(key_start_values))

    @property
    def supports_listing(self) -> bool:
        """Does the store support listing?"""
        return self._store.supports_listing

    @property
    def supports_deletes(self) -> bool:
        """Does the store support deletes?"""
        return self._store.supports_deletes

    def list(self) -> AsyncGenerator[str, None]:
        """Retrieve all keys in the store.

        Returns
        -------
        AsyncGenerator[str, None]
        """
        # The zarr spec specifies that this and other
        # listing methods should not be async, so we need to
        # wrap the async method in a sync method.
        return self._store.list()

    def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
        """Retrieve all keys in the store with a given prefix.

        Parameters
        ----------
        prefix : str

        Returns
        -------
        AsyncGenerator[str, None]
        """
        # The zarr spec specifies that this and other
        # listing methods should not be async, so we need to
        # wrap the async method in a sync method.
        return self._store.list_prefix(prefix)

    def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
        """
        Retrieve all keys and prefixes with a given prefix and which do not contain the character
        “/” after the given prefix.

        Parameters
        ----------
        prefix : str

        Returns
        -------
        AsyncGenerator[str, None]
        """
        # The zarr spec specifies that this and other
        # listing methods should not be async, so we need to
        # wrap the async method in a sync method.
        return self._store.list_dir(prefix)