Skip to content
Open
1 change: 1 addition & 0 deletions changes/3998.misc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Centralized JSON document I/O behind free functions in `zarr.core._json` and removed the unused private `Store._get_bytes`/`_get_json` methods and their per-store overrides.
208 changes: 0 additions & 208 deletions src/zarr/abc/store.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
from __future__ import annotations

import asyncio
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import partial
from itertools import starmap
from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable

from zarr.core.sync import sync

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, AsyncIterator, Iterable, Sequence
from types import TracebackType
Expand Down Expand Up @@ -219,211 +216,6 @@ async def get(
"""
...

async def _get_bytes(
    self, key: str, *, prototype: BufferPrototype, byte_range: ByteRequest | None = None
) -> bytes:
    """
    Retrieve raw bytes from the store asynchronously.

    This is a private convenience method that wraps ``get()`` and converts the
    result to bytes. Use this when you need the raw byte content of a stored value.

    Parameters
    ----------
    key : str
        The key identifying the data to retrieve.
    prototype : BufferPrototype
        The buffer prototype to use for reading the data.
    byte_range : ByteRequest, optional
        If specified, only retrieve a portion of the stored data.
        Can be a ``RangeByteRequest``, ``OffsetByteRequest``, or ``SuffixByteRequest``.

    Returns
    -------
    bytes
        The raw bytes stored at the given key.

    Raises
    ------
    FileNotFoundError
        If the key does not exist in the store.

    See Also
    --------
    get : Lower-level method that returns a Buffer object.
    _get_bytes_sync : Synchronous version of this method.
    _get_json : Asynchronous method for retrieving and parsing JSON data.

    Examples
    --------
    >>> store = await MemoryStore.open()
    >>> await store.set("data", Buffer.from_bytes(b"hello world"))
    >>> data = await store._get_bytes("data", prototype=default_buffer_prototype())
    >>> print(data)
    b'hello world'
    """
    buffer = await self.get(key, prototype, byte_range)
    if buffer is None:
        # ``get`` reports a missing key as ``None``; this helper raises instead.
        raise FileNotFoundError(key)
    return buffer.to_bytes()

def _get_bytes_sync(
    self, key: str = "", *, prototype: BufferPrototype, byte_range: ByteRequest | None = None
) -> bytes:
    """
    Retrieve raw bytes from the store synchronously.

    This is a synchronous wrapper around ``_get_bytes()``: it runs that
    coroutine to completion via ``sync``. It should only be called from
    non-async code. For async contexts, use ``_get_bytes()`` instead.

    Parameters
    ----------
    key : str, optional
        The key identifying the data to retrieve. Defaults to an empty string.
    prototype : BufferPrototype
        The buffer prototype to use for reading the data.
    byte_range : ByteRequest, optional
        If specified, only retrieve a portion of the stored data.
        Can be a ``RangeByteRequest``, ``OffsetByteRequest``, or ``SuffixByteRequest``.

    Returns
    -------
    bytes
        The raw bytes stored at the given key.

    Raises
    ------
    FileNotFoundError
        If the key does not exist in the store.

    Warnings
    --------
    Do not call this method from async functions. Use ``_get_bytes()`` instead
    to avoid blocking the event loop.

    See Also
    --------
    _get_bytes : Asynchronous version of this method.
    _get_json_sync : Synchronous method for retrieving and parsing JSON data.

    Examples
    --------
    >>> store = MemoryStore()
    >>> sync(store.set("data", Buffer.from_bytes(b"hello world")))
    >>> data = store._get_bytes_sync("data", prototype=default_buffer_prototype())
    >>> print(data)
    b'hello world'
    """
    return sync(self._get_bytes(key, prototype=prototype, byte_range=byte_range))

async def _get_json(
    self, key: str, *, prototype: BufferPrototype, byte_range: ByteRequest | None = None
) -> Any:
    """
    Retrieve and parse JSON data from the store asynchronously.

    This is a private convenience method that retrieves bytes from the store
    with ``_get_bytes()`` and parses them with ``json.loads()``.

    Parameters
    ----------
    key : str
        The key identifying the JSON data to retrieve.
    prototype : BufferPrototype
        The buffer prototype to use for reading the data.
    byte_range : ByteRequest, optional
        If specified, only retrieve a portion of the stored data.
        Can be a ``RangeByteRequest``, ``OffsetByteRequest``, or ``SuffixByteRequest``.
        Note: Using byte ranges with JSON may result in invalid JSON.

    Returns
    -------
    Any
        The parsed JSON data. This follows the behavior of ``json.loads()`` and
        can be any JSON-serializable type: dict, list, str, int, float, bool, or None.

    Raises
    ------
    FileNotFoundError
        If the key does not exist in the store.
    json.JSONDecodeError
        If the stored data is not valid JSON.

    See Also
    --------
    _get_bytes : Method for retrieving raw bytes.
    _get_json_sync : Synchronous version of this method.

    Examples
    --------
    >>> store = await MemoryStore.open()
    >>> metadata = {"zarr_format": 3, "node_type": "array"}
    >>> await store.set("zarr.json", Buffer.from_bytes(json.dumps(metadata).encode()))
    >>> data = await store._get_json("zarr.json", prototype=default_buffer_prototype())
    >>> print(data)
    {'zarr_format': 3, 'node_type': 'array'}
    """
    raw = await self._get_bytes(key, prototype=prototype, byte_range=byte_range)
    return json.loads(raw)

def _get_json_sync(
    self, key: str = "", *, prototype: BufferPrototype, byte_range: ByteRequest | None = None
) -> Any:
    """
    Retrieve and parse JSON data from the store synchronously.

    This is a synchronous wrapper around ``_get_json()``: it runs that
    coroutine to completion via ``sync``. It should only be called from
    non-async code. For async contexts, use ``_get_json()`` instead.

    Parameters
    ----------
    key : str, optional
        The key identifying the JSON data to retrieve. Defaults to an empty string.
    prototype : BufferPrototype
        The buffer prototype to use for reading the data.
    byte_range : ByteRequest, optional
        If specified, only retrieve a portion of the stored data.
        Can be a ``RangeByteRequest``, ``OffsetByteRequest``, or ``SuffixByteRequest``.
        Note: Using byte ranges with JSON may result in invalid JSON.

    Returns
    -------
    Any
        The parsed JSON data. This follows the behavior of ``json.loads()`` and
        can be any JSON-serializable type: dict, list, str, int, float, bool, or None.

    Raises
    ------
    FileNotFoundError
        If the key does not exist in the store.
    json.JSONDecodeError
        If the stored data is not valid JSON.

    Warnings
    --------
    Do not call this method from async functions. Use ``_get_json()`` instead
    to avoid blocking the event loop.

    See Also
    --------
    _get_json : Asynchronous version of this method.
    _get_bytes_sync : Synchronous method for retrieving raw bytes without parsing.

    Examples
    --------
    >>> store = MemoryStore()
    >>> metadata = {"zarr_format": 3, "node_type": "array"}
    >>> sync(store.set("zarr.json", Buffer.from_bytes(json.dumps(metadata).encode())))
    >>> data = store._get_json_sync("zarr.json", prototype=default_buffer_prototype())
    >>> print(data)
    {'zarr_format': 3, 'node_type': 'array'}
    """
    return sync(self._get_json(key, prototype=prototype, byte_range=byte_range))

@abstractmethod
async def get_partial_values(
self,
Expand Down
133 changes: 133 additions & 0 deletions src/zarr/core/_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Helpers for moving JSON documents in and out of zarr stores.

These are free functions, deliberately not methods on the ``Store`` ABC:
reading and writing JSON is a composition of the store's ``get``/``set``
primitives with a buffer/JSON conversion, not part of the store contract.
Keeping them as functions means stores cannot (and need not) override them,
and the ``Store`` definition stays free of any dependency on the buffer
prototype.

The encoding parameters are explicit: ``indent`` and ``allow_nan`` are
arguments rather than values read from the global config, so the JSON text
produced never depends on hidden configuration. Callers that want zarr's
configured indentation pass ``indent=config.get("json_indent")``.

Two layers:

- ``buffer_to_json`` / ``json_to_buffer`` convert between a ``Buffer`` and a
parsed JSON value. The buffer prototype lives here, at buffer construction,
where it is meaningful.
- ``get_json`` / ``set_json`` compose those with ``Store.get`` / ``Store.set``.
``get_json`` returns ``None`` for a missing key (the contract most callers
want); callers that require presence check for ``None`` themselves.
"""

from __future__ import annotations

import json
from typing import TYPE_CHECKING, cast

from zarr.core.buffer import default_buffer_prototype

if TYPE_CHECKING:
from zarr.abc.store import ByteRequest, Store
from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.common import JSON


def buffer_to_json(buffer: Buffer) -> JSON:
    """Decode the bytes held by a `Buffer` and parse them as a JSON value.

    Parameters
    ----------
    buffer
        The buffer whose bytes are parsed with `json.loads`.

    Returns
    -------
    JSON
        The parsed JSON value.
    """
    raw = buffer.to_bytes()
    parsed = json.loads(raw)
    # `json.loads` is annotated as returning `Any`; narrow it for type checkers.
    return cast("JSON", parsed)


def buffer_to_json_object(buffer: Buffer) -> dict[str, JSON]:
    """Parse the contents of a `Buffer` as a JSON object (a `dict`).

    Every metadata document zarr reads is a JSON object, so this narrows the
    `JSON` union to `dict[str, JSON]` once, here, instead of at each call site.

    Parameters
    ----------
    buffer
        The buffer whose contents are parsed as a JSON object.

    Returns
    -------
    dict[str, JSON]
        The parsed JSON object.

    Raises
    ------
    TypeError
        If the parsed value is not a JSON object.
    """
    obj = buffer_to_json(buffer)
    if not isinstance(obj, dict):
        # Valid JSON of another shape (list, string, number, ...) is rejected.
        raise TypeError(f"Expected a JSON object, got {type(obj).__name__}.")
    return obj


def json_to_buffer(
    obj: JSON,
    *,
    prototype: BufferPrototype | None = None,
    indent: int | None = None,
    allow_nan: bool = True,
) -> Buffer:
    """Encode a JSON value and wrap the encoded bytes in a `Buffer`.

    Parameters
    ----------
    obj
        The JSON-serializable value to encode.
    prototype
        The buffer prototype used to construct the result. When omitted,
        `default_buffer_prototype()` is used.
    indent
        Forwarded to `json.dumps`. With `None` (the default) the document is
        written without newline indentation, using json's default separators.
        Callers that want zarr's configured indentation pass
        `indent=config.get("json_indent")`.
    allow_nan
        Forwarded to `json.dumps`; controls whether `NaN`/`Infinity` may
        appear in the output.

    Returns
    -------
    Buffer
        A buffer, built by `prototype.buffer.from_bytes`, holding the encoded
        document.
    """
    # Resolve the prototype before encoding, matching the original statement order.
    proto = default_buffer_prototype() if prototype is None else prototype
    text = json.dumps(obj, indent=indent, allow_nan=allow_nan)
    return proto.buffer.from_bytes(text.encode())


async def get_json(store: Store, key: str, *, byte_range: ByteRequest | None = None) -> JSON | None:
    """Fetch the document stored at `key` and parse it as JSON.

    A missing key is reported as `None` rather than as an exception.

    Parameters
    ----------
    store
        The store to read from.
    key
        The key identifying the JSON document.
    byte_range
        If given, read only this portion of the value. Note that a partial
        read of a JSON document may not be valid JSON.

    Returns
    -------
    JSON or None
        The parsed JSON value, or `None` if `key` does not exist.
    """
    maybe_buffer = await store.get(key, default_buffer_prototype(), byte_range)
    if maybe_buffer is None:
        return None
    return buffer_to_json(maybe_buffer)


async def set_json(
    store: Store,
    key: str,
    obj: JSON,
    *,
    prototype: BufferPrototype | None = None,
    indent: int | None = None,
    allow_nan: bool = True,
) -> None:
    """Encode `obj` as a JSON document and store it under `key`.

    Parameters
    ----------
    store
        The store to write to.
    key
        The key to write the JSON document to.
    obj
        The JSON-serializable value to encode.
    prototype
        Forwarded to `json_to_buffer`.
    indent
        Forwarded to `json_to_buffer`.
    allow_nan
        Forwarded to `json_to_buffer`.
    """
    payload = json_to_buffer(obj, prototype=prototype, indent=indent, allow_nan=allow_nan)
    await store.set(key, payload)
8 changes: 4 additions & 4 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import json
import warnings
from asyncio import gather
from collections.abc import Iterable, Mapping, Sequence
Expand Down Expand Up @@ -28,6 +27,7 @@
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
from zarr.codecs.zstd import ZstdCodec
from zarr.core._info import ArrayInfo
from zarr.core._json import buffer_to_json_object
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
Expand Down Expand Up @@ -289,13 +289,13 @@ async def get_array_metadata(
if zarr_format == 2:
# V2 arrays are comprised of a .zarray and .zattrs objects
assert zarray_bytes is not None
metadata_dict = json.loads(zarray_bytes.to_bytes())
zattrs_dict = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {}
metadata_dict = buffer_to_json_object(zarray_bytes)
zattrs_dict = buffer_to_json_object(zattrs_bytes) if zattrs_bytes is not None else {}
metadata_dict["attributes"] = zattrs_dict
else:
# V3 arrays are comprised of a zarr.json object
assert zarr_json_bytes is not None
metadata_dict = json.loads(zarr_json_bytes.to_bytes())
metadata_dict = buffer_to_json_object(zarr_json_bytes)

parse_node_type_array(metadata_dict.get("node_type"))

Expand Down
Loading
Loading