import ctypes
import io
import pathlib
import warnings
from typing import BinaryIO, Union
import numpy as np
from .._c_api import mts_create_array_callback_t
from .._c_lib import _get_library
from ..tensor import TensorMap
from ._block import (
CreateArrayCallback,
_block_to_dict,
_single_block_from_npz,
create_numpy_array,
)
from ._labels import _labels_from_npz, _labels_to_npz
from ._utils import _save_buffer_raw
[docs]
def load(file: Union[str, pathlib.Path, BinaryIO], use_numpy=False) -> TensorMap:
    """
    Read a :py:class:`TensorMap` back from a file it was previously saved to.

    The on-disk representation is numpy's ``.npz`` format: a ZIP archive without
    compression (storage method ``STORED``) in which every member is a ``.npy``
    array. The C API documentation describes the format in more detail.

    :param file: what to load from. Either a string or :py:class:`pathlib.Path`
        giving the path of the file, or a file-like object opened in binary mode.
    :param use_numpy: pick the numpy-based reader instead of the native one. Numpy
        should be able to process more dtypes than the native implementation (which
        is limited to float64), while the native implementation is usually faster.
    :return: the :py:class:`TensorMap` stored in ``file``
    """
    if use_numpy:
        return _tensor_from_npz(file)

    if isinstance(file, (str, pathlib.Path)):
        # path-like input: the native loader opens the file itself
        return load_custom_array(file, create_numpy_array)

    # anything else is treated as a file-like object: read it entirely in memory
    # and go through the in-memory loader
    content = file.read()
    assert isinstance(content, bytes)
    return load_buffer_custom_array(content, create_numpy_array)
[docs]
def load_buffer(
    buffer: Union[bytes, bytearray, memoryview], use_numpy=False
) -> TensorMap:
    """
    Read a :py:class:`TensorMap` back from an in-memory buffer holding its
    serialized form.

    :param buffer: in-memory buffer containing a serialized ``TensorMap``
    :param use_numpy: pick the numpy-based reader instead of the native one
    :return: the :py:class:`TensorMap` stored in ``buffer``
    """
    if not use_numpy:
        return load_buffer_custom_array(buffer, create_numpy_array)

    # the numpy reader wants a file-like object, so wrap the raw bytes in one
    stream = io.BytesIO(buffer)
    return _tensor_from_npz(stream)
[docs]
def load_custom_array(
    path: Union[str, pathlib.Path],
    create_array: "CreateArrayCallback",
) -> TensorMap:
    """
    Read a :py:class:`TensorMap` back from the file at ``path``, creating the data
    arrays through a user-provided callback.

    This is an advanced functionality, which should not be needed by most users.

    The ``create_array`` callback decides which kind of array holds the loaded
    data. It should take three arguments: a pointer to the shape, the number of
    elements in the shape, and a pointer to the ``mts_array_t`` to be filled.

    :py:func:`metatensor.io.create_numpy_array` and
    :py:func:`metatensor.io.create_torch_array` can be used to load data into numpy
    and torch arrays respectively.

    :param path: path of the file to load
    :param create_array: callback used to create arrays as needed
    :return: the :py:class:`TensorMap` stored in the file
    """
    library = _get_library()

    # The C function is given the path as bytes. Any value that is neither a
    # ``str`` nor a ``pathlib.Path`` is forwarded unchanged.
    if isinstance(path, str):
        encoded_path = path.encode("utf8")
    elif isinstance(path, pathlib.Path):
        encoded_path = bytes(path)
    else:
        encoded_path = path

    callback = mts_create_array_callback_t(create_array)
    tensor_ptr = library.mts_tensormap_load(encoded_path, callback)
    return TensorMap._from_ptr(tensor_ptr)
[docs]
def load_buffer_custom_array(
    buffer: Union[bytes, bytearray, memoryview],
    create_array: "CreateArrayCallback",
) -> TensorMap:
    """
    Load a previously saved :py:class:`TensorMap` from the given buffer using a custom
    array creation callback.

    This is an advanced functionality, which should not be needed by most users.

    This function allows to specify the kind of array to use when loading the data
    through the ``create_array`` callback. This callback should take three arguments: a
    pointer to the shape, the number of elements in the shape, and a pointer to the
    ``mts_array_t`` to be filled.

    :py:func:`metatensor.io.create_numpy_array` and
    :py:func:`metatensor.io.create_torch_array` can be used to load data into numpy
    and torch arrays respectively.

    :param buffer: in-memory buffer containing a saved :py:class:`TensorMap`
    :param create_array: callback used to create arrays as needed
    :return: the :py:class:`TensorMap` stored in ``buffer``
    """
    lib = _get_library()

    if isinstance(buffer, bytearray):
        # share the memory of the bytearray with ctypes, no copy is made
        char_array = ctypes.c_char * len(buffer)
        buffer = char_array.from_buffer(buffer)
    elif isinstance(buffer, memoryview):
        # Size the copy with ``nbytes``: ``len()`` of a memoryview is the number of
        # items along its first dimension, which is smaller than the byte count
        # whenever the view has an item size above one or more than one dimension,
        # and would silently truncate the data.
        char_array = ctypes.c_char * buffer.nbytes
        # FIXME: we would prefer not to make a copy here, but ctypes does not support
        # passing a memory view to C, even if it is contiguous.
        # https://github.com/python/cpython/issues/60190
        buffer = char_array.from_buffer_copy(buffer)

    # at this point ``buffer`` is either ``bytes`` or a ctypes array of ``c_char``,
    # so ``len(buffer)`` is its size in bytes
    ptr = lib.mts_tensormap_load_buffer(
        buffer,
        len(buffer),
        mts_create_array_callback_t(create_array),
    )

    return TensorMap._from_ptr(ptr)
def _save_tensor(
    file: Union[str, pathlib.Path, BinaryIO],
    tensor: TensorMap,
    use_numpy=False,
):
    """
    Save ``tensor`` to ``file``, using either numpy or the native implementation.

    When ``file`` is a path (``str`` or :py:class:`pathlib.Path`) which does not end
    with ``.npz``, the extension is appended and a warning is emitted with the
    final location of the file.

    :param file: where to save the data: a string or :py:class:`pathlib.Path`
        containing the path of the file, or a file-like object to write to
    :param tensor: the :py:class:`TensorMap` to save
    :param use_numpy: if true, serialize through ``np.savez``; otherwise use the
        native library
    """
    assert isinstance(tensor, TensorMap)

    if isinstance(file, (str, pathlib.Path)):
        # ``pathlib.Path`` has neither ``endswith`` nor ``+=`` with a string, so the
        # check is done on the string form and the path is rebuilt from it
        if not str(file).endswith(".npz"):
            if isinstance(file, pathlib.Path):
                file = pathlib.Path(str(file) + ".npz")
            else:
                file += ".npz"

            warnings.warn(
                message=f"adding '.npz' extension, the file will be saved at '{file}'",
                stacklevel=1,
            )

    if use_numpy:
        all_entries = _tensor_to_dict(tensor)
        np.savez(file, **all_entries)
    else:
        lib = _get_library()
        if isinstance(file, (str, pathlib.Path)):
            # the native library is given the path as bytes
            if isinstance(file, str):
                path = file.encode("utf8")
            elif isinstance(file, pathlib.Path):
                path = bytes(file)

            lib.mts_tensormap_save(path, tensor._ptr)
        else:
            # assume we have a file-like object
            buffer = _save_tensor_buffer_raw(tensor)
            file.write(buffer.raw)
def _save_tensor_buffer_raw(tensor: TensorMap) -> ctypes.Array:
    """
    Serialize a TensorMap into an in-memory buffer.

    The data is returned as a ctypes array of ``ctypes.c_char``.
    """
    save_function = _get_library().mts_tensormap_save_buffer
    return _save_buffer_raw(save_function, tensor._ptr)
def _tensor_to_dict(tensor_map):
    """
    Build a dictionary holding everything needed to save ``tensor_map``.

    The keys of the tensor map are stored under ``"keys"``, and the entries of the
    block at position ``i`` are produced by ``_block_to_dict`` with the prefix
    ``"blocks/{i}/"``.
    """
    entries = {"keys": _labels_to_npz(tensor_map.keys)}

    for index, current_block in enumerate(tensor_map.blocks()):
        block_entries = _block_to_dict(
            current_block, f"blocks/{index}/", is_gradient=False
        )
        entries.update(block_entries)

    return entries
def _tensor_from_npz(file):
    """
    Load a :py:class:`TensorMap` from ``file`` using numpy.

    :param file: anything accepted by ``np.load`` and containing data in the
        ``.npz`` layout: a ``"keys"`` entry, and the entries of the block at
        position ``i`` under the ``"blocks/{i}/"`` prefix
    :return: the :py:class:`TensorMap` built from the keys and blocks in ``file``
    """
    # ``np.load`` returns a ``NpzFile`` holding an open file handle for ``.npz``
    # archives; the context manager makes sure it is released, including when
    # reading one of the entries fails.
    with np.load(file) as dictionary:
        keys = _labels_from_npz(dictionary["keys"])

        blocks = []
        # one block is stored for each entry in the keys
        for block_i in range(len(keys)):
            prefix = f"blocks/{block_i}/"
            properties = _labels_from_npz(dictionary[f"{prefix}properties"])
            block = _single_block_from_npz(prefix, dictionary, properties)
            blocks.append(block)

    return TensorMap(keys, blocks)