"""
A collection of classes representing data stored by a storage backend.
"""
import abc
from openff.evaluator.attributes import AttributeClass
from openff.evaluator.datasets import PropertyPhase
from openff.evaluator.forcefield import ForceFieldSource
from openff.evaluator.storage.attributes import FilePath, StorageAttribute
from openff.evaluator.substances import Substance
from openff.evaluator.thermodynamics import ThermodynamicState
class BaseStoredData(AttributeClass, abc.ABC):
    """The common base class for any piece of cached data which is
    to be stored by a storage backend.

    A piece of stored data is expected to exist in storage as up to
    two parts:

    1) A JSON serialized representation of this class (or of a
       subclass) which holds lightweight information, such as the
       state and composition of the system. Any larger pieces of
       data, such as coordinates or trajectories, should only be
       referenced by their file name.

    2) A directory like structure (either an actual directory, or
       some NetCDF like compressed archive) of ancillary files which
       do not easily lend themselves to being serialized within a
       JSON object. The data object refers to these files by their
       file name.

    The second, directory-like part is optional, and is not required
    when the data can be suitably stored in the data object itself.
    """

    @classmethod
    @abc.abstractmethod
    def has_ancillary_data(cls):
        """Reports whether objects of this class must be accompanied
        by a data directory-like structure.

        Returns
        -------
        bool
            True if this class requires an accompanying
            data directory-like structure.

        Raises
        ------
        NotImplementedError
            Always, in this base implementation.
        """
        raise NotImplementedError()

    def to_storage_query(self):
        """Builds the storage query which would match this
        data object.

        Returns
        -------
        BaseDataQuery
            The storage query which would match this
            data object.

        Raises
        ------
        NotImplementedError
            Always, in this base implementation.
        """
        # Unlike ``has_ancillary_data`` this method is not declared
        # abstract, so a subclass which does not override it only
        # fails when the method is actually called.
        raise NotImplementedError()
class HashableStoredData(BaseStoredData, abc.ABC):
    """A category of data objects which may be quickly compared
    and indexed using their hash values.

    Two objects are treated as equal when they are of exactly the
    same type and produce the same hash. Subclasses must provide
    ``__hash__``.
    """

    def __eq__(self, other):
        # The type comparison is evaluated first so that ``other`` is
        # only hashed when it is of exactly the same type as ``self``.
        types_match = type(self) == type(other)
        return types_match and hash(self) == hash(other)

    def __ne__(self, other):
        is_equal = self.__eq__(other)
        return not is_equal

    @abc.abstractmethod
    def __hash__(self):
        raise NotImplementedError
class ForceFieldData(HashableStoredData):
    """A data container for force field objects which
    will be saved to disk.

    Instances are compared and hashed through the JSON
    representation of their ``force_field_source``.
    """

    force_field_source = StorageAttribute(
        docstring="The force field source object.",
        type_hint=ForceFieldSource,
    )

    @classmethod
    def has_ancillary_data(cls):
        """Reports whether this data needs an accompanying data
        directory-like structure.

        Returns
        -------
        bool
            Always False for this class.
        """
        return False

    def to_storage_query(self):
        """Builds the storage query which would match this
        data object.

        Returns
        -------
        ForceFieldQuery
            The storage query which would match this
            data object.
        """
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import with the query
        # module - TODO confirm).
        from .query import ForceFieldQuery

        return ForceFieldQuery.from_data_object(self)

    def __eq__(self, other):
        # Defer to the type-and-hash comparison of the parent class.
        return super().__eq__(other)

    def __ne__(self, other):
        return super().__ne__(other)

    def __hash__(self):
        """Hashes the encoded JSON representation of the force
        field source.

        NOTE: Python randomises the built-in hash of ``bytes`` per
        interpreter process unless ``PYTHONHASHSEED`` is fixed, so the
        returned value is only meaningful within a single process.
        """
        force_field_string = self.force_field_source.json()
        return hash(force_field_string.encode())
class ReplaceableData(BaseStoredData, abc.ABC):
    """A category of stored data for which one piece of data held
    by a `StorageBackend` may be replaced by another piece of data
    of the same type.

    As an example, this may happen when attempting to store a piece
    of `StoredSimulationData` while another piece of data, measured
    from the same calculation and for the same system, already exists
    in the storage but contains fewer configurations.
    """
class StoredSimulationData(ReplaceableData):
    """The cached output of a single, previously performed
    simulation.

    Notes
    -----
    Larger pieces of information, such as trajectories, are kept in
    an ancillary directory which should be laid out as:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- coordinate_file_name.pdb
             |--- trajectory_file_name.dcd
             |--- statistics_file_name.csv
    """

    # --- A description of the simulated system. ---
    substance = StorageAttribute(
        docstring="A description of the composition of the stored system.",
        type_hint=Substance,
    )
    thermodynamic_state = StorageAttribute(
        docstring="The state at which the data was collected.",
        type_hint=ThermodynamicState,
    )
    property_phase = StorageAttribute(
        docstring="The phase of the system (e.g. liquid, gas).",
        type_hint=PropertyPhase,
    )

    # --- The provenance of the data. ---
    source_calculation_id = StorageAttribute(
        docstring="The server id of the calculation which yielded this data.",
        type_hint=str,
    )
    force_field_id = StorageAttribute(
        docstring="The id of the force field parameters used to generate the data.",
        type_hint=str,
    )

    # --- The names of the files held in the ancillary directory. ---
    coordinate_file_name = StorageAttribute(
        docstring="The name of a coordinate file which encodes the "
        "topology information of the system.",
        type_hint=FilePath,
    )
    trajectory_file_name = StorageAttribute(
        docstring="The name of a .dcd trajectory file containing "
        "configurations generated by the simulation.",
        type_hint=FilePath,
    )
    statistics_file_name = StorageAttribute(
        docstring="The name of a `StatisticsArray` csv file, containing "
        "statistics generated by the simulation.",
        type_hint=FilePath,
    )

    # --- Summary values describing the collected data. ---
    statistical_inefficiency = StorageAttribute(
        docstring="The statistical inefficiency of the collected data.",
        type_hint=float,
    )
    number_of_molecules = StorageAttribute(
        docstring="The total number of molecules in the system.",
        type_hint=int,
    )

    @classmethod
    def has_ancillary_data(cls):
        """Reports whether this data needs an accompanying data
        directory-like structure.

        Returns
        -------
        bool
            Always True for this class.
        """
        return True

    def to_storage_query(self):
        """Builds the storage query which would match this
        data object.

        Returns
        -------
        SimulationDataQuery
            The storage query which would match this
            data object.
        """
        # The query class is imported at call time rather than at
        # module level.
        from .query import SimulationDataQuery

        return SimulationDataQuery.from_data_object(self)