"""
An API for defining, storing, and loading sets of physical
property data.
"""
import abc
import re
import uuid
from enum import IntFlag, unique
import numpy
import pandas
from openff.units import unit
from openff.evaluator.attributes import UNDEFINED, Attribute, AttributeClass
from openff.evaluator.datasets import CalculationSource, MeasurementSource, Source
from openff.evaluator.substances import Component, ExactAmount, MoleFraction, Substance
from openff.evaluator.thermodynamics import ThermodynamicState
from openff.evaluator.utils.serialization import TypedBaseModel
@unique
class PropertyPhase(IntFlag):
    """An enum describing the phase that a property was
    collected in.

    Examples
    --------
    Properties measured in multiple phases (e.g. enthalpies of
    vaporization) can be defined by combining `PropertyPhase`
    enums with the bitwise-or operator:

    >>> gas_liquid_phase = PropertyPhase.Gas | PropertyPhase.Liquid
    """

    Undefined = 0x00
    Solid = 0x01
    Liquid = 0x02
    Gas = 0x04

    @classmethod
    def from_string(cls, enum_string):
        """Parses a phase enum from its string representation.

        Parameters
        ----------
        enum_string: str
            The str representation of a `PropertyPhase`, i.e. the names
            of the individual phases separated by ``" + "``. An empty
            string is interpreted as `PropertyPhase.Undefined`.

        Returns
        -------
        PropertyPhase
            The created enum

        Raises
        ------
        KeyError
            If any of the phase names is not a member of this enum.

        Examples
        --------
        To round-trip convert a phase enum:

        >>> phase = PropertyPhase.Liquid | PropertyPhase.Gas
        >>> phase_str = str(phase)
        >>> parsed_phase = PropertyPhase.from_string(phase_str)
        """
        if not enum_string:
            return cls.Undefined

        # ``str.split`` always yields at least one element, so the phases
        # can be folded directly onto the (zero valued) undefined phase.
        enum_value = cls.Undefined

        for phase_name in enum_string.split(" + "):
            enum_value |= cls[phase_name]

        return enum_value

    def __str__(self):
        # Only the phases whose bit is set in this value are listed, which
        # means the undefined phase is rendered as an empty string.
        return " + ".join(phase.name for phase in PropertyPhase if self & phase)

    def __repr__(self):
        return f"<PropertyPhase {str(self)}>"
class PhysicalProperty(AttributeClass, abc.ABC):
    """Represents the value of any physical property and its uncertainty
    if provided.

    It additionally stores the thermodynamic state at which the property
    was collected, the phase it was collected in, information about
    the composition of the observed system, and metadata about how the
    property was collected.
    """

    @classmethod
    @abc.abstractmethod
    def default_unit(cls):
        """openff.evaluator.unit.Unit: The default unit (e.g. g / mol) associated with this
        class of property."""
        raise NotImplementedError()

    id = Attribute(
        docstring="A unique identifier string assigned to this property",
        type_hint=str,
        default_value=lambda: str(uuid.uuid4()).replace("-", ""),
    )

    substance = Attribute(
        docstring="The substance that this property was measured / estimated for.",
        type_hint=Substance,
    )
    phase = Attribute(
        docstring="The phase / phases that this property was measured in.",
        type_hint=PropertyPhase,
    )
    thermodynamic_state = Attribute(
        docstring="The thermodynamic state that this property "
        "was measured / estimated at.",
        type_hint=ThermodynamicState,
    )

    value = Attribute(
        docstring="The measured / estimated value of this property.",
        type_hint=unit.Quantity,
    )
    uncertainty = Attribute(
        docstring="The uncertainty in measured / estimated value of this property.",
        type_hint=unit.Quantity,
        optional=True,
    )

    source = Attribute(
        docstring="The original source of this physical property.",
        type_hint=Source,
        optional=True,
    )
    metadata = Attribute(
        docstring="Additional metadata associated with this property. All property "
        "metadata will be made accessible to estimation workflows.",
        type_hint=dict,
        optional=True,
    )

    gradients = Attribute(
        docstring="The gradients of this property with respect to "
        "different force field parameters.",
        type_hint=list,
        optional=True,
    )

    def __init__(
        self,
        thermodynamic_state=None,
        phase=PropertyPhase.Undefined,
        substance=None,
        value=None,
        uncertainty=None,
        source=None,
    ):
        """Constructs a new PhysicalProperty object.

        Any argument which is left as ``None`` is not assigned to the
        corresponding attribute.

        Parameters
        ----------
        thermodynamic_state : ThermodynamicState
            The thermodynamic state that the property was measured in.
        phase : PropertyPhase
            The phase that the property was measured in.
        substance : Substance
            The composition of the substance that was measured.
        value: openff.evaluator.unit.Quantity
            The value of the measured physical property.
        uncertainty: openff.evaluator.unit.Quantity
            The uncertainty in the measured value.
        source: Source
            The source of this property.
        """
        if thermodynamic_state is not None:
            self.thermodynamic_state = thermodynamic_state
        if phase is not None:
            self.phase = phase

        if substance is not None:
            self.substance = substance

        if value is not None:
            self.value = value
        if uncertainty is not None:
            self.uncertainty = uncertainty

        # Every property starts with its own, empty list of gradients.
        self.gradients = []

        if source is not None:
            self.source = source

    def __setstate__(self, state):
        # States which do not carry an id are assigned a freshly
        # generated one before being restored.
        if "id" not in state:
            state["id"] = str(uuid.uuid4()).replace("-", "")

        super().__setstate__(state)

    def validate(self, attribute_type=None):
        """Validates the attributes of this property, and checks that the
        value (and uncertainty, if set) have units which are compatible with
        the default unit of this class of property.

        Parameters
        ----------
        attribute_type
            Forwarded unchanged to the parent class' ``validate`` method.

        Raises
        ------
        AssertionError
            If the value or uncertainty has a dimensionality which does not
            match that of ``default_unit()``.
        """
        super().validate(attribute_type)

        expected_dimensionality = self.default_unit().dimensionality

        # Raised explicitly (rather than via ``assert``) so that the checks
        # are still performed when python is run with optimisations enabled.
        if self.value.units.dimensionality != expected_dimensionality:
            raise AssertionError(
                f"The value of a {type(self).__name__} property has units of "
                f"{self.value.units} which are not compatible with the default "
                f"unit ({self.default_unit()})."
            )

        if self.uncertainty != UNDEFINED:
            if self.uncertainty.units.dimensionality != expected_dimensionality:
                raise AssertionError(
                    f"The uncertainty of a {type(self).__name__} property has "
                    f"units of {self.uncertainty.units} which are not compatible "
                    f"with the default unit ({self.default_unit()})."
                )
class PhysicalPropertyDataSet(TypedBaseModel):
    """
    An object for storing and curating data sets of both measured and
    estimated physical properties. This class defines a number of convenience
    functions for filtering out unwanted properties, and for generating
    general statistics (such as the number of properties per substance)
    about the set.
    """

    def __init__(self):
        """
        Constructs a new, empty PhysicalPropertyDataSet object.
        """
        self._properties = []

    @property
    def properties(self):
        """tuple of PhysicalProperty: A tuple of all of the properties
        within this set.
        """
        return tuple(self._properties)

    @property
    def property_types(self):
        """set of str: The types of property within this data set."""
        return {x.__class__.__name__ for x in self._properties}

    @property
    def substances(self):
        """set of Substance: The substances for which the properties in this data set
        were collected."""
        return {x.substance for x in self._properties}

    @property
    def sources(self):
        """set of Source: The sources from which the properties in this data set were
        gathered."""
        return {x.source for x in self._properties}

    def merge(self, data_set, validate=True):
        """Merge another data set into the current one.

        Parameters
        ----------
        data_set : PhysicalPropertyDataSet
            The secondary data set to merge into this one. Nothing is done
            if this is ``None``.
        validate: bool
            Whether to validate the other data set before merging.
        """
        if data_set is None:
            return

        self.add_properties(*data_set, validate=validate)

    def add_properties(self, *physical_properties, validate=True):
        """Adds one or more physical properties to the data set.

        None of the properties are added if any of them fails validation
        or has an id which is already in use.

        Parameters
        ----------
        physical_properties: PhysicalProperty
            The physical properties to add.
        validate: bool
            Whether to validate the properties before adding them
            to the set.

        Raises
        ------
        KeyError
            If the id of a property is already used, either by a property
            in this set or by another of the properties being added.
        """
        all_ids = {x.id for x in self}

        # TODO: Do we need to check for adding the same property twice?
        for physical_property in physical_properties:
            if validate:
                physical_property.validate()

            if physical_property.id in all_ids:
                raise KeyError(
                    f"A property with the unique id {physical_property.id} already "
                    f"exists."
                )

            all_ids.add(physical_property.id)

        # Only extend once every property has passed the above checks.
        self._properties.extend(physical_properties)

    def properties_by_substance(self, substance):
        """A generator which may be used to loop over all of the properties
        which were measured for a particular substance.

        Parameters
        ----------
        substance: Substance
            The substance of interest.

        Returns
        -------
        generator of PhysicalProperty
        """
        for physical_property in self._properties:
            if physical_property.substance != substance:
                continue

            yield physical_property

    def properties_by_type(self, property_type):
        """A generator which may be used to loop over all of properties
        of a particular type, e.g. all "Density" properties.

        Parameters
        ----------
        property_type: str or type of PhysicalProperty
            The type of property of interest. This may either be the string
            class name of the property or the class type.

        Returns
        -------
        generator of PhysicalProperty
        """
        if not isinstance(property_type, str):
            property_type = property_type.__name__

        for physical_property in self._properties:
            if physical_property.__class__.__name__ != property_type:
                continue

            yield physical_property

    def validate(self):
        """Checks to ensure that all properties within
        the set are valid physical property objects.
        """
        for physical_property in self._properties:
            physical_property.validate()

    def to_pandas(self):
        """Converts a `PhysicalPropertyDataSet` to a `pandas.DataFrame` object
        with columns of

            - 'Id'
            - 'Temperature (K)'
            - 'Pressure (kPa)'
            - 'Phase'
            - 'N Components'
            - 'Component 1'
            - 'Role 1'
            - 'Mole Fraction 1'
            - 'Exact Amount 1'
            - ...
            - 'Component N'
            - 'Role N'
            - 'Mole Fraction N'
            - 'Exact Amount N'
            - '<Property 1> Value (<default unit>)'
            - '<Property 1> Uncertainty (<default unit>)'
            - ...
            - '<Property N> Value (<default unit>)'
            - '<Property N> Uncertainty (<default unit>)'
            - 'Source'

        where 'Component X' is a column containing the smiles representation of
        component X. The property columns are sorted by the name of the property
        type so that the column order is reproducible.

        Returns
        -------
        pandas.DataFrame
            The created data frame. An empty data frame is returned if the
            data set does not contain any properties.
        """
        if len(self) == 0:
            return pandas.DataFrame()

        # Keep track of the maximum number of components in any substance
        # as this determines the number of component columns.
        maximum_number_of_components = 0

        data_rows = []

        # The default unit of each type of property found in the set, keyed
        # by the name of the property type.
        default_units = {}

        for physical_property in self:
            property_type = type(physical_property).__name__

            # Extract the measured state.
            state = physical_property.thermodynamic_state

            temperature = state.temperature.to(unit.kelvin).magnitude
            pressure = None

            if state.pressure != UNDEFINED:
                pressure = state.pressure.to(unit.kilopascal).magnitude

            substance = physical_property.substance

            data_row = {
                "Id": physical_property.id,
                "Temperature (K)": temperature,
                "Pressure (kPa)": pressure,
                "Phase": str(physical_property.phase),
                "N Components": len(substance),
            }

            # Extract the component data.
            for index, component in enumerate(substance):
                component_amounts = {MoleFraction: None, ExactAmount: None}

                for amount in substance.get_amounts(component):
                    assert isinstance(amount, (MoleFraction, ExactAmount))
                    component_amounts[type(amount)] = amount.value

                data_row[f"Component {index + 1}"] = component.smiles
                data_row[f"Role {index + 1}"] = component.role.name
                data_row[f"Mole Fraction {index + 1}"] = component_amounts[
                    MoleFraction
                ]
                data_row[f"Exact Amount {index + 1}"] = component_amounts[ExactAmount]

            # Extract the value data in the default unit of the property.
            default_unit = physical_property.default_unit()
            default_units[property_type] = default_unit

            value = (
                None
                if physical_property.value == UNDEFINED
                else physical_property.value.to(default_unit).magnitude
            )
            uncertainty = (
                None
                if physical_property.uncertainty == UNDEFINED
                else physical_property.uncertainty.to(default_unit).magnitude
            )

            data_row[f"{property_type} Value ({default_unit:~})"] = value
            data_row[f"{property_type} Uncertainty ({default_unit:~})"] = uncertainty

            # Extract the data source. The DOI of a measurement is preferred
            # over its reference, which is only used when no DOI is set.
            source = None

            if isinstance(physical_property.source, MeasurementSource):
                source = physical_property.source.doi

                if source is None or len(source) == 0:
                    source = physical_property.source.reference

            elif isinstance(physical_property.source, CalculationSource):
                source = physical_property.source.fidelity

            data_row["Source"] = source

            data_rows.append(data_row)

            maximum_number_of_components = max(
                maximum_number_of_components, len(substance)
            )

        # Set up the column headers.
        data_columns = [
            "Id",
            "Temperature (K)",
            "Pressure (kPa)",
            "Phase",
            "N Components",
        ]

        for index in range(maximum_number_of_components):
            data_columns.append(f"Component {index + 1}")
            data_columns.append(f"Role {index + 1}")
            data_columns.append(f"Mole Fraction {index + 1}")
            data_columns.append(f"Exact Amount {index + 1}")

        # Sort the property types so that the order of the columns does not
        # depend on the (arbitrary) iteration order of a set of strings.
        for property_type in sorted(default_units):
            default_unit = default_units[property_type]

            data_columns.append(f"{property_type} Value ({default_unit:~})")
            data_columns.append(f"{property_type} Uncertainty ({default_unit:~})")

        data_columns.append("Source")

        return pandas.DataFrame(data_rows, columns=data_columns)

    @staticmethod
    def _is_doi(source_string):
        """Returns whether every ' + ' separated part of a source string
        matches one of the recognised DOI patterns."""
        doi_patterns = (
            r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
            r"^10.1002/[^\s]+$",
            r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
            r"^10.1021/\w\w\d+$",
            r"^10.1207/[\w\d]+\&\d+_\d+$",
        )

        return all(
            any(re.match(pattern, split_string, re.I) for pattern in doi_patterns)
            for split_string in source_string.split(" + ")
        )

    @classmethod
    def from_pandas(cls, data_frame: pandas.DataFrame) -> "PhysicalPropertyDataSet":
        """Constructs a data set object from a pandas ``DataFrame`` object.

        Notes
        -----
        * All physical properties are assumed to be sourced from experimental
          measurements.
        * Currently this method only supports data frames containing properties
          which are built-in to the framework (e.g. Density).
        * This method assumes the data frame has a structure identical to that
          produced by the ``PhysicalPropertyDataSet.to_pandas`` function.
        * Rows without a 'Phase' entry are assigned an undefined phase, and
          rows without a 'Source' entry yield properties without a source.

        Parameters
        ----------
        data_frame
            The data frame to construct the data set from.

        Returns
        -------
            The constructed data set.

        Raises
        ------
        AssertionError
            If a property value header is incorrectly formatted, does not name
            a built-in property type, or specifies units which are incompatible
            with the default unit of the property type.
        """
        from openff.evaluator import properties

        property_headers = {}

        # Validate that the headers have the correct format, specify a
        # built-in property type, and specify correctly the properties
        # units.
        for header in data_frame:
            if not isinstance(header, str) or header.find(" Value ") < 0:
                continue

            # NOTE(review): ``+-/`` inside the character class is a range
            # (covering ``+ , - . /``) rather than three literal characters.
            match = re.match(
                r"^([a-zA-Z]+) Value \(([a-zA-Z0-9+-/\s*^]*)\)$", header
            )

            if match is None:
                raise AssertionError(
                    f"The '{header}' header does not have the expected "
                    f"'<Property> Value (<unit>)' format."
                )

            property_type_string, property_unit_string = match.groups()

            if not hasattr(properties, property_type_string):
                raise AssertionError(
                    f"{property_type_string} is not a built-in type of property."
                )

            property_type = getattr(properties, property_type_string)
            property_unit = unit.Unit(property_unit_string)

            if (
                property_unit.dimensionality
                != property_type.default_unit().dimensionality
            ):
                raise AssertionError(
                    f"The units ({property_unit_string}) of the '{header}' column "
                    f"are not compatible with the default unit of the "
                    f"{property_type_string} property."
                )

            property_headers[header] = (property_type, property_unit)

        # Convert the data rows to property objects.
        physical_properties = []

        for _, data_row in data_frame.iterrows():
            # Drop the empty cells so that the presence of a key indicates
            # that the row actually contains a value for it.
            data_row = data_row.dropna()

            # Extract the state at which the measurement was made.
            # NOTE(review): a row without a pressure (as written by
            # ``to_pandas`` for states with an undefined pressure) raises a
            # KeyError here - confirm how ``ThermodynamicState`` should be
            # constructed without a pressure before relaxing this.
            thermodynamic_state = ThermodynamicState(
                temperature=data_row["Temperature (K)"] * unit.kelvin,
                pressure=data_row["Pressure (kPa)"] * unit.kilopascal,
            )

            # An undefined phase is stored as an empty string, which is read
            # back as a missing value when the frame is loaded from a file.
            property_phase = PropertyPhase.from_string(data_row.get("Phase", ""))

            # Extract the substance the measurement was made for.
            substance = Substance()

            # The count is cast to an int as the column may have been upcast
            # to floats, e.g. when the frame contains missing values.
            for i in range(int(data_row["N Components"])):
                component = Component(
                    smiles=data_row[f"Component {i + 1}"],
                    role=Component.Role[data_row.get(f"Role {i + 1}", "Solvent")],
                )

                mole_fraction = data_row.get(f"Mole Fraction {i + 1}", 0.0)
                exact_amount = data_row.get(f"Exact Amount {i + 1}", 0)

                if not numpy.isclose(mole_fraction, 0.0):
                    substance.add_component(component, MoleFraction(mole_fraction))
                if not numpy.isclose(exact_amount, 0.0):
                    substance.add_component(component, ExactAmount(exact_amount))

            # The source is the same for each property in the row, so it only
            # needs to be parsed once.
            source_string = data_row.get("Source", None)
            is_doi = source_string is not None and cls._is_doi(source_string)

            identifier = data_row.get("Id", None)

            for (
                property_header,
                (property_type, property_unit),
            ) in property_headers.items():
                # Check to see whether the row contains a value for this
                # type of property.
                if property_header not in data_row:
                    continue

                # Only replace the column label, and not any occurrence of
                # 'Value' in the name of the property itself.
                uncertainty_header = property_header.replace(
                    " Value ", " Uncertainty ", 1
                )

                source = None

                if source_string is not None:
                    source = MeasurementSource(
                        doi="" if not is_doi else source_string,
                        reference=source_string if not is_doi else "",
                    )

                physical_property = property_type(
                    thermodynamic_state=thermodynamic_state,
                    phase=property_phase,
                    value=data_row[property_header] * property_unit,
                    uncertainty=(
                        None
                        if uncertainty_header not in data_row
                        else data_row[uncertainty_header] * property_unit
                    ),
                    substance=substance,
                    source=source,
                )

                if identifier:
                    physical_property.id = identifier

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        return data_set

    def __len__(self):
        return len(self._properties)

    def __iter__(self):
        return iter(self._properties)

    def __getstate__(self):
        return {"properties": self._properties}

    def __setstate__(self, state):
        self._properties = state["properties"]

        # Raised explicitly (rather than via ``assert``) so that the checks
        # are still performed when python is run with optimisations enabled.
        if not all(isinstance(x, PhysicalProperty) for x in self):
            raise AssertionError(
                "A data set may only contain PhysicalProperty objects."
            )

        # Ensure each property has a unique id.
        all_ids = {x.id for x in self}

        if len(all_ids) != len(self):
            raise AssertionError(
                "Each property in a data set must have a unique id."
            )

    def __str__(self):
        return (
            f"n_properties={len(self)} n_substances={len(self.substances)} "
            f"n_sources={len(self.sources)}"
        )

    def __repr__(self):
        return f"<PhysicalPropertyDataSet {str(self)}>"