# Source code for openff.evaluator.datasets.curation.components.conversion

"""This module contains curation components for converting one type of property
(e.g. density) into another (e.g. excess molar volume)."""
import functools
import logging
from typing import TYPE_CHECKING, Union

import pandas
from pydantic import Field
from typing_extensions import Literal

from openff.evaluator.datasets.curation.components import (
    CurationComponent,
    CurationComponentSchema,
)

# While static type checkers analyse the module, pydantic's constrained-type
# names are replaced by plain builtin aliases; at runtime the real `conint`
# factory is imported instead. Presumably this stops checkers from rejecting
# call expressions such as `conint(ge=0)` used as annotations.
if TYPE_CHECKING:

    conint = int
    # NOTE(review): `PositiveInt` and `PositiveFloat` are not referenced in the
    # visible part of this module - confirm whether they are still required.
    PositiveInt = int
    PositiveFloat = float

else:

    from pydantic import conint

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class ConvertExcessDensityDataSchema(CurationComponentSchema):
    # Settings for the `ConvertExcessDensityData` component: the number of decimal
    # places that temperatures and pressures are rounded to before pure and binary
    # measurements are matched on their state point.

    # Discriminator which identifies this schema by name.
    type: Literal["ConvertExcessDensityDataSchema"] = "ConvertExcessDensityDataSchema"

    # Non-negative number of decimal places used when rounding temperatures.
    temperature_precision: conint(ge=0) = Field(
        2,
        description="The number of decimal places to compare temperatures (K) to "
        "within when attempting to identify compatible pure and binary data.",
    )
    # Non-negative number of decimal places used when rounding pressures.
    pressure_precision: conint(ge=0) = Field(
        1,
        description="The number of decimal places to compare pressures (kPa) to "
        "within when attempting to identify compatible pure and binary data.",
    )


class ConvertExcessDensityData(CurationComponent):
    """A component for converting binary mass density data to excess molar volume
    data and vice versa where pure density data measured for the components is
    available.

    Notes
    -----
    This protocol may result in duplicate data points being generated. It is
    recommended to apply the de-duplication filter after this component has been
    applied.
    """

    @classmethod
    @functools.lru_cache(500)
    def _molecular_weight(cls, smiles):
        """Returns the molecular weight (g / mol) of the molecule described by a
        SMILES pattern.

        Parameters
        ----------
        smiles
            The SMILES pattern of the molecule of interest. Undefined
            stereochemistry is permitted when the molecule is built.

        Returns
        -------
            The sum of the masses of every atom in the molecule. Up to 500
            results are cached so repeated components are only evaluated once.
        """

        # Imported lazily so that the toolkits are only required when a
        # conversion is actually performed.
        from openff.toolkit.topology import Molecule
        from openmm import unit as openmm_unit

        molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

        molecular_weight = sum(
            atom.mass.value_in_unit(openmm_unit.gram / openmm_unit.mole)
            for atom in molecule.atoms
        )

        return molecular_weight

    @classmethod
    def _find_overlapping_data_points(
        cls,
        pure_data_set: pandas.DataFrame,
        binary_data_set: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
    ):
        """Finds those binary data points for which there also exists pure
        data points for each component in the binary system.

        Parameters
        ----------
        pure_data_set
            The pure data set.
        binary_data_set
            The binary data set.
        schema
            The schema for this component.

        Returns
        -------
        pandas.DataFrame
            The data set containing the pure and binary data points
            measured for the same substances at the same state points. The
            columns which describe the pure measurement of the first component
            carry an ``_x`` suffix and those of the second component a ``_y``
            suffix. An empty data frame is returned when either input is empty
            or when the pure data set defines no mole fractions.
        """

        if len(pure_data_set) == 0 or len(binary_data_set) == 0:
            return pandas.DataFrame()

        # ``dropna`` returns new frames, so the column assignments below do not
        # modify the frames owned by the caller.
        pure_data_set = pure_data_set.dropna(axis=1, how="all")
        binary_data_set = binary_data_set.dropna(axis=1, how="all")

        # Round the floats which will be compared.
        pure_data_set["Temperature (K)"] = pure_data_set["Temperature (K)"].round(
            schema.temperature_precision
        )
        pure_data_set["Pressure (kPa)"] = pure_data_set["Pressure (kPa)"].round(
            schema.pressure_precision
        )

        binary_data_set["Temperature (K)"] = binary_data_set["Temperature (K)"].round(
            schema.temperature_precision
        )
        binary_data_set["Pressure (kPa)"] = binary_data_set["Pressure (kPa)"].round(
            schema.pressure_precision
        )

        # Only consider pure measurements which only have mole fractions defined
        if "Exact Amount 1" in pure_data_set:
            pure_data_set = pure_data_set[pure_data_set["Exact Amount 1"].isna()]

        if "Mole Fraction 1" not in pure_data_set:
            return pandas.DataFrame()

        pure_data_set = pure_data_set[pure_data_set["Mole Fraction 1"].notna()]

        # Retain only the minimally informative pure data columns.
        data_columns = [
            "Temperature (K)",
            "Pressure (kPa)",
            "Phase",
            "Component 1",
            "Density Value (g / ml)",
            "Source",
        ]

        if "Density Uncertainty (g / ml)" in pure_data_set:
            data_columns.append("Density Uncertainty (g / ml)")

        pure_data_set = pure_data_set[data_columns]

        # Pair each pure measurement with every pure measurement (itself
        # included) made at the same state point. Every non-key column is
        # shared by both sides, so pandas suffixes the first member of the pair
        # with ``_x`` and the second with ``_y``.
        pure_data_set = pandas.merge(
            pure_data_set,
            pure_data_set,
            how="inner",
            on=["Temperature (K)", "Pressure (kPa)", "Phase"],
        )

        # Keep only the binary measurements whose two components match a pure
        # pair at the same state point. Beyond the join keys the two frames are
        # not expected to share column names, hence the empty suffixes.
        overlapping_set = pandas.merge(
            binary_data_set,
            pure_data_set,
            how="inner",
            left_on=[
                "Temperature (K)",
                "Pressure (kPa)",
                "Phase",
                "Component 1",
                "Component 2",
            ],
            right_on=[
                "Temperature (K)",
                "Pressure (kPa)",
                "Phase",
                "Component 1_x",
                "Component 1_y",
            ],
            suffixes=("", ""),
        )

        return overlapping_set

    @classmethod
    def _convert_density_to_v_excess(
        cls, density_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Converts a pandas data frame containing both binary mass densities
        and pure mass densities into one which contains excess molar volume
        measurements.

        Parameters
        ----------
        density_data_set
            The data frame containing both pure and binary
            density measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the excess molar volume measurements.
        """

        # Mole fraction weighted molecular weights (g / mol) of each component.
        m_1 = density_data_set["Component 1"].apply(cls._molecular_weight)
        m_1_x_1 = m_1 * density_data_set["Mole Fraction 1"]

        m_2 = density_data_set["Component 2"].apply(cls._molecular_weight)
        m_2_x_2 = m_2 * density_data_set["Mole Fraction 2"]

        # V_E = (x_1 M_1 + x_2 M_2) / rho_mix - x_1 M_1 / rho_1 - x_2 M_2 / rho_2
        # where the ``_x`` / ``_y`` columns hold the pure densities of
        # components 1 / 2 respectively.
        v_excess = (
            (m_1_x_1 + m_2_x_2) / density_data_set["Density Value (g / ml)"]
            - m_1_x_1 / density_data_set["Density Value (g / ml)_x"]
            - m_2_x_2 / density_data_set["Density Value (g / ml)_y"]
        )

        # Record the provenance of all three measurements used in the conversion.
        source = density_data_set[["Source", "Source_x", "Source_y"]].agg(
            " + ".join, axis=1
        )

        # Add the new values to a new data frame.
        columns_to_drop = [
            x for x in density_data_set if x.endswith("_x") or x.endswith("_y")
        ]
        columns_to_drop.append("Density Value (g / ml)")
        columns_to_drop.append("Source")

        if "Density Uncertainty (g / ml)" in density_data_set:
            columns_to_drop.append("Density Uncertainty (g / ml)")

        v_excess_data_set = density_data_set.drop(columns=columns_to_drop).copy()

        # The converted value and combined source are appended as the last columns.
        v_excess_data_set.insert(
            v_excess_data_set.shape[1],
            "ExcessMolarVolume Value (cm ** 3 / mol)",
            v_excess,
        )
        v_excess_data_set.insert(v_excess_data_set.shape[1], "Source", source)

        return v_excess_data_set

    @classmethod
    def _convert_v_excess_to_density(
        cls, v_excess_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Converts a pandas data frame containing both excess molar volumes
        and pure mass densities into one which contains binary mass density
        measurements.

        Parameters
        ----------
        v_excess_data_set
            The data frame containing both pure density and excess molar
            volume measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the binary mass density measurements.
        """

        # Mole fraction weighted molecular weights (g / mol) of each component.
        m_1 = v_excess_data_set["Component 1"].apply(cls._molecular_weight)
        m_1_x_1 = m_1 * v_excess_data_set["Mole Fraction 1"]

        m_2 = v_excess_data_set["Component 2"].apply(cls._molecular_weight)
        m_2_x_2 = m_2 * v_excess_data_set["Mole Fraction 2"]

        v_excess = v_excess_data_set["ExcessMolarVolume Value (cm ** 3 / mol)"]

        # rho_mix = (x_1 M_1 + x_2 M_2) / (V_E + x_1 M_1 / rho_1 + x_2 M_2 / rho_2)
        # i.e. the excess molar volume expression rearranged for the mixture
        # density.
        denominator = (
            v_excess
            + m_1_x_1 / v_excess_data_set["Density Value (g / ml)_x"]
            + m_2_x_2 / v_excess_data_set["Density Value (g / ml)_y"]
        )

        rho_binary = (m_1_x_1 + m_2_x_2) / denominator

        # Record the provenance of all three measurements used in the conversion.
        source = v_excess_data_set[["Source", "Source_x", "Source_y"]].agg(
            " + ".join, axis=1
        )

        # Add the new values to a new data frame.
        columns_to_drop = [
            x for x in v_excess_data_set if x.endswith("_x") or x.endswith("_y")
        ]
        columns_to_drop.append("ExcessMolarVolume Value (cm ** 3 / mol)")
        columns_to_drop.append("Source")

        if "ExcessMolarVolume Uncertainty (cm ** 3 / mol)" in v_excess_data_set:
            columns_to_drop.append("ExcessMolarVolume Uncertainty (cm ** 3 / mol)")

        density_data_set = v_excess_data_set.drop(columns=columns_to_drop).copy()

        # NOTE: unlike `_convert_density_to_v_excess`, each new column is
        # inserted immediately before the final column rather than appended.
        # The positions are kept as they are so the column layout is unchanged.
        density_data_set.insert(
            density_data_set.shape[1] - 1, "Density Value (g / ml)", rho_binary
        )
        density_data_set.insert(density_data_set.shape[1] - 1, "Source", source)

        return density_data_set

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Appends excess molar volumes converted from binary densities, and
        binary densities converted from excess molar volumes, to a data frame.

        Parameters
        ----------
        data_frame
            The data frame to convert.
        schema
            The schema for this component, which provides the precisions used
            when matching state points.
        n_processes
            Not used by this component.

        Returns
        -------
            The original data frame object when no conversion is possible,
            otherwise a new data frame which contains the original rows
            followed by the converted rows, with a fresh index.
        """

        if len(data_frame) == 0:
            return data_frame

        # Check to make sure the data frame contains at least a
        # density column which may store pure densities.
        if "Density Value (g / ml)" not in data_frame:
            return data_frame

        # Separate out the data sets of interest
        pure_density_data = data_frame[
            (data_frame["Density Value (g / ml)"].notna())
            & (data_frame["N Components"] == 1)
        ]

        pure_density_data = pure_density_data.dropna(axis=1, how="all")

        # Exit early if no pure densities can be found.
        if len(pure_density_data) == 0:
            return data_frame

        # Add the pure data to the binary data sets to make conversion easier.
        binary_density_data = data_frame[
            (data_frame["Density Value (g / ml)"].notna())
            & (data_frame["N Components"] == 2)
        ]
        binary_density_data = binary_density_data.dropna(axis=1, how="all")

        binary_density_data = cls._find_overlapping_data_points(
            pure_density_data, binary_density_data, schema
        )

        v_excess_data = pandas.DataFrame()

        if "ExcessMolarVolume Value (cm ** 3 / mol)" in data_frame:

            v_excess_data = data_frame[
                (data_frame["ExcessMolarVolume Value (cm ** 3 / mol)"].notna())
                & (data_frame["N Components"] == 2)
            ]
            v_excess_data = v_excess_data.dropna(axis=1, how="all")
            v_excess_data = cls._find_overlapping_data_points(
                pure_density_data, v_excess_data, schema
            )

        if len(binary_density_data) == 0 and len(v_excess_data) == 0:
            return data_frame

        # Inter-convert the two sets. At least one of them is non-empty at this
        # point, so there is always something to concatenate with the original.
        data_to_concat = [data_frame]

        if len(binary_density_data) > 0:

            v_excess_from_density = cls._convert_density_to_v_excess(
                binary_density_data
            )
            data_to_concat.append(v_excess_from_density)

        if len(v_excess_data) > 0:

            density_from_v_excess = cls._convert_v_excess_to_density(v_excess_data)
            data_to_concat.append(density_from_v_excess)

        return pandas.concat(
            data_to_concat,
            ignore_index=True,
            sort=False,
        )


# Union of the schemas of the conversion components defined in this module
# (currently a single member).
ConversionComponentSchema = Union[ConvertExcessDensityDataSchema]