# Source code for openff.evaluator.datasets.curation.components.filtering

import functools
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import numpy
import pandas
from openff.units import unit
from pydantic import Field, root_validator, validator
from scipy.optimize import linear_sum_assignment
from typing_extensions import Literal

from openff.evaluator.datasets.curation.components import (
    CurationComponent,
    CurationComponentSchema,
)
from openff.evaluator.datasets.utilities import (
    data_frame_to_substances,
    reorder_data_frame,
)
from openff.evaluator.utils.checkmol import (
    ChemicalEnvironment,
    analyse_functional_groups,
)

if TYPE_CHECKING:

    # Presumably plain built-in stand-ins so that static analysis sees simple types
    # in place of pydantic's constrained-type factories. Every name imported in the
    # runtime branch below needs a counterpart here, including `constr`.
    conint = int
    confloat = float
    constr = str
    PositiveInt = int
    PositiveFloat = float

else:

    from pydantic import PositiveFloat, PositiveInt, confloat, conint, constr

logger = logging.getLogger(__name__)

# A nested list of chemical environments.
ComponentEnvironments = List[List[ChemicalEnvironment]]
# A pair of mole fractions, each constrained to lie within [0, 1].
MoleFractionRange = Tuple[confloat(ge=0.0, le=1.0), confloat(ge=0.0, le=1.0)]


class FilterDuplicatesSchema(CurationComponentSchema):
    # Settings for `FilterDuplicates`: the number of decimal places that the
    # temperature, pressure and mole fraction columns are rounded to before rows
    # are compared.

    type: Literal["FilterDuplicates"] = "FilterDuplicates"

    temperature_precision: conint(ge=0) = Field(
        default=2,
        description="The number of decimal places to compare temperatures (K) to within.",
    )
    pressure_precision: conint(ge=0) = Field(
        default=3,
        description="The number of decimal places to compare pressures (kPa) to within.",
    )
    mole_fraction_precision: conint(ge=0) = Field(
        default=6,
        description="The number of decimal places to compare mole fractions to within.",
    )


class FilterDuplicates(CurationComponent):
    """Removes duplicate data points (within a specified precision) from a data set.

    Rows are grouped by their number of components. Within each group the
    temperature, pressure and mole fraction columns are rounded to the precisions
    given by the schema, and, separately for every property value column, rows which
    share the same state and composition are collapsed into one. When an uncertainty
    column exists, rows are sorted by it in ascending order (missing uncertainties
    first) and the last row of each set of duplicates is the one that is kept.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterDuplicatesSchema, n_processes
    ) -> pandas.DataFrame:
        """Return a new frame containing the de-duplicated rows of ``data_frame``.

        Parameters
        ----------
        data_frame
            The data set to filter. An empty frame is returned unchanged.
        schema
            Provides the rounding precision for each compared quantity.
        n_processes
            Unused by this component.
        """

        if not len(data_frame):
            return data_frame

        working_frame = reorder_data_frame(data_frame.copy())
        component_counts = working_frame["N Components"]

        per_count_frames = []

        for n_components in range(component_counts.min(), component_counts.max() + 1):

            subset = working_frame[component_counts == n_components].copy()

            # Round the state columns so that near-identical states compare equal.
            for state_header, precision in (
                ("Temperature (K)", schema.temperature_precision),
                ("Pressure (kPa)", schema.pressure_precision),
            ):
                subset[state_header] = subset[state_header].round(precision)

            key_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]

            for label in range(1, n_components + 1):

                fraction_header = f"Mole Fraction {label}"

                subset[fraction_header] = subset[fraction_header].round(
                    schema.mole_fraction_precision
                )

                key_columns += [
                    f"Component {label}",
                    f"Role {label}",
                    fraction_header,
                    f"Exact Amount {label}",
                ]

            # Only compare on the columns which this frame actually contains.
            key_columns = [column for column in key_columns if column in subset]
            value_headers = [column for column in subset if " Value " in column]

            per_property_frames = []

            for value_header in value_headers:

                uncertainty_header = value_header.replace("Value", "Uncertainty")

                measured = subset[subset[value_header].notna()]

                if uncertainty_header in subset:
                    measured = measured.sort_values(
                        uncertainty_header, na_position="first"
                    )

                per_property_frames.append(
                    measured.drop_duplicates(subset=key_columns, keep="last")
                )

            per_count_frames.append(
                pandas.concat(per_property_frames, ignore_index=True, sort=False)
            )

        return pandas.concat(per_count_frames, ignore_index=True, sort=False)


class FilterByTemperatureSchema(CurationComponentSchema):
    # Settings for `FilterByTemperature`. Both bounds must be supplied explicitly,
    # but either may be `None` to leave that side of the range open.

    type: Literal["FilterByTemperature"] = "FilterByTemperature"

    minimum_temperature: Optional[PositiveFloat] = Field(
        ...,
        description="Retain data points measured for temperatures above this value (K)",
    )
    maximum_temperature: Optional[PositiveFloat] = Field(
        ...,
        description="Retain data points measured for temperatures below this value (K)",
    )

    @root_validator
    def _min_max(cls, values):
        # Reject ranges whose upper bound does not exceed the lower bound. An
        # explicit exception is raised (rather than using `assert`) so that the
        # check still runs when Python is started with `-O`.
        minimum_temperature = values.get("minimum_temperature")
        maximum_temperature = values.get("maximum_temperature")

        if (
            minimum_temperature is not None
            and maximum_temperature is not None
            and not maximum_temperature > minimum_temperature
        ):
            raise ValueError(
                "`maximum_temperature` must be greater than `minimum_temperature`"
            )

        return values


class FilterByTemperature(CurationComponent):
    """Retains only those data points whose temperature lies strictly inside the
    range defined by the schema; a bound which is ``None`` is not applied.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByTemperatureSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` inside the requested temperature range.

        ``n_processes`` is unused by this component.
        """

        lower_bound = schema.minimum_temperature
        upper_bound = schema.maximum_temperature

        retained = data_frame

        if lower_bound is not None:
            retained = retained[retained["Temperature (K)"] > lower_bound]

        if upper_bound is not None:
            retained = retained[retained["Temperature (K)"] < upper_bound]

        return retained


class FilterByPressureSchema(CurationComponentSchema):
    # Settings for `FilterByPressure`. Both bounds must be supplied explicitly,
    # but either may be `None` to leave that side of the range open.

    type: Literal["FilterByPressure"] = "FilterByPressure"

    minimum_pressure: Optional[PositiveFloat] = Field(
        ...,
        description="Retain data points measured for pressures above this value (kPa)",
    )
    maximum_pressure: Optional[PositiveFloat] = Field(
        ...,
        description="Retain data points measured for pressures below this value (kPa)",
    )

    @root_validator
    def _min_max(cls, values):
        # Reject ranges whose upper bound does not exceed the lower bound. An
        # explicit exception is raised (rather than using `assert`) so that the
        # check still runs when Python is started with `-O`.
        minimum_pressure = values.get("minimum_pressure")
        maximum_pressure = values.get("maximum_pressure")

        if (
            minimum_pressure is not None
            and maximum_pressure is not None
            and not maximum_pressure > minimum_pressure
        ):
            raise ValueError(
                "`maximum_pressure` must be greater than `minimum_pressure`"
            )

        return values


class FilterByPressure(CurationComponent):
    """Retains only those data points whose pressure lies strictly inside the range
    defined by the schema; a bound which is ``None`` is not applied.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterByPressureSchema, n_processes
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` inside the requested pressure range.

        ``n_processes`` is unused by this component.
        """

        lower_bound = schema.minimum_pressure
        upper_bound = schema.maximum_pressure

        retained = data_frame

        if lower_bound is not None:
            retained = retained[retained["Pressure (kPa)"] > lower_bound]

        if upper_bound is not None:
            retained = retained[retained["Pressure (kPa)"] < upper_bound]

        return retained


class FilterByMoleFractionSchema(CurationComponentSchema):
    # Settings for `FilterByMoleFraction`: the permitted mole fraction ranges,
    # keyed by the number of components in the system.

    type: Literal["FilterByMoleFraction"] = "FilterByMoleFraction"

    mole_fraction_ranges: Dict[conint(gt=1), List[List[MoleFractionRange]]] = Field(
        ...,
        description="The ranges of mole fractions to retain. Each key in the "
        "dictionary corresponds to a number of components in the system. Each value "
        "is a list of the allowed mole fraction ranges for all but one of the "
        "components, i.e for a binary system, the allowed mole fraction for only the "
        "first component must be specified.",
    )

    @validator("mole_fraction_ranges")
    def _validate_ranges(cls, value: Dict[int, List[List[MoleFractionRange]]]):
        # Check that ranges are given for all but one component, and that each
        # range is a strictly increasing (lower, upper) pair. Explicit exceptions
        # are raised (rather than using `assert`) so that the checks still run
        # when Python is started with `-O`.
        for n_components, ranges in value.items():

            if len(ranges) != n_components - 1:
                raise ValueError(
                    f"systems with {n_components} components require ranges for "
                    f"{n_components - 1} components but {len(ranges)} were provided"
                )

            for component_ranges in ranges:
                for mole_fraction_range in component_ranges:

                    if not mole_fraction_range[0] < mole_fraction_range[1]:
                        raise ValueError(
                            "the lower bound of each mole fraction range must be "
                            "less than its upper bound"
                        )

        return value


class FilterByMoleFraction(CurationComponent):
    """A component which will filter out data points which were measured outside of a
    specified mole fraction range.

    Rows whose number of components has no entry in the schema are always retained.
    For the remaining rows, every component which has ranges defined must have a
    mole fraction lying strictly inside at least one of its ranges.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByMoleFractionSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` which satisfy the mole fraction ranges.

        Parameters
        ----------
        data_frame
            The data set to filter.
        schema
            Provides the permitted ranges keyed by number of components.
        n_processes
            Unused by this component.
        """

        n_components_column = data_frame["N Components"]

        # Systems with no ranges defined for their size are kept unconditionally.
        full_query = ~n_components_column.isin(list(schema.mole_fraction_ranges))

        for n_components, ranges in schema.mole_fraction_ranges.items():

            n_component_query = n_components_column == n_components

            if not n_component_query.any():
                # No rows of this size exist, so there is nothing to select, and the
                # mole fraction columns for the higher components may not even be
                # present in the frame.
                continue

            for index, component_ranges in enumerate(ranges):

                mole_fractions = data_frame[f"Mole Fraction {index + 1}"]

                # Start from 'no match' so that a component with an empty list of
                # ranges matches nothing rather than failing.
                component_query = pandas.Series(False, index=data_frame.index)

                for mole_fraction_range in component_ranges:

                    component_query |= (mole_fractions > mole_fraction_range[0]) & (
                        mole_fractions < mole_fraction_range[1]
                    )

                n_component_query &= component_query

            full_query |= n_component_query

        return data_frame[full_query]


class FilterByRacemicSchema(CurationComponentSchema):
    # Settings for `FilterByRacemic`; only the literal `type` tag is declared.
    type: Literal["FilterByRacemic"] = "FilterByRacemic"


class FilterByRacemic(CurationComponent):
    """A component which will filter out data points which were measured for racemic
    mixtures.

    A mixture is treated as racemic when any two of its components have SMILES
    patterns which are identical once every ``@`` character has been removed.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByRacemicSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` which are not racemic mixtures.

        Parameters
        ----------
        data_frame
            The data set to filter. An empty frame is returned unchanged.
        schema
            Carries no options for this component.
        n_processes
            Unused by this component.
        """

        if len(data_frame) == 0:
            # The maximum number of components is undefined for an empty frame.
            return data_frame

        # Begin building the query. All pure substances should be
        # retained by default.
        query = data_frame["N Components"] < 2

        for n_components in range(2, data_frame["N Components"].max() + 1):

            component_data = data_frame[data_frame["N Components"] == n_components]

            if len(component_data) == 0:
                continue

            component_combinations = itertools.combinations(range(n_components), 2)

            is_racemic = None

            for index_0, index_1 in component_combinations:

                # Two components which differ only by their `@` characters are
                # flagged as a racemic pair.
                components_racemic = component_data[
                    f"Component {index_0 + 1}"
                ].str.replace("@", "") == component_data[
                    f"Component {index_1 + 1}"
                ].str.replace(
                    "@", ""
                )

                is_racemic = (
                    components_racemic
                    if is_racemic is None
                    else (is_racemic | components_racemic)
                )

            not_racemic = ~is_racemic
            query |= not_racemic

        filtered_frame = data_frame[query]
        return filtered_frame


class FilterByElementsSchema(CurationComponentSchema):
    # Settings for `FilterByElements`. Exactly one of `allowed_elements` and
    # `forbidden_elements` must be provided.

    type: Literal["FilterByElements"] = "FilterByElements"

    allowed_elements: Optional[List[constr(min_length=1)]] = Field(
        None,
        description="The only elements which must be present in the measured system "
        "for the data point to be retained. This option is mutually exclusive with "
        "`forbidden_elements`",
    )
    forbidden_elements: Optional[List[constr(min_length=1)]] = Field(
        None,
        description="The elements which must not be present in the measured system for "
        "the data point to be retained. This option is mutually exclusive with "
        "`allowed_elements`",
    )

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        # Explicit exceptions are raised (rather than using `assert`) so that the
        # checks still run when Python is started with `-O`.
        allowed_elements = values.get("allowed_elements")
        forbidden_elements = values.get("forbidden_elements")

        if allowed_elements is None and forbidden_elements is None:
            raise ValueError(
                "one of `allowed_elements` or `forbidden_elements` must be provided"
            )
        if allowed_elements is not None and forbidden_elements is not None:
            raise ValueError(
                "`allowed_elements` and `forbidden_elements` are mutually exclusive"
            )

        return values


class FilterByElements(CurationComponent):
    """Filters data points according to the elements which appear in the components
    of the measured system.

    When ``allowed_elements`` is set every atom of every component must be one of
    those elements; when ``forbidden_elements`` is set no atom may be one of them.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterByElementsSchema, n_processes
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` which satisfy the element constraints.

        ``n_processes`` is unused by this component.
        """

        from openff.toolkit.topology import Molecule

        allowed = schema.allowed_elements
        forbidden = schema.forbidden_elements

        def filter_function(data_row):

            for index in range(data_row["N Components"]):

                molecule = Molecule.from_smiles(
                    data_row[f"Component {index + 1}"], allow_undefined_stereo=True
                )
                symbols = [atom.symbol for atom in molecule.atoms]

                if allowed is not None and any(
                    symbol not in allowed for symbol in symbols
                ):
                    return False

                if forbidden is not None and any(
                    symbol in forbidden for symbol in symbols
                ):
                    return False

            return True

        # noinspection PyTypeChecker
        rows_to_keep = data_frame.apply(filter_function, axis=1)
        return data_frame[rows_to_keep]


class FilterByPropertyTypesSchema(CurationComponentSchema):
    # Settings for `FilterByPropertyTypes`: which property types to keep, and
    # optionally the system sizes each type must have been measured for.

    type: Literal["FilterByPropertyTypes"] = "FilterByPropertyTypes"

    property_types: List[constr(min_length=1)] = Field(
        ...,
        description="The types of property to retain.",
    )
    n_components: Dict[constr(min_length=1), List[PositiveInt]] = Field(
        default_factory=dict,
        description="Optionally specify the number of components that a property "
        "should have been measured for (e.g. pure, binary) in order for that data "
        "point to be retained.",
    )

    strict: bool = Field(
        False,
        description="If true, only substances (defined without consideration for their "
        "mole fractions or exact amount) which have data available for all of the "
        "specified property types will be retained. Note that the data points aren't "
        "required to have been measured at the same state.",
    )

    @root_validator
    def _validate_n_components(cls, values):
        # Every key of `n_components` must name one of the retained property types.
        property_types = values.get("property_types")
        n_components = values.get("n_components")

        if property_types is None or n_components is None:
            # One of the fields is unavailable (it is read with `.get` because it
            # may be missing from `values`), so there is nothing to cross-check.
            return values

        unknown_types = [x for x in n_components if x not in property_types]

        # An explicit exception is raised (rather than using `assert`) so that the
        # check still runs when Python is started with `-O`.
        if len(unknown_types) > 0:
            raise ValueError(
                f"`n_components` contains property types which do not appear in "
                f"`property_types`: {unknown_types}"
            )

        return values


class FilterByPropertyTypes(CurationComponent):
    """A component which will apply a filter which only retains properties of specified
    types.

    The filter proceeds in stages: columns of unwanted property types are dropped,
    rows left without any value are dropped, rows measured for a disallowed number
    of components are dropped and, when ``strict`` is set, only substances which
    have data for every requested property type are kept.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByPropertyTypesSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows and columns of ``data_frame`` selected by ``schema``.

        Parameters
        ----------
        data_frame
            The data set to filter.
        schema
            The property types (and optionally system sizes) to retain.
        n_processes
            Unused by this component.
        """

        property_headers = [
            header for header in data_frame if header.find(" Value ") >= 0
        ]

        # Removes the columns for properties which are not of interest.
        for header in property_headers:

            # The property type is the first word of the column header.
            property_type = header.split(" ")[0]

            if property_type in schema.property_types:
                continue

            data_frame = data_frame.drop(header, axis=1)

            uncertainty_header = header.replace(" Value ", " Uncertainty ")

            if uncertainty_header in data_frame:
                data_frame = data_frame.drop(uncertainty_header, axis=1)

        # Drop any rows which do not contain any values for the property types of
        # interest.
        property_headers = [
            header
            for header in property_headers
            if header.split(" ")[0] in schema.property_types
        ]

        data_frame = data_frame.dropna(subset=property_headers, how="all")

        # Apply a more specific filter which only retains rows which contain values
        # for the specific property types, and which were measured for the
        # specified number of components.
        for property_type, n_components in schema.n_components.items():

            property_header = next(
                iter(x for x in property_headers if x.find(f"{property_type} ") == 0),
                None,
            )

            if property_header is None:
                continue

            data_frame = data_frame[
                data_frame[property_header].isna()
                | data_frame["N Components"].isin(n_components)
            ]

        # Apply the strict filter if requested. There is nothing to do for an empty
        # frame, and the range of component counts would be undefined for it.
        if schema.strict and len(data_frame) > 0:

            reordered_data_frame = reorder_data_frame(data_frame)

            # Build a dictionary of which properties should be present partitioned
            # by the number of components they should have been be measured for.
            property_types = defaultdict(list)

            if len(schema.n_components) > 0:

                for property_type, n_components in schema.n_components.items():

                    for n_component in n_components:
                        property_types[n_component].append(property_type)

                min_n_components = min(property_types)
                max_n_components = max(property_types)

            else:

                min_n_components = reordered_data_frame["N Components"].min()
                max_n_components = reordered_data_frame["N Components"].max()

                for n_components in range(min_n_components, max_n_components + 1):
                    property_types[n_components].extend(schema.property_types)

            substances_with_data = set()
            components_with_data = {}

            # For each N component find substances which have data points for
            # all of the specified property types.
            for n_components in range(min_n_components, max_n_components + 1):

                component_data = reordered_data_frame[
                    reordered_data_frame["N Components"] == n_components
                ]

                if n_components not in property_types or len(component_data) == 0:
                    continue

                n_component_headers = [
                    header
                    for header in property_headers
                    if header.split(" ")[0] in property_types[n_components]
                    and header in component_data
                ]

                # Skip this system size if a requested property has no column.
                if len(n_component_headers) != len(property_types[n_components]):
                    continue

                n_component_substances = set.intersection(
                    *[
                        data_frame_to_substances(
                            component_data[component_data[header].notna()]
                        )
                        for header in n_component_headers
                    ]
                )
                substances_with_data.update(n_component_substances)
                components_with_data[n_components] = {
                    component
                    for substance in n_component_substances
                    for component in substance
                }

            if len(schema.n_components) > 0:

                # `set.intersection` cannot be called without arguments, so fall
                # back to an empty set when no system size had complete data.
                components_with_all_data = (
                    set.intersection(*components_with_data.values())
                    if len(components_with_data) > 0
                    else set()
                )

                # Filter out any smiles which don't appear in all of the N component
                # substances.
                data_frame = FilterBySmiles.apply(
                    data_frame,
                    FilterBySmilesSchema(smiles_to_include=[*components_with_all_data]),
                )

            # Filter out any substances which (within each N component) don't have
            # all of the specified data types.
            data_frame = FilterBySubstances.apply(
                data_frame,
                FilterBySubstancesSchema(substances_to_include=[*substances_with_data]),
            )

        data_frame = data_frame.dropna(axis=1, how="all")
        return data_frame


class FilterByStereochemistrySchema(CurationComponentSchema):
    # Settings for `FilterByStereochemistry`; only the literal `type` tag is
    # declared.
    type: Literal["FilterByStereochemistry"] = "FilterByStereochemistry"


class FilterByStereochemistry(CurationComponent):
    """Removes data points measured for systems in which creating a molecule from
    the SMILES pattern of any component raises an undefined stereochemistry error.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByStereochemistrySchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` whose components all have defined
        stereochemistry.

        ``schema`` carries no options and ``n_processes`` is unused.
        """

        from openff.toolkit.topology import Molecule
        from openff.toolkit.utils import UndefinedStereochemistryError

        def has_defined_stereochemistry(smiles):

            try:
                Molecule.from_smiles(smiles)
            except UndefinedStereochemistryError:
                return False

            return True

        def filter_function(data_row):

            return all(
                has_defined_stereochemistry(data_row[f"Component {index + 1}"])
                for index in range(data_row["N Components"])
            )

        # noinspection PyTypeChecker
        rows_to_keep = data_frame.apply(filter_function, axis=1)
        return data_frame[rows_to_keep]


class FilterByChargedSchema(CurationComponentSchema):
    # Settings for `FilterByCharged`; only the literal `type` tag is declared.
    type: Literal["FilterByCharged"] = "FilterByCharged"


class FilterByCharged(CurationComponent):
    """Removes data points measured for substances in which the formal charges of
    the atoms of any component do not sum to (approximately) zero.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterByChargedSchema, n_processes
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` whose components are all net neutral.

        ``schema`` carries no options and ``n_processes`` is unused.
        """

        from openff.toolkit.topology import Molecule

        def formal_charge_of(atom):

            # The formal charge is either a plain integer or a quantity which is
            # converted to a magnitude in units of elementary charge.
            if isinstance(atom.formal_charge, int):
                return atom.formal_charge

            # noinspection PyUnresolvedReferences
            return atom.formal_charge.m_as(unit.elementary_charge)

        def is_neutral(smiles):

            molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)
            net_charge = sum([formal_charge_of(atom) for atom in molecule.atoms])

            return numpy.isclose(net_charge, 0.0)

        def filter_function(data_row):

            return all(
                is_neutral(data_row[f"Component {index + 1}"])
                for index in range(data_row["N Components"])
            )

        # noinspection PyTypeChecker
        rows_to_keep = data_frame.apply(filter_function, axis=1)
        return data_frame[rows_to_keep]


class FilterByIonicLiquidSchema(CurationComponentSchema):
    # Settings for `FilterByIonicLiquid`; only the literal `type` tag is declared.
    type: Literal["FilterByIonicLiquid"] = "FilterByIonicLiquid"


class FilterByIonicLiquid(CurationComponent):
    """Removes data points measured for substances which contain, or are classed
    as, an ionic liquid.

    A component is treated as an ionic liquid when its SMILES pattern contains a
    ``.`` character.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByIonicLiquidSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` in which no component SMILES pattern
        contains a ``.``.

        ``schema`` carries no options and ``n_processes`` is unused.
        """

        def filter_function(data_row):

            return not any(
                "." in data_row[f"Component {index + 1}"]
                for index in range(data_row["N Components"])
            )

        # noinspection PyTypeChecker
        rows_to_keep = data_frame.apply(filter_function, axis=1)
        return data_frame[rows_to_keep]


class FilterBySmilesSchema(CurationComponentSchema):
    # Settings for `FilterBySmiles`. Exactly one of `smiles_to_include` and
    # `smiles_to_exclude` must be provided.
    type: Literal["FilterBySmiles"] = "FilterBySmiles"

    smiles_to_include: Optional[List[str]] = Field(
        None,
        description="The smiles patterns to retain. This option is mutually "
        "exclusive with `smiles_to_exclude`",
    )
    smiles_to_exclude: Optional[List[str]] = Field(
        None,
        description="The smiles patterns to exclude. This option is mutually "
        "exclusive with `smiles_to_include`",
    )
    allow_partial_inclusion: bool = Field(
        False,
        description="If False, all the components in a substance must appear in "
        "the `smiles_to_include` list, otherwise, only some must appear. "
        "This option only applies when `smiles_to_include` is set.",
    )

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        # Explicit exceptions are raised (rather than using `assert`) so that the
        # checks still run when Python is started with `-O`.
        smiles_to_include = values.get("smiles_to_include")
        smiles_to_exclude = values.get("smiles_to_exclude")

        if smiles_to_include is None and smiles_to_exclude is None:
            raise ValueError(
                "one of `smiles_to_include` or `smiles_to_exclude` must be provided"
            )
        if smiles_to_include is not None and smiles_to_exclude is not None:
            raise ValueError(
                "`smiles_to_include` and `smiles_to_exclude` are mutually exclusive"
            )

        return values


class FilterBySmiles(CurationComponent):
    """A component which filters the data set so that it only contains either a
    specific set of smiles, or does not contain any of a set of specifically excluded
    smiles.

    Component SMILES patterns are compared with the schema's patterns by exact
    string equality.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterBySmilesSchema, n_processes
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` selected by the schema's smiles lists.

        Parameters
        ----------
        data_frame
            The data set to filter.
        schema
            Provides either the smiles to include or the smiles to exclude.
        n_processes
            Unused by this component.
        """

        smiles_to_include = schema.smiles_to_include
        smiles_to_exclude = schema.smiles_to_exclude

        allow_partial_inclusion = schema.allow_partial_inclusion

        def filter_function(data_row):

            n_components = data_row["N Components"]

            component_smiles = [
                data_row[f"Component {index + 1}"] for index in range(n_components)
            ]

            # Decide the mode from which option was provided rather than from
            # whether the list is non-empty, so that an empty exclusion list
            # retains every row instead of falling through to the inclusion
            # checks and removing them all.
            if smiles_to_exclude is not None:
                return not any(x in smiles_to_exclude for x in component_smiles)

            if allow_partial_inclusion:
                return any(x in smiles_to_include for x in component_smiles)

            return all(x in smiles_to_include for x in component_smiles)

        # noinspection PyTypeChecker
        return data_frame[data_frame.apply(filter_function, axis=1)]


class FilterBySmirksSchema(CurationComponentSchema):
    # Settings for `FilterBySmirks`. Exactly one of `smirks_to_include` and
    # `smirks_to_exclude` must be provided.

    type: Literal["FilterBySmirks"] = "FilterBySmirks"

    smirks_to_include: Optional[List[str]] = Field(
        None,
        description="The smirks patterns which must be matched by a substance in "
        "order to retain a measurement. This option is mutually exclusive with "
        "`smirks_to_exclude`",
    )
    smirks_to_exclude: Optional[List[str]] = Field(
        None,
        description="The smirks patterns which must not be matched by a substance in "
        "order to retain a measurement. This option is mutually exclusive with "
        "`smirks_to_include`",
    )
    allow_partial_inclusion: bool = Field(
        False,
        description="If False, all the components in a substance must match at least "
        "one pattern in `smirks_to_include` in order to retain a measurement, "
        "otherwise, only a least one component must match. This option only applies "
        "when `smirks_to_include` is set.",
    )

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        # Explicit exceptions are raised (rather than using `assert`) so that the
        # checks still run when Python is started with `-O`.
        smirks_to_include = values.get("smirks_to_include")
        smirks_to_exclude = values.get("smirks_to_exclude")

        if smirks_to_include is None and smirks_to_exclude is None:
            raise ValueError(
                "one of `smirks_to_include` or `smirks_to_exclude` must be provided"
            )
        if smirks_to_include is not None and smirks_to_exclude is not None:
            raise ValueError(
                "`smirks_to_include` and `smirks_to_exclude` are mutually exclusive"
            )

        return values


class FilterBySmirks(CurationComponent):
    """A component which filters a data set so that it only contains measurements made
    for molecules which contain (or don't) a set of chemical environments
    represented by SMIRKS patterns.
    """

    @staticmethod
    @functools.lru_cache(1000)
    def _find_smirks_matches(smiles_pattern, *smirks_patterns):
        """Determines which (if any) of the specified smirks match the specified
        molecule.

        Results are memoised on the (smiles, smirks...) arguments, so the returned
        list should not be modified by callers.

        Parameters
        ----------
        smiles_pattern: str
            The SMILES representation to try and match against.
        smirks_patterns: str
            The smirks patterns to try and match.

        Returns
        -------
        list of str
            The matched smirks patterns.
        """

        from openff.toolkit.topology import Molecule

        if len(smirks_patterns) == 0:
            return []

        molecule = Molecule.from_smiles(smiles_pattern, allow_undefined_stereo=True)

        matches = [
            smirks
            for smirks in smirks_patterns
            if len(molecule.chemical_environment_matches(smirks)) > 0
        ]

        return matches

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterBySmirksSchema, n_processes
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` selected by the schema's smirks lists.

        Parameters
        ----------
        data_frame
            The data set to filter.
        schema
            Provides either the smirks to include or the smirks to exclude.
        n_processes
            Unused by this component.
        """

        # Select on which option was provided rather than on truthiness, so that an
        # empty `smirks_to_include` list is not replaced by the (unset) exclusion
        # list.
        smirks_to_match = (
            schema.smirks_to_include
            if schema.smirks_to_include is not None
            else schema.smirks_to_exclude
        )

        def filter_function(data_row):

            n_components = data_row["N Components"]

            component_smiles = [
                data_row[f"Component {index + 1}"] for index in range(n_components)
            ]

            smirks_matches = {
                smiles: cls._find_smirks_matches(smiles, *smirks_to_match)
                for smiles in component_smiles
            }

            if schema.smirks_to_exclude is not None:
                return not any(len(x) > 0 for x in smirks_matches.values())

            if schema.allow_partial_inclusion:
                return any(len(x) > 0 for x in smirks_matches.values())

            return all(len(x) > 0 for x in smirks_matches.values())

        # noinspection PyTypeChecker
        return data_frame[data_frame.apply(filter_function, axis=1)]


class FilterByNComponentsSchema(CurationComponentSchema):
    # Settings for `FilterByNComponents`: the system sizes to keep.

    type: Literal["FilterByNComponents"] = "FilterByNComponents"

    n_components: List[PositiveInt] = Field(
        ...,
        description=(
            "The number of components that measurements should have been measured "
            "for in order to be retained."
        ),
    )


class FilterByNComponents(CurationComponent):
    """Retains only those data points which were measured for systems whose number
    of components appears in the schema's ``n_components`` list.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByNComponentsSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` with a permitted number of components.

        ``n_processes`` is unused by this component.
        """

        has_permitted_size = data_frame["N Components"].isin(schema.n_components)
        return data_frame[has_permitted_size]


class FilterBySubstancesSchema(CurationComponentSchema):
    # Settings for `FilterBySubstances`. Exactly one of `substances_to_include`
    # and `substances_to_exclude` must be provided.

    type: Literal["FilterBySubstances"] = "FilterBySubstances"

    substances_to_include: Optional[List[Tuple[str, ...]]] = Field(
        None,
        description="The substances compositions to retain, where each tuple in the "
        "list contains the smiles patterns which make up the substance to include. "
        "This option is mutually exclusive with `substances_to_exclude`.",
    )
    substances_to_exclude: Optional[List[Tuple[str, ...]]] = Field(
        None,
        description="The substances compositions to exclude, where each tuple in the "
        "list contains the smiles patterns which make up the substance to exclude. "
        "This option is mutually exclusive with `substances_to_include`.",
    )

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        # Explicit exceptions are raised (rather than using `assert`) so that the
        # checks still run when Python is started with `-O`.
        substances_to_include = values.get("substances_to_include")
        substances_to_exclude = values.get("substances_to_exclude")

        if substances_to_include is None and substances_to_exclude is None:
            raise ValueError(
                "one of `substances_to_include` or `substances_to_exclude` must be "
                "provided"
            )
        if substances_to_include is not None and substances_to_exclude is not None:
            raise ValueError(
                "`substances_to_include` and `substances_to_exclude` are mutually "
                "exclusive"
            )

        return values


class FilterBySubstances(CurationComponent):
    """A component which filters the data set so that it only contains properties
    measured for particular substances.

    This is similar to filtering by smiles (see `FilterBySmilesSchema`), however
    here we explicitly define the full substances compositions, rather than
    individual smiles which should either be included or excluded.

    The order of the smiles within each substance tuple is not significant.

    Examples
    --------
    To filter the data set to only include measurements for pure methanol, pure
    benzene or an aqueous ethanol mix:

    >>> schema = FilterBySubstancesSchema(
    ...     substances_to_include=[
    ...         ('CO',),
    ...         ('C1=CC=CC=C1',),
    ...         ('CCO', 'O')
    ...     ]
    ... )

    To filter out measurements made for an aqueous mix of benzene:

    >>> schema = FilterBySubstancesSchema(
    ...     substances_to_exclude=[('O', 'C1=CC=CC=C1')]
    ... )
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterBySubstancesSchema, n_processes
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` whose substance is either one of
        ``schema.substances_to_include`` or not one of
        ``schema.substances_to_exclude``. ``n_processes`` is not used by this
        filter.
        """

        def normalise(substances):
            """Convert a list of substances into a set of sorted smiles tuples so
            that comparisons ignore the order in which components are listed."""

            if substances is None:
                return None

            return {tuple(sorted(substance)) for substance in substances}

        # The schema does not change between rows, so normalise it only once
        # rather than re-sorting both lists inside of the per-row function.
        substances_to_include = normalise(schema.substances_to_include)
        substances_to_exclude = normalise(schema.substances_to_exclude)

        def filter_function(data_row):

            n_components = data_row["N Components"]

            # Sort the row's components so they can be compared with the
            # (sorted) substances requested by the schema.
            substance = tuple(
                sorted(
                    data_row[f"Component {index + 1}"]
                    for index in range(n_components)
                )
            )

            return (
                substances_to_exclude is not None
                and substance not in substances_to_exclude
            ) or (
                substances_to_include is not None and substance in substances_to_include
            )

        # noinspection PyTypeChecker
        return data_frame[data_frame.apply(filter_function, axis=1)]


class FilterByEnvironmentsSchema(CurationComponentSchema):
    """The settings of the `FilterByEnvironments` component.

    Exactly one of `per_component_environments` and `environments` must be
    specified, and exactly one of `at_least_one_environment` and
    `strictly_specified_environments` must be true.
    """

    type: Literal["FilterByEnvironments"] = "FilterByEnvironments"

    per_component_environments: Optional[Dict[int, ComponentEnvironments]] = Field(
        None,
        description="The environments which should be present in the components of "
        "the substance for which the measurements were made. Each dictionary "
        "key corresponds to a number of components in the system, and each "
        "value the environments which should be matched by those n components. "
        "This option is mutually exclusive with `environments`.",
    )
    environments: Optional[List[ChemicalEnvironment]] = Field(
        None,
        description="The environments which should be present in the substances for "
        "which measurements were made. This option is mutually exclusive with "
        "`per_component_environments`.",
    )

    at_least_one_environment: bool = Field(
        True,
        description="If true, data points will only be retained if all of the "
        "components in the measured system contain at least one of the specified "
        "environments. This option is mutually exclusive with "
        "`strictly_specified_environments`.",
    )
    strictly_specified_environments: bool = Field(
        False,
        description="If true, data points will only be retained if all of the "
        "components in the measured system strictly contain only the specified "
        "environments and no others. This option is mutually exclusive with "
        "`at_least_one_environment`.",
    )

    @validator("per_component_environments")
    def _validate_per_component_environments(cls, value):
        """Check that each key (a number of components) maps to exactly that many
        lists of environments.

        A ``ValueError`` is raised rather than using ``assert`` so that the check
        is not stripped when python is run with the ``-O`` flag.
        """

        if value is None:
            return value

        for n_components, component_environments in value.items():

            if len(component_environments) != n_components:

                raise ValueError(
                    f"{len(component_environments)} lists of environments were "
                    f"provided for systems with {n_components} components"
                )

        return value

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        """Check that exactly one matching mode is enabled and that exactly one
        source of environments was provided."""

        at_least_one_environment = values.get("at_least_one_environment")
        strictly_specified_environments = values.get("strictly_specified_environments")

        # Neither mode is enabled.
        if (
            at_least_one_environment is not True
            and strictly_specified_environments is not True
        ):
            raise ValueError(
                "one of `at_least_one_environment` or "
                "`strictly_specified_environments` must be true"
            )

        # Neither mode is disabled, i.e. both were requested at once.
        if (
            at_least_one_environment is not False
            and strictly_specified_environments is not False
        ):
            raise ValueError(
                "`at_least_one_environment` and `strictly_specified_environments` "
                "are mutually exclusive"
            )

        per_component_environments = values.get("per_component_environments")
        environments = values.get("environments")

        if per_component_environments is None and environments is None:

            raise ValueError(
                "one of `per_component_environments` or `environments` must be "
                "specified"
            )

        if per_component_environments is not None and environments is not None:

            raise ValueError(
                "`per_component_environments` and `environments` are mutually "
                "exclusive"
            )

        return values


class FilterByEnvironments(CurationComponent):
    """A component which retains only those measurements of a data set which were
    made for substances whose components contain specific chemical environments.
    """

    @classmethod
    def _find_environments_per_component(cls, data_row: pandas.Series):
        """Run ``analyse_functional_groups`` on each component of ``data_row``.

        Returns
        -------
            One result per component, in component order, or ``None`` (after
            logging a message) if the analysis returned ``None`` for any of them.
        """

        component_smiles = [
            data_row[f"Component {number}"]
            for number in range(1, data_row["N Components"] + 1)
        ]
        component_moieties = list(map(analyse_functional_groups, component_smiles))

        if all(moieties is not None for moieties in component_moieties):
            return component_moieties

        logger.info(
            f"Checkmol was unable to parse the system with components="
            f"{component_smiles} and so this data point was discarded."
        )

        return None

    @classmethod
    def _is_match(cls, component_environments, environments_to_match, schema):
        """Test the environments found in a single component against those which
        were requested.

        When ``schema.strictly_specified_environments`` is set every found
        environment must be one of ``environments_to_match``, otherwise a single
        one being present is sufficient.
        """

        quantifier = any

        if schema.strictly_specified_environments:
            quantifier = all

        return quantifier(
            found in environments_to_match for found in component_environments
        )

    @classmethod
    def _filter_by_environments(cls, data_row, schema: FilterByEnvironmentsSchema):
        """Return whether every component of ``data_row`` matches
        ``schema.environments``."""

        found_per_component = cls._find_environments_per_component(data_row)

        if found_per_component is None:
            return False

        for found_environments in found_per_component:

            if not cls._is_match(found_environments, schema.environments, schema):
                return False

        return True

    @classmethod
    def _filter_by_per_component(cls, data_row, schema: FilterByEnvironmentsSchema):
        """Return whether the components of ``data_row`` can be paired one-to-one
        with the lists of environments which the schema specifies for systems with
        this number of components, such that every pair is a match.
        """

        n_components = data_row["N Components"]
        requested = schema.per_component_environments

        # No filter was specified for this number of components.
        if requested is not None and n_components not in requested:
            return True

        found_per_component = cls._find_environments_per_component(data_row)

        if found_per_component is None:
            return False

        # Entry [i, j] is non-zero when component i matches the j'th list of
        # requested environments.
        match_matrix = numpy.zeros((n_components, n_components))

        for row, found_environments in enumerate(found_per_component):

            # noinspection PyUnresolvedReferences
            for column, wanted_environments in enumerate(requested[n_components]):

                match_matrix[row, column] = cls._is_match(
                    found_environments, wanted_environments, schema
                )

        # Pick the pairing which maximises the number of matched pairs, and then
        # require that every pair in it is a match.
        rows, columns = linear_sum_assignment(match_matrix, maximize=True)
        assigned = match_matrix[rows, columns]

        return numpy.all(assigned > 0)

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByEnvironmentsSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Return the rows of ``data_frame`` which pass whichever of the two filters
        the schema selects. ``n_processes`` is not used by this filter.
        """

        row_filter = (
            cls._filter_by_environments
            if schema.environments is not None
            else cls._filter_by_per_component
        )
        should_retain = data_frame.apply(
            functools.partial(row_filter, schema=schema), axis=1
        )

        # noinspection PyTypeChecker
        return data_frame[should_retain]


# Type alias which matches any one of the filter component schemas listed below.
FilterComponentSchema = Union[
    FilterDuplicatesSchema,
    FilterByTemperatureSchema,
    FilterByPressureSchema,
    FilterByMoleFractionSchema,
    FilterByRacemicSchema,
    FilterByElementsSchema,
    FilterByPropertyTypesSchema,
    FilterByStereochemistrySchema,
    FilterByChargedSchema,
    FilterByIonicLiquidSchema,
    FilterBySmilesSchema,
    FilterBySmirksSchema,
    FilterByNComponentsSchema,
    FilterBySubstancesSchema,
    FilterByEnvironmentsSchema,
]