Source code for openff.qcsubmit.workflow_components.filters

"""
File containing the filters workflow components.
"""

from typing import Dict, List, Optional, Set, Union

from openff.toolkit.topology import Molecule
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.utils import (
    OpenEyeToolkitWrapper,
    RDKitToolkitWrapper,
    ToolkitRegistry,
)
from openff.units import unit
from rdkit.Chem.rdMolAlign import AlignMol
from typing_extensions import Literal

from openff.qcsubmit._pydantic import Field, root_validator, validator
from openff.qcsubmit.common_structures import ComponentProperties
from openff.qcsubmit.validators import (
    SYMBOLS_TO_ELEMENTS,
    check_allowed_elements,
    check_environments,
)
from openff.qcsubmit.workflow_components.base_component import (
    CustomWorkflowComponent,
    ToolkitValidator,
)
from openff.qcsubmit.workflow_components.utils import ComponentResult


[docs]class MolecularWeightFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on the minimum and maximum allowed molecular weights.
    """

    type: Literal["MolecularWeightFilter"] = "MolecularWeightFilter"
    minimum_weight: int = Field(
        130,
        description="The minimum allowed molecule weight  default value taken from the openeye blockbuster filter",
    )
    maximum_weight: int = Field(
        781,
        description="The maximum allow molecule weight, default taken from the openeye blockbuster filter.",
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Molecules are filtered based on the allowed molecular weights."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "Molecule weight was not in the specified region."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        The common entry point of all workflow components which applies the workflow component to the given list of
        molecules.

        Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)
        # work out the weight function
        for toolkit in toolkit_registry.registered_toolkits:
            if isinstance(toolkit, OpenEyeToolkitWrapper):
                weight_func = self._get_openeye_weight
                break
            elif isinstance(toolkit, RDKitToolkitWrapper):
                weight_func = self._get_rdkit_weight
                break
        else:
            raise ModuleNotFoundError(
                "Either openeye or rdkit must be registered with the toolkit registry to use"
                "the weight filter."
            )

        for molecule in molecules:
            total_weight = weight_func(molecule)

            if self.minimum_weight < total_weight < self.maximum_weight:
                result.add_molecule(molecule)
            else:
                result.filter_molecule(molecule)

        return result

    def _get_rdkit_weight(self, molecule: Molecule) -> float:
        """
        Calculate the weight of the molecule using rdkit.
        """
        from rdkit.Chem import Descriptors

        return Descriptors.ExactMolWt(molecule.to_rdkit())

    def _get_openeye_weight(self, molecule: Molecule) -> float:
        """
        Calculate the weight of the molecule using openeye.
        """
        from openeye import oechem

        return oechem.OECalculateMolecularWeight(molecule.to_openeye())


[docs]class ElementFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filter the molecules based on a list of allowed elements.

    Note:
        The `allowed_elements` attribute can take a list of either symbols or atomic numbers and will resolve them to a
        common internal format as required.

    Example:
        Using atomic symbols or atomic numbers in components.

        ```python
        >>> from openff.qcsubmit.workflow_components import ElementFilter
        >>> efil = ElementFilter()
        # set the allowed elements to H,C,N,O
        >>> efil.allowed_elements = ['H', 'C', 'N', 'O']
        >>> efil.allowed_elements = [1, 6, 7, 8]
        ```
    """

    type: Literal["ElementFilter"] = "ElementFilter"
    allowed_elements: List[Union[int, str]] = Field(
        [
            "H",
            "C",
            "N",
            "O",
            "F",
            "P",
            "S",
            "Cl",
            "Br",
            "I",
        ],
        description="The list of allowed elements as symbols or atomic number ints.",
    )
    _check_elements = validator("allowed_elements", each_item=True, allow_reuse=True)(
        check_allowed_elements
    )

[docs]    @classmethod
    def description(cls) -> str:
        return (
            "Filter out molecules who contain elements not in the allowed element list."
        )

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "Molecules contained elements not in the allowed elements list."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        self._cache["elements"]: list[Union[str, int]] = [
            SYMBOLS_TO_ELEMENTS.get(element, element)
            for element in self.allowed_elements
        ]

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        The common entry point of all workflow components which applies the workflow component to the given list of
        molecules.

        Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        # First lets convert the allowed_elements list to ints as this is what is stored in the atom object
        _allowed_elements = self._cache["elements"]

        # now apply the filter
        for molecule in molecules:
            for atom in molecule.atoms:
                if atom.atomic_number not in _allowed_elements:
                    result.filter_molecule(molecule)
                    break
            else:
                result.add_molecule(molecule)

        return result

[docs]    def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict:
        """
        Generate version information for all of the software used during the running of this component.

        Returns:
            A dictionary of all of the software used in the component along wither their version numbers.

        Note:
            The element class in OpenMM is used to match the elements so the OpenMM version is given.
        """

        import openff.units

        provenance = super().provenance(toolkit_registry=toolkit_registry)
        provenance["openff-units_elements"] = openff.units.__version__

        return provenance


[docs]class CoverageFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on the requested force field parameter ids.

    Note:
        * The options ``allowed_ids`` and ``filtered_ids`` are mutually exclusive.

    """

    type: Literal["CoverageFilter"] = "CoverageFilter"
    allowed_ids: Optional[Set[str]] = Field(
        None,
        description="The SMIRKS parameter ids of the parameters which are allowed to be exercised by the molecules. "
        "Molecules should use at least one of these ids to be passed by the component.",
    )
    filtered_ids: Optional[Set[str]] = Field(
        None,
        description="The SMIRKS parameter ids of the parameters which are not allowed to be exercised by the molecules.",
    )
    forcefield: str = Field(
        "openff_unconstrained-1.0.0.offxml",
        description="The name of the force field which we want to filter against.",
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Filter the molecules based on the requested FF allowed parameters."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecule was typed with disallowed parameters."

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        ids_to_include = values.get("allowed_ids")
        ids_to_exclude = values.get("filtered_ids")

        message = "exactly one of ``allowed_ids` and `filtered_ids` must specified."

        assert ids_to_include is not None or ids_to_exclude is not None, message
        assert ids_to_include is None or ids_to_exclude is None, message

        return values

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        self._cache["forcefield"] = ForceField(self.forcefield)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Apply the filter to the list of molecules to remove any molecules typed by an id that is not allowed, i.e. not
        included in the allowed list.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        forcefield: ForceField = self._cache["forcefield"]

        # type the molecules
        for molecule in molecules:
            labels = forcefield.label_molecules(molecule.to_topology())[0]
            # format the labels into a set
            covered_types = set(
                [label.id for types in labels.values() for label in types.values()]
            )

            # use set intersection to check coverage for unwanted and wanted types
            unwanted_types = covered_types.intersection(self.filtered_ids or set())
            common_types = covered_types.intersection(self.allowed_ids or set())

            if self.filtered_ids is not None and unwanted_types:
                # the molecule has an unwanted parameter id
                result.filter_molecule(molecule=molecule)
            elif self.allowed_ids is not None and not common_types:
                # the molecule does not contain the wanted parameter id
                result.filter_molecule(molecule=molecule)
            else:
                # the molecule contains a wanted or does not contain a filtered parameter id
                result.add_molecule(molecule=molecule)

        return result

[docs]    def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict:
        """
        Generate version information for all of the software used during the running of this component.

        Returns:
            A dictionary of all of the software used in the component along wither their version numbers.
        """
        import openforcefields

        provenance = super().provenance(toolkit_registry=toolkit_registry)
        provenance["openforcefields"] = openforcefields.__version__

        return provenance


[docs]class RotorFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on the maximum and or minimum allowed number of rotatable bonds.

    Note:
        Rotatable bonds are torsions found using the `find_rotatable_bonds` method of the
        openforcefield.topology.Molecule class.
    """

    type: Literal["RotorFilter"] = "RotorFilter"
    maximum_rotors: Optional[int] = Field(
        4,
        description="The maximum number of rotatable bonds allowed in the molecule, if `None` the molecule has no maximum limit on rotatable bonds.",
    )
    minimum_rotors: Optional[int] = Field(
        None,
        description="The minimum number of rotatble bonds allowed in the molecule, if `None` the molecule has no limit to the minimum number of rotatble bonds.",
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Filter the molecules based on the maximum number of allowed rotatable bonds."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecule has too many rotatable bonds."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        """
        Validate the choice of minimum and maximum rotators.
        """
        if self.maximum_rotors and self.minimum_rotors:
            if self.maximum_rotors < self.minimum_rotors:
                raise ValueError(
                    "The maximum number of rotors should >= the minimum to ensure some molecules pass."
                )

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Apply the filter to the list of molecules to remove any molecules with more rotors then the maximum allowed
        number.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        # create the return
        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            # cache the rotatable bonds calc and only check fail conditions
            rotatable_bonds = molecule.find_rotatable_bonds(
                toolkit_registry=toolkit_registry
            )
            if self.maximum_rotors and len(rotatable_bonds) > self.maximum_rotors:
                result.filter_molecule(molecule)
            elif self.minimum_rotors and len(rotatable_bonds) < self.minimum_rotors:
                result.filter_molecule(molecule)
            else:
                result.add_molecule(molecule)

        return result


[docs]class SmartsFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on if they contain certain smarts substructures.

    Note:
        * The smarts tags used for filtering should be numerically tagged in order to work with the toolkit.
        * The options ``allowed_substructures`` and ``filtered_substructures`` are mutually exclusive.
    """

    type: Literal["SmartsFilter"] = "SmartsFilter"
    allowed_substructures: Optional[List[str]] = Field(
        None,
        description="The list of allowed substructures which should be tagged with indices.",
    )
    filtered_substructures: Optional[List[str]] = Field(
        None, description="The list of substructures which should be filtered."
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Filter molecules based on the given smarts patterns."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecule did/didn't contain the given smarts patterns."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    _check_smarts = validator(
        "allowed_substructures",
        "filtered_substructures",
        each_item=True,
        allow_reuse=True,
    )(check_environments)

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        allowed_substructures = values.get("allowed_substructures")
        filtered_substructures = values.get("filtered_substructures")

        message = "exactly one of ``allowed_substructures` and `filtered_substructures` must specified."

        assert (
            allowed_substructures is not None or filtered_substructures is not None
        ), message
        assert allowed_substructures is None or filtered_substructures is None, message

        return values

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Apply the filter to the input list of molecules removing those that match the filtered set or do not contain an
        allowed substructure.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            if self.allowed_substructures is not None:
                for substructure in self.allowed_substructures:
                    if molecule.chemical_environment_matches(
                        query=substructure, toolkit_registry=toolkit_registry
                    ):
                        result.add_molecule(molecule=molecule)
                        break
                else:
                    # the molecule does not contain the allowed substructure so remove it
                    result.filter_molecule(molecule=molecule)

            elif self.filtered_substructures is not None:
                for substructure in self.filtered_substructures:
                    if molecule.chemical_environment_matches(
                        query=substructure, toolkit_registry=toolkit_registry
                    ):
                        result.filter_molecule(molecule=molecule)
                        break
                else:
                    # there was no filtered substructure so keep the molecule
                    result.add_molecule(molecule=molecule)

        return result


[docs]class RMSDCutoffConformerFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Prunes conformers from a molecule that are less than a specified RMSD from
    all other conformers
    """

    # standard components which must be defined
    type: Literal["RMSDCutoffConformerFilter"] = "RMSDCutoffConformerFilter"
    # custom components for this class
    cutoff: float = Field(-1.0, description="The RMSD cut off in angstroms.")

[docs]    @classmethod
    def description(cls) -> str:
        return "Filter conformations for the given molecules using a RMSD cutoff."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "Could not filter the conformers using RMSD."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _prune_conformers(self, molecule: Molecule) -> None:
        no_conformers: int = molecule.n_conformers

        # This will be used to determined whether it should be pruned
        # from the RMSD calculations. If we find it should be pruned
        # just once, it is sufficient to avoid it later in the pairwise
        # processing.
        uniq: List = list([True] * no_conformers)

        # Needed to get the aligned best-fit RMSD
        rdmol = molecule.to_rdkit()

        rmsd = []
        # This begins the pairwise RMSD pruner
        if no_conformers > 1 and self.cutoff >= 0.0:
            # The reference conformer for RMSD calculation
            for j in range(no_conformers - 1):
                # A previous loop has determine this specific conformer
                # is too close to another, so we can entirely skip it
                if not uniq[j]:
                    continue

                # since k starts from j+1, we are only looking at the
                # upper triangle of the comparisons (j < k)
                for k in range(j + 1, no_conformers):
                    rmsd_i = AlignMol(rdmol, rdmol, k, j)
                    rmsd.append(rmsd_i)

                    # Flag this conformer for pruning, and also
                    # prevent it from being used as a reference in the
                    # future comparisons
                    if rmsd_i < self.cutoff:
                        uniq[k] = False

            confs = [
                molecule.conformers[j] for j, add_bool in enumerate(uniq) if add_bool
            ]

            molecule._conformers = confs.copy()

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Prunes conformers from a molecule that are less than a specified RMSD from
        all other conformers

        Args:
            molecules: The list of molecules the component should be applied on.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            if molecule.n_conformers == 0:
                result.filter_molecule(molecule)
            else:
                self._prune_conformers(molecule)
                result.add_molecule(molecule)

        return result


[docs]class ScanFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    A filter to remove/include molecules from the workflow who have scans targeting the specified SMARTS.

    Important:
        Currently only checks against 1D scans.
    """

    type: Literal["ScanFilter"] = "ScanFilter"
    scans_to_include: Optional[List[str]] = Field(
        None,
        description="Only molecules with SCANs covering these SMARTs"
        "patterns should be kept. This option is mutually"
        "exclusive with ``scans_to_exclude``.",
    )

    scans_to_exclude: Optional[List[str]] = Field(
        None,
        description="Any molecules with scans covering these SMARTs will"
        "be removed from the dataset. This option is mutally"
        "exclusive with ``scans_to_include``.",
    )

    _check_smarts = validator(
        "scans_to_include", "scans_to_exclude", each_item=True, allow_reuse=True
    )(check_environments)

[docs]    @classmethod
    def description(cls) -> str:
        return "Filter molecules who have the desired/unwanted scans."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecule contained an unwanted or did not contain a desired dihedral/improper scan."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    @root_validator
    def _validate_mutally_exclusive(cls, values):
        scans_to_include = values.get("scans_to_include")
        scans_to_exclude = values.get("scans_to_exclude")

        message = (
            "exactly one of `scans_to_include` and `scans_to_exclude` must be specified"
        )

        assert scans_to_include is not None or scans_to_exclude is not None, message
        assert scans_to_include is None or scans_to_exclude is None, message
        return values

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Keep or remove scans based on the list of torsions to include or remove.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        target_environments = self.scans_to_exclude or self.scans_to_include

        for molecule in molecules:
            torsion_indexer = molecule.properties.get("dihedrals", None)
            # if no dihedrals are tagged remove the molecule
            if torsion_indexer is None or torsion_indexer.n_torsions == 0:
                result.filter_molecule(molecule=molecule)
                continue

            all_matches = set()
            for env in target_environments:
                # get all matches as a list of sorted central bonds as they are stored this way
                matches = molecule.chemical_environment_matches(
                    query=env, toolkit_registry=toolkit_registry
                )
                for match in matches:
                    match = match if len(match) == 2 else match[1:3]
                    all_matches.add(tuple(sorted(match)))

            # now we either remove any torsions in this list or any missing from it based on include/exclude
            to_remove = []
            if self.scans_to_include is not None:
                for center_bond in torsion_indexer.torsions.keys():
                    if center_bond not in all_matches:
                        to_remove.append(center_bond)
            else:
                for center_bond in all_matches:
                    if center_bond in torsion_indexer.torsions.keys():
                        to_remove.append(center_bond)

            # now remove
            for bond in to_remove:
                del torsion_indexer.torsions[bond]

            # if we have no torsions left filter the molecule
            if not torsion_indexer.get_dihedrals:
                result.filter_molecule(molecule)

            result.add_molecule(molecule)

        return result


[docs]class ChargeFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filter molecules if their formal charge is not in the `charges_to_include` list or is in the `charges_to_exclude` list.
    """

    type: Literal["ChargeFilter"] = "ChargeFilter"

    charges_to_include: Optional[List[int]] = Field(
        None,
        description="The list of net molecule formal charges which are allowed in the dataset."
        "This option is mutually exclusive with ``charges_to_exclude``.",
    )
    charges_to_exclude: Optional[List[int]] = Field(
        None,
        description="The list of net molecule formal charges which are to be removed from the dataset."
        "This option is mutually exclusive with ``charges_to_include``.",
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Filter molecules by net formal charge."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecules net formal charge was not requested or was in the `charges_to_exclude`."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        charges_to_include = values.get("charges_to_include")
        charges_to_exclude = values.get("charges_to_exclude")

        message = "exactly one of ``charges_to_include` and `charges_to_exclude` must specified."

        assert charges_to_include is not None or charges_to_exclude is not None, message
        assert charges_to_include is None or charges_to_exclude is None, message

        return values

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Filter molecules based on their net formal charge
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            total_charge = molecule.total_charge.m_as(unit.elementary_charge)

            if (
                self.charges_to_include is not None
                and total_charge not in self.charges_to_include
            ) or (
                self.charges_to_exclude is not None
                and total_charge in self.charges_to_exclude
            ):
                result.filter_molecule(molecule=molecule)

            else:
                result.add_molecule(molecule)

        return result