"""
File containing the filters workflow components.
"""
from typing import Dict, List, Optional, Set, Union
from openff.toolkit.topology import Molecule
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.utils import (
OpenEyeToolkitWrapper,
RDKitToolkitWrapper,
ToolkitRegistry,
)
from openff.units import unit
from rdkit.Chem.rdMolAlign import AlignMol
from typing_extensions import Literal
from openff.qcsubmit._pydantic import Field, root_validator, validator
from openff.qcsubmit.common_structures import ComponentProperties
from openff.qcsubmit.validators import (
SYMBOLS_TO_ELEMENTS,
check_allowed_elements,
check_environments,
)
from openff.qcsubmit.workflow_components.base_component import (
CustomWorkflowComponent,
ToolkitValidator,
)
from openff.qcsubmit.workflow_components.utils import ComponentResult
class MolecularWeightFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on the minimum and maximum allowed molecular weights.

    Note:
        Both bounds are exclusive, a molecule only passes when
        ``minimum_weight < weight < maximum_weight``.
    """

    type: Literal["MolecularWeightFilter"] = "MolecularWeightFilter"
    minimum_weight: int = Field(
        130,
        description="The minimum allowed molecule weight default value taken from the openeye blockbuster filter",
    )
    maximum_weight: int = Field(
        781,
        description="The maximum allow molecule weight, default taken from the openeye blockbuster filter.",
    )

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return "Molecules are filtered based on the allowed molecular weights."

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "Molecule weight was not in the specified region."

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        The common entry point of all workflow components which applies the workflow component to the given list of
        molecules.

        Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.

        Raises:
            ModuleNotFoundError: If neither an OpenEye nor an RDKit wrapper is registered in ``toolkit_registry``.
        """
        result = self._create_result(toolkit_registry=toolkit_registry)

        # work out the weight function, the first supported toolkit in registry order wins
        for toolkit in toolkit_registry.registered_toolkits:
            if isinstance(toolkit, OpenEyeToolkitWrapper):
                weight_func = self._get_openeye_weight
                break
            elif isinstance(toolkit, RDKitToolkitWrapper):
                weight_func = self._get_rdkit_weight
                break
        else:
            # the loop finished without a break so no supported toolkit was found
            raise ModuleNotFoundError(
                "Either openeye or rdkit must be registered with the toolkit registry to use "
                "the weight filter."
            )

        for molecule in molecules:
            total_weight = weight_func(molecule)
            if self.minimum_weight < total_weight < self.maximum_weight:
                result.add_molecule(molecule)
            else:
                result.filter_molecule(molecule)

        return result

    def _get_rdkit_weight(self, molecule: Molecule) -> float:
        """
        Calculate the weight of the molecule using rdkit.
        """
        from rdkit.Chem import Descriptors

        # NOTE(review): `ExactMolWt` and `OECalculateMolecularWeight` may not use the same mass
        # convention - confirm that both backends give comparable values for the same molecule.
        return Descriptors.ExactMolWt(molecule.to_rdkit())

    def _get_openeye_weight(self, molecule: Molecule) -> float:
        """
        Calculate the weight of the molecule using openeye.
        """
        from openeye import oechem

        return oechem.OECalculateMolecularWeight(molecule.to_openeye())
class ElementFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filter the molecules based on a list of allowed elements.

    Note:
        The `allowed_elements` attribute can take a list of either symbols or atomic numbers and will resolve them to a
        common internal format as required.

    Example:
        Using atomic symbols or atomic numbers in components.

        ```python
        >>> from openff.qcsubmit.workflow_components import ElementFilter
        >>> efil = ElementFilter()
        # set the allowed elements to H,C,N,O
        >>> efil.allowed_elements = ['H', 'C', 'N', 'O']
        >>> efil.allowed_elements = [1, 6, 7, 8]

        ```
    """

    type: Literal["ElementFilter"] = "ElementFilter"
    allowed_elements: List[Union[int, str]] = Field(
        [
            "H",
            "C",
            "N",
            "O",
            "F",
            "P",
            "S",
            "Cl",
            "Br",
            "I",
        ],
        description="The list of allowed elements as symbols or atomic number ints.",
    )

    _check_elements = validator("allowed_elements", each_item=True, allow_reuse=True)(
        check_allowed_elements
    )

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return (
            "Filter out molecules who contain elements not in the allowed element list."
        )

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "Molecules contained elements not in the allowed elements list."

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        """
        Resolve the allowed elements once and cache them for use in `_apply`.

        Symbols found in ``SYMBOLS_TO_ELEMENTS`` are replaced by the mapped value, any other entry is kept as given.
        """
        self._cache["elements"]: List[Union[str, int]] = [
            SYMBOLS_TO_ELEMENTS.get(element, element)
            for element in self.allowed_elements
        ]

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        The common entry point of all workflow components which applies the workflow component to the given list of
        molecules.

        Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """
        result = self._create_result(toolkit_registry=toolkit_registry)

        # the resolved elements were cached by `_apply_init`; build the set once so that each
        # per-atom membership test does not rescan a list
        _allowed_elements = set(self._cache["elements"])

        # now apply the filter, a single disallowed atom is enough to remove the molecule
        for molecule in molecules:
            if all(atom.atomic_number in _allowed_elements for atom in molecule.atoms):
                result.add_molecule(molecule)
            else:
                result.filter_molecule(molecule)

        return result

    def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict:
        """
        Generate version information for all of the software used during the running of this component.

        Returns:
            A dictionary of all of the software used in the component along with their version numbers.

        Note:
            The installed `openff.units` version is recorded under the ``openff-units_elements`` key.
        """
        import openff.units

        provenance = super().provenance(toolkit_registry=toolkit_registry)
        provenance["openff-units_elements"] = openff.units.__version__

        return provenance
class CoverageFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on the requested force field parameter ids.

    Note:
        * The options ``allowed_ids`` and ``filtered_ids`` are mutually exclusive.
    """

    type: Literal["CoverageFilter"] = "CoverageFilter"
    allowed_ids: Optional[Set[str]] = Field(
        None,
        description="The SMIRKS parameter ids of the parameters which are allowed to be exercised by the molecules. "
        "Molecules should use at least one of these ids to be passed by the component.",
    )
    filtered_ids: Optional[Set[str]] = Field(
        None,
        description="The SMIRKS parameter ids of the parameters which are not allowed to be exercised by the molecules.",
    )
    forcefield: str = Field(
        "openff_unconstrained-1.0.0.offxml",
        description="The name of the force field which we want to filter against.",
    )

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return "Filter the molecules based on the requested FF allowed parameters."

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "The molecule was typed with disallowed parameters."

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        """
        Ensure that exactly one of ``allowed_ids`` and ``filtered_ids`` was supplied.

        Raises:
            ValueError: If both or neither of the options are set.
        """
        ids_to_include = values.get("allowed_ids")
        ids_to_exclude = values.get("filtered_ids")

        # an explicit raise is used rather than `assert` so the check survives `python -O`
        if (ids_to_include is None) == (ids_to_exclude is None):
            raise ValueError(
                "exactly one of `allowed_ids` and `filtered_ids` must be specified."
            )

        return values

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        """Load the requested force field once and cache it for use in `_apply`."""
        self._cache["forcefield"] = ForceField(self.forcefield)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Apply the filter to the list of molecules to remove any molecules typed by an id that is not allowed, i.e. not
        included in the allowed list.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """
        result = self._create_result(toolkit_registry=toolkit_registry)

        forcefield: ForceField = self._cache["forcefield"]

        # type the molecules
        for molecule in molecules:
            labels = forcefield.label_molecules(molecule.to_topology())[0]
            # collect the id of every parameter applied to the molecule into a set
            covered_types = {
                label.id for types in labels.values() for label in types.values()
            }

            # use set intersection to check coverage for unwanted and wanted types
            unwanted_types = covered_types.intersection(self.filtered_ids or set())
            common_types = covered_types.intersection(self.allowed_ids or set())

            if self.filtered_ids is not None and unwanted_types:
                # the molecule has an unwanted parameter id
                result.filter_molecule(molecule=molecule)
            elif self.allowed_ids is not None and not common_types:
                # the molecule does not contain the wanted parameter id
                result.filter_molecule(molecule=molecule)
            else:
                # the molecule contains a wanted or does not contain a filtered parameter id
                result.add_molecule(molecule=molecule)

        return result

    def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict:
        """
        Generate version information for all of the software used during the running of this component.

        Returns:
            A dictionary of all of the software used in the component along with their version numbers.
        """
        import openforcefields

        provenance = super().provenance(toolkit_registry=toolkit_registry)
        provenance["openforcefields"] = openforcefields.__version__

        return provenance
class RotorFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on the maximum and or minimum allowed number of rotatable bonds.

    Note:
        Rotatable bonds are torsions found using the `find_rotatable_bonds` method of the
        openforcefield.topology.Molecule class.
    """

    type: Literal["RotorFilter"] = "RotorFilter"
    maximum_rotors: Optional[int] = Field(
        4,
        description="The maximum number of rotatable bonds allowed in the molecule, if `None` the molecule has no maximum limit on rotatable bonds.",
    )
    minimum_rotors: Optional[int] = Field(
        None,
        description="The minimum number of rotatble bonds allowed in the molecule, if `None` the molecule has no limit to the minimum number of rotatble bonds.",
    )

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return "Filter the molecules based on the maximum number of allowed rotatable bonds."

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "The molecule has too many rotatable bonds."

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        """
        Validate the choice of minimum and maximum rotors.

        Raises:
            ValueError: If both limits are set and the maximum is smaller than the minimum.
        """
        # compare against `None` explicitly, a limit of 0 is a valid setting and must not be
        # mistaken for "no limit"
        if self.maximum_rotors is not None and self.minimum_rotors is not None:
            if self.maximum_rotors < self.minimum_rotors:
                raise ValueError(
                    "The maximum number of rotors should >= the minimum to ensure some molecules pass."
                )

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Apply the filter to the list of molecules to remove any molecules with more rotors than the maximum allowed
        number or fewer rotors than the minimum allowed number.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """
        # create the return
        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            # compute the rotatable bonds once and only check the fail conditions
            n_rotors = len(
                molecule.find_rotatable_bonds(toolkit_registry=toolkit_registry)
            )

            # `is not None` keeps a limit of 0 active, e.g. `maximum_rotors=0` only passes rigid molecules
            too_many = self.maximum_rotors is not None and n_rotors > self.maximum_rotors
            too_few = self.minimum_rotors is not None and n_rotors < self.minimum_rotors

            if too_many or too_few:
                result.filter_molecule(molecule)
            else:
                result.add_molecule(molecule)

        return result
class SmartsFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filters molecules based on if they contain certain smarts substructures.

    Note:
        * The smarts tags used for filtering should be numerically tagged in order to work with the toolkit.
        * The options ``allowed_substructures`` and ``filtered_substructures`` are mutually exclusive.
    """

    type: Literal["SmartsFilter"] = "SmartsFilter"
    allowed_substructures: Optional[List[str]] = Field(
        None,
        description="The list of allowed substructures which should be tagged with indices.",
    )
    filtered_substructures: Optional[List[str]] = Field(
        None, description="The list of substructures which should be filtered."
    )

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return "Filter molecules based on the given smarts patterns."

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "The molecule did/didn't contain the given smarts patterns."

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    _check_smarts = validator(
        "allowed_substructures",
        "filtered_substructures",
        each_item=True,
        allow_reuse=True,
    )(check_environments)

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        """
        Ensure that exactly one of ``allowed_substructures`` and ``filtered_substructures`` was supplied.

        Raises:
            ValueError: If both or neither of the options are set.
        """
        allowed_substructures = values.get("allowed_substructures")
        filtered_substructures = values.get("filtered_substructures")

        # an explicit raise is used rather than `assert` so the check survives `python -O`
        if (allowed_substructures is None) == (filtered_substructures is None):
            raise ValueError(
                "exactly one of `allowed_substructures` and `filtered_substructures` must be specified."
            )

        return values

    @staticmethod
    def _matches_any(
        molecule: Molecule, patterns: List[str], toolkit_registry: ToolkitRegistry
    ) -> bool:
        """Return `True` as soon as one of the patterns has a match in the molecule."""
        return any(
            molecule.chemical_environment_matches(
                query=pattern, toolkit_registry=toolkit_registry
            )
            for pattern in patterns
        )

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Apply the filter to the input list of molecules removing those that match the filtered set or do not contain an
        allowed substructure.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """
        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            if self.allowed_substructures is not None:
                # keep the molecule only if it contains at least one allowed substructure
                keep = self._matches_any(
                    molecule, self.allowed_substructures, toolkit_registry
                )
            elif self.filtered_substructures is not None:
                # keep the molecule only if it contains none of the filtered substructures
                keep = not self._matches_any(
                    molecule, self.filtered_substructures, toolkit_registry
                )
            else:
                # neither option is set, the molecule is neither kept nor filtered
                continue

            if keep:
                result.add_molecule(molecule=molecule)
            else:
                result.filter_molecule(molecule=molecule)

        return result
class ScanFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    A filter to remove/include molecules from the workflow who have scans targeting the specified SMARTS.

    Important:
        Currently only checks against 1D scans.
    """

    type: Literal["ScanFilter"] = "ScanFilter"
    scans_to_include: Optional[List[str]] = Field(
        None,
        description="Only molecules with SCANs covering these SMARTs "
        "patterns should be kept. This option is mutually "
        "exclusive with ``scans_to_exclude``.",
    )
    scans_to_exclude: Optional[List[str]] = Field(
        None,
        description="Any molecules with scans covering these SMARTs will "
        "be removed from the dataset. This option is mutually "
        "exclusive with ``scans_to_include``.",
    )

    _check_smarts = validator(
        "scans_to_include", "scans_to_exclude", each_item=True, allow_reuse=True
    )(check_environments)

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return "Filter molecules who have the desired/unwanted scans."

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "The molecule contained an unwanted or did not contain a desired dihedral/improper scan."

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    @root_validator
    def _validate_mutally_exclusive(cls, values):
        """
        Ensure that exactly one of ``scans_to_include`` and ``scans_to_exclude`` was supplied.

        Raises:
            ValueError: If both or neither of the options are set.
        """
        scans_to_include = values.get("scans_to_include")
        scans_to_exclude = values.get("scans_to_exclude")

        # an explicit raise is used rather than `assert` so the check survives `python -O`
        if (scans_to_include is None) == (scans_to_exclude is None):
            raise ValueError(
                "exactly one of `scans_to_include` and `scans_to_exclude` must be specified"
            )

        return values

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Keep or remove scans based on the list of torsions to include or remove.

        Molecules with no tagged dihedrals, or with no dihedrals left after the scans have been pruned, are filtered.
        The torsion indexer stored in the ``dihedrals`` property of each molecule is modified in place.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A ComponentResult holding the molecules which passed and those which were filtered.
        """
        result = self._create_result(toolkit_registry=toolkit_registry)

        # select on `None` rather than truthiness so an empty exclude list is not replaced by a
        # `None` include list; fall back to no patterns if neither option is set
        if self.scans_to_exclude is not None:
            target_environments = self.scans_to_exclude
        else:
            target_environments = self.scans_to_include or []

        for molecule in molecules:
            torsion_indexer = molecule.properties.get("dihedrals", None)

            # if no dihedrals are tagged remove the molecule
            if torsion_indexer is None or torsion_indexer.n_torsions == 0:
                result.filter_molecule(molecule=molecule)
                continue

            all_matches = set()
            for env in target_environments:
                # get all matches as a list of sorted central bonds as they are stored this way
                matches = molecule.chemical_environment_matches(
                    query=env, toolkit_registry=toolkit_registry
                )
                for match in matches:
                    # a two atom match is used as the bond directly, otherwise the two middle atoms are taken
                    central_bond = match if len(match) == 2 else match[1:3]
                    all_matches.add(tuple(sorted(central_bond)))

            # now we either remove any torsions in this list or any missing from it based on include/exclude
            if self.scans_to_include is not None:
                to_remove = [
                    center_bond
                    for center_bond in torsion_indexer.torsions
                    if center_bond not in all_matches
                ]
            else:
                to_remove = [
                    center_bond
                    for center_bond in all_matches
                    if center_bond in torsion_indexer.torsions
                ]

            # now remove, the keys were collected first so the mapping is not changed while iterating it
            for bond in to_remove:
                del torsion_indexer.torsions[bond]

            # if we have no torsions left filter the molecule, otherwise keep it; the two
            # outcomes are exclusive so a molecule is never reported as both
            if not torsion_indexer.get_dihedrals:
                result.filter_molecule(molecule)
            else:
                result.add_molecule(molecule)

        return result
class ChargeFilter(ToolkitValidator, CustomWorkflowComponent):
    """
    Filter molecules if their formal charge is not in the `charges_to_include` list or is in the `charges_to_exclude` list.
    """

    type: Literal["ChargeFilter"] = "ChargeFilter"
    charges_to_include: Optional[List[int]] = Field(
        None,
        description="The list of net molecule formal charges which are allowed in the dataset. "
        "This option is mutually exclusive with ``charges_to_exclude``.",
    )
    charges_to_exclude: Optional[List[int]] = Field(
        None,
        description="The list of net molecule formal charges which are to be removed from the dataset. "
        "This option is mutually exclusive with ``charges_to_include``.",
    )

    @classmethod
    def description(cls) -> str:
        """Return a short human readable summary of what the component does."""
        return "Filter molecules by net formal charge."

    @classmethod
    def fail_reason(cls) -> str:
        """Return the reason recorded for molecules removed by this component."""
        return "The molecules net formal charge was not requested or was in the `charges_to_exclude`."

    @classmethod
    def properties(cls) -> ComponentProperties:
        """Return the runtime properties declared by this component."""
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

    @root_validator
    def _validate_mutually_exclusive(cls, values):
        """
        Ensure that exactly one of ``charges_to_include`` and ``charges_to_exclude`` was supplied.

        Raises:
            ValueError: If both or neither of the options are set.
        """
        charges_to_include = values.get("charges_to_include")
        charges_to_exclude = values.get("charges_to_exclude")

        # an explicit raise is used rather than `assert` so the check survives `python -O`
        if (charges_to_include is None) == (charges_to_exclude is None):
            raise ValueError(
                "exactly one of `charges_to_include` and `charges_to_exclude` must be specified."
            )

        return values

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Filter molecules based on their net formal charge.

        Args:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A ComponentResult holding the molecules which passed and those which were filtered.
        """
        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            # express the total charge as a plain number in units of elementary charge
            total_charge = molecule.total_charge.m_as(unit.elementary_charge)

            not_requested = (
                self.charges_to_include is not None
                and total_charge not in self.charges_to_include
            )
            excluded = (
                self.charges_to_exclude is not None
                and total_charge in self.charges_to_exclude
            )

            if not_requested or excluded:
                result.filter_molecule(molecule=molecule)
            else:
                result.add_molecule(molecule)

        return result