File containing the filters workflow components.
from typing import Dict, List, Optional, Set, Union
from openff.toolkit.topology import Molecule
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.utils import (
from openff.units import unit
from rdkit.Chem.rdMolAlign import AlignMol
from typing_extensions import Literal
from openff.qcsubmit._pydantic import Field, root_validator, validator
from openff.qcsubmit.common_structures import ComponentProperties
from openff.qcsubmit.validators import (
from openff.qcsubmit.workflow_components.base_component import (
from openff.qcsubmit.workflow_components.utils import ComponentResult
# Workflow component that compares each molecule's weight against the open
# interval (minimum_weight, maximum_weight).
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class MolecularWeightFilter(ToolkitValidator, CustomWorkflowComponent):
Filters molecules based on the minimum and maximum allowed molecular weights.
type: Literal["MolecularWeightFilter"] = "MolecularWeightFilter"
# Lower bound used by the strict comparison in _apply; the default value is not
# visible in this listing.
minimum_weight: int = Field(
description="The minimum allowed molecule weight default value taken from the openeye blockbuster filter",
# Upper bound used by the strict comparison in _apply; the default value is not
# visible in this listing.
maximum_weight: int = Field(
description="The maximum allow molecule weight, default taken from the openeye blockbuster filter.",
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return "Molecules are filtered based on the allowed molecular weights."
# Human readable reason reported for molecules that this component rejects.
[docs] @classmethod
def fail_reason(cls) -> str:
return "Molecule weight was not in the specified region."
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Entry point: creates the result container, selects a weight function based on
# the registered toolkits and evaluates every input molecule with it.
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
The common entry point of all workflow components which applies the workflow component to the given list of
molecules: The list of molecules the component should be applied on.
toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.
A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
that passed and were filtered by the component and details about the component which generated the result.
result = self._create_result(toolkit_registry=toolkit_registry)
# work out the weight function
# An OpenEye wrapper maps to _get_openeye_weight, an RDKit wrapper to
# _get_rdkit_weight.
for toolkit in toolkit_registry.registered_toolkits:
if isinstance(toolkit, OpenEyeToolkitWrapper):
weight_func = self._get_openeye_weight
elif isinstance(toolkit, RDKitToolkitWrapper):
weight_func = self._get_rdkit_weight
# NOTE(review): as shown the raise looks unconditional; presumably it is the
# fallback for a registry with neither toolkit (loop exits are not visible) -
# confirm.
# NOTE(review): the two adjacent literals below join without a space, so the
# message reads "...to usethe weight filter."
raise ModuleNotFoundError(
"Either openeye or rdkit must be registered with the toolkit registry to use"
"the weight filter."
for molecule in molecules:
total_weight = weight_func(molecule)
# Both comparisons are strict, so a weight equal to either bound is outside
# the accepted range. The statements run for each outcome are not visible here.
if self.minimum_weight < total_weight < self.maximum_weight:
return result
# Weight via RDKit: converts the molecule with to_rdkit() and returns
# Descriptors.ExactMolWt of it. rdkit.Chem.Descriptors is imported locally.
# NOTE(review): ExactMolWt is, as far as I know, the monoisotopic mass while the
# OpenEye helper below presumably returns an average molecular weight - the two
# backends may therefore disagree slightly near the bounds; confirm.
def _get_rdkit_weight(self, molecule: Molecule) -> float:
Calculate the weight of the molecule using rdkit.
from rdkit.Chem import Descriptors
return Descriptors.ExactMolWt(molecule.to_rdkit())
# Weight via OpenEye: converts the molecule with to_openeye() and returns
# oechem.OECalculateMolecularWeight of it. oechem is imported locally, so the
# import only happens when this method runs.
def _get_openeye_weight(self, molecule: Molecule) -> float:
Calculate the weight of the molecule using openeye.
from openeye import oechem
return oechem.OECalculateMolecularWeight(molecule.to_openeye())
# Workflow component that checks every atom of a molecule against a list of
# allowed elements given as symbols or atomic numbers.
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class ElementFilter(ToolkitValidator, CustomWorkflowComponent):
Filter the molecules based on a list of allowed elements.
The `allowed_elements` attribute can take a list of either symbols or atomic numbers and will resolve them to a
common internal format as required.
Using atomic symbols or atomic numbers in components.
>>> from openff.qcsubmit.workflow_components import ElementFilter
>>> efil = ElementFilter()
# set the allowed elements to H,C,N,O
>>> efil.allowed_elements = ['H', 'C', 'N', 'O']
>>> efil.allowed_elements = [1, 6, 7, 8]
type: Literal["ElementFilter"] = "ElementFilter"
# Accepts ints and strings in one list; the default value is not visible in this
# listing.
allowed_elements: List[Union[int, str]] = Field(
description="The list of allowed elements as symbols or atomic number ints.",
# Per-item validator registered for allowed_elements; the callable it wraps is
# not visible in this listing.
_check_elements = validator("allowed_elements", each_item=True, allow_reuse=True)(
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return (
"Filter out molecules who contain elements not in the allowed element list."
# Human readable reason reported for molecules that this component rejects.
[docs] @classmethod
def fail_reason(cls) -> str:
return "Molecules contained elements not in the allowed elements list."
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Caches the allowed elements under self._cache["elements"]: each entry is looked
# up in SYMBOLS_TO_ELEMENTS and replaced by the mapped value when present,
# otherwise kept unchanged. The `result` argument is not used in the visible lines.
# NOTE(review): SYMBOLS_TO_ELEMENTS is not defined in the visible part of the
# file; presumably it maps element symbols to atomic numbers - confirm.
def _apply_init(self, result: ComponentResult) -> None:
self._cache["elements"]: list[Union[str, int]] = [
SYMBOLS_TO_ELEMENTS.get(element, element)
for element in self.allowed_elements
# Entry point: creates the result container and tests every atom's atomic_number
# for membership in the cached element list.
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
The common entry point of all workflow components which applies the workflow component to the given list of
molecules: The list of molecules the component should be applied on.
toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.
A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
that passed and were filtered by the component and details about the component which generated the result.
result = self._create_result(toolkit_registry=toolkit_registry)
# First lets convert the allowed_elements list to ints as this is what is stored in the atom object
# NOTE(review): no conversion happens at this point any more; the list is only
# read back from the cache filled by _apply_init.
_allowed_elements = self._cache["elements"]
# now apply the filter
for molecule in molecules:
for atom in molecule.atoms:
# Membership is tested against a list; the statements run for a disallowed
# atom and for a fully allowed molecule are not visible here.
if atom.atomic_number not in _allowed_elements:
return result
# Extends the parent provenance dictionary with the key "openff-units_elements"
# holding openff.units.__version__.
# NOTE(review): the descriptive text below mentions OpenMM, but the code records
# the openff.units version - the text looks stale.
[docs] def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict:
Generate version information for all of the software used during the running of this component.
A dictionary of all of the software used in the component along wither their version numbers.
The element class in OpenMM is used to match the elements so the OpenMM version is given.
import openff.units
provenance = super().provenance(toolkit_registry=toolkit_registry)
provenance["openff-units_elements"] = openff.units.__version__
return provenance
# Workflow component that labels each molecule with a force field and inspects
# the parameter ids assigned to it against either allowed_ids or filtered_ids.
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class CoverageFilter(ToolkitValidator, CustomWorkflowComponent):
Filters molecules based on the requested force field parameter ids.
* The options ``allowed_ids`` and ``filtered_ids`` are mutually exclusive.
type: Literal["CoverageFilter"] = "CoverageFilter"
# Optional set of ids; _apply checks that the molecule shares at least one id
# with it. The default value is not visible in this listing.
allowed_ids: Optional[Set[str]] = Field(
description="The SMIRKS parameter ids of the parameters which are allowed to be exercised by the molecules. "
"Molecules should use at least one of these ids to be passed by the component.",
# Optional set of ids; _apply checks whether the molecule uses any of them. The
# default value is not visible in this listing.
filtered_ids: Optional[Set[str]] = Field(
description="The SMIRKS parameter ids of the parameters which are not allowed to be exercised by the molecules.",
# Passed to the ForceField constructor in _apply_init; the default value is not
# visible in this listing.
forcefield: str = Field(
description="The name of the force field which we want to filter against.",
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return "Filter the molecules based on the requested FF allowed parameters."
# Human readable reason reported for molecules that this component rejects.
[docs] @classmethod
def fail_reason(cls) -> str:
return "The molecule was typed with disallowed parameters."
# Checks that exactly one of allowed_ids / filtered_ids is set: the first assert
# rejects "both None", the second rejects "both given". Returns values unchanged.
# NOTE(review): no decorator is visible; presumably this is registered as a
# pydantic root validator (root_validator is imported above) - confirm.
# NOTE(review): assert statements are removed when Python runs with -O, which
# would disable this check; the message also has unbalanced backticks and reads
# "must specified".
def _validate_mutually_exclusive(cls, values):
ids_to_include = values.get("allowed_ids")
ids_to_exclude = values.get("filtered_ids")
message = "exactly one of ``allowed_ids` and `filtered_ids` must specified."
assert ids_to_include is not None or ids_to_exclude is not None, message
assert ids_to_include is None or ids_to_exclude is None, message
return values
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Builds the ForceField once from self.forcefield and stores it in the cache so
# _apply can reuse it. The `result` argument is not used.
def _apply_init(self, result: ComponentResult) -> None:
self._cache["forcefield"] = ForceField(self.forcefield)
# Entry point: labels every molecule with the cached force field and compares the
# set of assigned parameter ids with the configured id sets.
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
Apply the filter to the list of molecules to remove any molecules typed by an id that is not allowed, i.e. not
included in the allowed list.
molecules: The list of molecules the component should be applied on.
toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.
A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
that passed and were filtered by the component and details about the component which generated the result.
result = self._create_result(toolkit_registry=toolkit_registry)
forcefield: ForceField = self._cache["forcefield"]
# type the molecules
for molecule in molecules:
# Only the first entry of the labelling result is used, matching the single
# molecule in the topology.
labels = forcefield.label_molecules(molecule.to_topology())[0]
# format the labels into a set
# Collects label.id from every value of every per-handler mapping in labels.
# NOTE(review): set([...]) builds a throwaway list; a set comprehension would do.
covered_types = set(
[label.id for types in labels.values() for label in types.values()]
# use set intersection to check coverage for unwanted and wanted types
# A None id set is replaced by an empty set, giving an empty intersection.
unwanted_types = covered_types.intersection(self.filtered_ids or set())
common_types = covered_types.intersection(self.allowed_ids or set())
# The statements executed inside the following branches are not visible in this
# listing.
if self.filtered_ids is not None and unwanted_types:
# the molecule has an unwanted parameter id
elif self.allowed_ids is not None and not common_types:
# the molecule does not contain the wanted parameter id
# the molecule contains a wanted or does not contain a filtered parameter id
return result
# Extends the parent provenance dictionary with the key "openforcefields" holding
# openforcefields.__version__; the package is imported locally.
[docs] def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict:
Generate version information for all of the software used during the running of this component.
A dictionary of all of the software used in the component along wither their version numbers.
import openforcefields
provenance = super().provenance(toolkit_registry=toolkit_registry)
provenance["openforcefields"] = openforcefields.__version__
return provenance
# Workflow component that compares the number of rotatable bonds of each molecule
# with an optional maximum and an optional minimum.
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class RotorFilter(ToolkitValidator, CustomWorkflowComponent):
Filters molecules based on the maximum and or minimum allowed number of rotatable bonds.
Rotatable bonds are torsions found using the `find_rotatable_bonds` method of the
openforcefield.topology.Molecule class.
type: Literal["RotorFilter"] = "RotorFilter"
# Optional upper limit; the default value is not visible in this listing.
maximum_rotors: Optional[int] = Field(
description="The maximum number of rotatable bonds allowed in the molecule, if `None` the molecule has no maximum limit on rotatable bonds.",
# Optional lower limit; the default value is not visible in this listing.
minimum_rotors: Optional[int] = Field(
description="The minimum number of rotatble bonds allowed in the molecule, if `None` the molecule has no limit to the minimum number of rotatble bonds.",
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return "Filter the molecules based on the maximum number of allowed rotatable bonds."
# Human readable reason reported for molecules that this component rejects.
# NOTE(review): the text only mentions "too many" although _apply also tests
# against minimum_rotors.
[docs] @classmethod
def fail_reason(cls) -> str:
return "The molecule has too many rotatable bonds."
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Raises ValueError when both limits are set and the maximum is below the
# minimum. The `result` argument is not used.
# NOTE(review): the guard uses truthiness, so a limit of 0 is treated like None
# and the consistency check is skipped (e.g. maximum 0 with minimum 2) - confirm
# whether `is not None` was intended.
def _apply_init(self, result: ComponentResult) -> None:
Validate the choice of minimum and maximum rotators.
if self.maximum_rotors and self.minimum_rotors:
if self.maximum_rotors < self.minimum_rotors:
raise ValueError(
"The maximum number of rotors should >= the minimum to ensure some molecules pass."
# Entry point: creates the result container, finds the rotatable bonds of every
# molecule and compares their count with the configured limits.
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
Apply the filter to the list of molecules to remove any molecules with more rotors then the maximum allowed
molecules: The list of molecules the component should be applied on.
toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.
A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
that passed and were filtered by the component and details about the component which generated the result.
# create the return
result = self._create_result(toolkit_registry=toolkit_registry)
for molecule in molecules:
# cache the rotatable bonds calc and only check fail conditions
# The arguments of this call are not visible in this listing.
rotatable_bonds = molecule.find_rotatable_bonds(
# NOTE(review): both tests start with a truthiness check, so maximum_rotors == 0
# never triggers the "too many" test and molecules with rotors are not caught by
# it; minimum_rotors == 0 is harmless because a count cannot be below 0.
# The statements executed inside these branches are not visible in this listing.
if self.maximum_rotors and len(rotatable_bonds) > self.maximum_rotors:
elif self.minimum_rotors and len(rotatable_bonds) < self.minimum_rotors:
return result
# Workflow component that matches SMARTS patterns against each molecule, using
# either a list of allowed substructures or a list of filtered substructures.
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class SmartsFilter(ToolkitValidator, CustomWorkflowComponent):
Filters molecules based on if they contain certain smarts substructures.
* The smarts tags used for filtering should be numerically tagged in order to work with the toolkit.
* The options ``allowed_substructures`` and ``filtered_substructures`` are mutually exclusive.
type: Literal["SmartsFilter"] = "SmartsFilter"
# Optional list of patterns; the default value is not visible in this listing.
allowed_substructures: Optional[List[str]] = Field(
description="The list of allowed substructures which should be tagged with indices.",
# Optional list of patterns; defaults to None.
filtered_substructures: Optional[List[str]] = Field(
None, description="The list of substructures which should be filtered."
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return "Filter molecules based on the given smarts patterns."
# Human readable reason reported for molecules that this component rejects.
[docs] @classmethod
def fail_reason(cls) -> str:
return "The molecule did/didn't contain the given smarts patterns."
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Validator registration; its field names and wrapped callable are not visible
# in this listing.
_check_smarts = validator(
# Checks that exactly one of allowed_substructures / filtered_substructures is
# set: the first assert rejects "both None", the second rejects "both given".
# Returns values unchanged.
# NOTE(review): no decorator is visible; presumably this is registered as a
# pydantic root validator (root_validator is imported above) - confirm.
# NOTE(review): assert statements are removed when Python runs with -O, which
# would disable this check; the message also has unbalanced backticks and reads
# "must specified".
def _validate_mutually_exclusive(cls, values):
allowed_substructures = values.get("allowed_substructures")
filtered_substructures = values.get("filtered_substructures")
message = "exactly one of ``allowed_substructures` and `filtered_substructures` must specified."
assert (
allowed_substructures is not None or filtered_substructures is not None
), message
assert allowed_substructures is None or filtered_substructures is None, message
return values
# Entry point: creates the result container and, per molecule, queries each
# configured pattern with chemical_environment_matches using the given registry.
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
Apply the filter to the input list of molecules removing those that match the filtered set or do not contain an
allowed substructure.
molecules: The list of molecules the component should be applied on.
toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.
A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
that passed and were filtered by the component and details about the component which generated the result.
result = self._create_result(toolkit_registry=toolkit_registry)
for molecule in molecules:
# The allowed list takes precedence; the filtered list is only consulted when
# allowed_substructures is None. The statements run on a match or on no match
# are not visible in this listing.
if self.allowed_substructures is not None:
for substructure in self.allowed_substructures:
if molecule.chemical_environment_matches(
query=substructure, toolkit_registry=toolkit_registry
# the molecule does not contain the allowed substructure so remove it
elif self.filtered_substructures is not None:
for substructure in self.filtered_substructures:
if molecule.chemical_environment_matches(
query=substructure, toolkit_registry=toolkit_registry
# there was no filtered substructure so keep the molecule
return result
# Workflow component that prunes the torsion scans stored on each molecule
# according to SMARTS patterns to include or to exclude.
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class ScanFilter(ToolkitValidator, CustomWorkflowComponent):
A filter to remove/include molecules from the workflow who have scans targeting the specified SMARTS.
Currently only checks against 1D scans.
type: Literal["ScanFilter"] = "ScanFilter"
# Optional list of patterns; the default value is not visible in this listing.
# NOTE(review): the adjacent literals join without spaces, so the description
# reads "...SMARTspatterns..." and "...mutuallyexclusive...".
scans_to_include: Optional[List[str]] = Field(
description="Only molecules with SCANs covering these SMARTs"
"patterns should be kept. This option is mutually"
"exclusive with ``scans_to_exclude``.",
# Optional list of patterns; the default value is not visible in this listing.
# NOTE(review): the adjacent literals join without spaces ("willbe",
# "mutallyexclusive") and "mutally" is misspelled.
scans_to_exclude: Optional[List[str]] = Field(
description="Any molecules with scans covering these SMARTs will"
"be removed from the dataset. This option is mutally"
"exclusive with ``scans_to_include``.",
# Per-item validator registered for both pattern lists; the callable it wraps is
# not visible in this listing.
_check_smarts = validator(
"scans_to_include", "scans_to_exclude", each_item=True, allow_reuse=True
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return "Filter molecules who have the desired/unwanted scans."
# Human readable reason reported for molecules that this component rejects.
[docs] @classmethod
def fail_reason(cls) -> str:
return "The molecule contained an unwanted or did not contain a desired dihedral/improper scan."
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Checks that exactly one of scans_to_include / scans_to_exclude is set: the
# first assert rejects "both None", the second rejects "both given". Returns
# values unchanged.
# NOTE(review): the method name is spelled "mutally", unlike the equivalent
# validators on the sibling filters.
# NOTE(review): no decorator is visible; presumably this is registered as a
# pydantic root validator - confirm. assert statements are removed under -O.
def _validate_mutally_exclusive(cls, values):
scans_to_include = values.get("scans_to_include")
scans_to_exclude = values.get("scans_to_exclude")
message = (
"exactly one of `scans_to_include` and `scans_to_exclude` must be specified"
assert scans_to_include is not None or scans_to_exclude is not None, message
assert scans_to_include is None or scans_to_exclude is None, message
return values
# Entry point: for every molecule, reads the torsion indexer from
# molecule.properties["dihedrals"], matches the target patterns and deletes
# selected torsions from the indexer (the stored object is modified in place).
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
Keep or remove scans based on the list of torsions to include or remove.
result = self._create_result(toolkit_registry=toolkit_registry)
# Whichever list is truthy supplies the patterns to match.
# NOTE(review): an empty scans_to_exclude list is falsy and falls through to
# scans_to_include, which the validator then requires to be None - the loop over
# target_environments would fail in that case; confirm.
target_environments = self.scans_to_exclude or self.scans_to_include
for molecule in molecules:
torsion_indexer = molecule.properties.get("dihedrals", None)
# if no dihedrals are tagged remove the molecule
# The statements executed inside this branch are not visible in this listing.
if torsion_indexer is None or torsion_indexer.n_torsions == 0:
all_matches = set()
for env in target_environments:
# get all matches as a list of sorted central bonds as they are stored this way
matches = molecule.chemical_environment_matches(
query=env, toolkit_registry=toolkit_registry
for match in matches:
# Two-index matches are used as they are; longer matches are reduced to the
# indices at positions 1 and 2 (presumably the central bond of a four-atom
# torsion match - confirm). How the result is stored in all_matches is not
# visible here.
match = match if len(match) == 2 else match[1:3]
# now we either remove any torsions in this list or any missing from it based on include/exclude
to_remove = []
# Include mode walks the stored torsions and looks for those absent from
# all_matches; otherwise the matches are checked against the stored torsions.
# The statements that fill to_remove are not visible in this listing.
if self.scans_to_include is not None:
for center_bond in torsion_indexer.torsions.keys():
if center_bond not in all_matches:
for center_bond in all_matches:
if center_bond in torsion_indexer.torsions.keys():
# now remove
for bond in to_remove:
del torsion_indexer.torsions[bond]
# if we have no torsions left filter the molecule
# NOTE(review): get_dihedrals is used without a call; this only works as an
# emptiness test if it is a property - if it is a plain method the condition is
# always False. Confirm against the torsion indexer class.
if not torsion_indexer.get_dihedrals:
return result
# Workflow component that tests the total charge of each molecule against a list
# of charges to include or a list of charges to exclude.
# NOTE(review): this listing looks like a flattened documentation export - there
# is no indentation and closing brackets, default values and some branch bodies
# are not visible. Confirm control flow against the original module.
[docs]class ChargeFilter(ToolkitValidator, CustomWorkflowComponent):
Filter molecules if their formal charge is not in the `charges_to_include` list or is in the `charges_to_exclude` list.
type: Literal["ChargeFilter"] = "ChargeFilter"
# Optional list of integer charges; the default value is not visible in this
# listing.
# NOTE(review): the adjacent literals join without a space, so the description
# reads "...in the dataset.This option...".
charges_to_include: Optional[List[int]] = Field(
description="The list of net molecule formal charges which are allowed in the dataset."
"This option is mutually exclusive with ``charges_to_exclude``.",
# Optional list of integer charges; the default value is not visible in this
# listing.
# NOTE(review): the adjacent literals join without a space, so the description
# reads "...from the dataset.This option...".
charges_to_exclude: Optional[List[int]] = Field(
description="The list of net molecule formal charges which are to be removed from the dataset."
"This option is mutually exclusive with ``charges_to_include``.",
# Human readable summary of what the component does.
[docs] @classmethod
def description(cls) -> str:
return "Filter molecules by net formal charge."
# Human readable reason reported for molecules that this component rejects.
[docs] @classmethod
def fail_reason(cls) -> str:
return "The molecules net formal charge was not requested or was in the `charges_to_exclude`."
# Declares that the component may be processed in parallel and does not
# produce duplicate molecules.
[docs] @classmethod
def properties(cls) -> ComponentProperties:
return ComponentProperties(process_parallel=True, produces_duplicates=False)
# Checks that exactly one of charges_to_include / charges_to_exclude is set: the
# first assert rejects "both None", the second rejects "both given". Returns
# values unchanged.
# NOTE(review): no decorator is visible; presumably this is registered as a
# pydantic root validator (root_validator is imported above) - confirm.
# NOTE(review): assert statements are removed when Python runs with -O, which
# would disable this check; the message also has unbalanced backticks and reads
# "must specified".
def _validate_mutually_exclusive(cls, values):
charges_to_include = values.get("charges_to_include")
charges_to_exclude = values.get("charges_to_exclude")
message = "exactly one of ``charges_to_include` and `charges_to_exclude` must specified."
assert charges_to_include is not None or charges_to_exclude is not None, message
assert charges_to_include is None or charges_to_exclude is None, message
return values
# Entry point: creates the result container and evaluates the total charge of
# every molecule against the configured list.
def _apply(
self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
) -> ComponentResult:
Filter molecules based on their net formal charge
result = self._create_result(toolkit_registry=toolkit_registry)
for molecule in molecules:
# Magnitude of the total charge expressed in elementary charge units.
# NOTE(review): the magnitude may be a float; membership in the int lists then
# relies on exact equality (1.0 == 1) - confirm the charges are always integral.
total_charge = molecule.total_charge.m_as(unit.elementary_charge)
# True when an include list is set and the charge is missing from it, or when an
# exclude list is set and the charge is in it. The statements executed for each
# outcome are not visible in this listing.
if (
self.charges_to_include is not None
and total_charge not in self.charges_to_include
) or (
self.charges_to_exclude is not None
and total_charge in self.charges_to_exclude
return result