Source code for openff.qcsubmit.workflow_components.state_enumeration

"""
Components to expand stereochemistry and tautomeric states of molecules.
"""

from typing import List, Optional, Tuple

from openff.toolkit.topology import Molecule
from openff.toolkit.utils import ToolkitRegistry
from qcelemental.util import which_import
from typing_extensions import Literal

from openff.qcsubmit._pydantic import Field
from openff.qcsubmit.common_structures import ComponentProperties
from openff.qcsubmit.utils import (
    check_missing_stereo,
    get_symmetry_classes,
    get_symmetry_group,
)
from openff.qcsubmit.workflow_components.base_component import (
    CustomWorkflowComponent,
    ToolkitValidator,
)
from openff.qcsubmit.workflow_components.utils import (
    ComponentResult,
    ImproperScan,
    Scan1D,
    Scan2D,
    TorsionIndexer,
)


[docs]class EnumerateTautomers(ToolkitValidator, CustomWorkflowComponent): """ Enumerate the tautomers of a molecule using the backend toolkits through the OFFTK. """ type: Literal["EnumerateTautomers"] = "EnumerateTautomers" # custom settings for the class max_tautomers: int = Field( 20, description="The maximum number of tautomers that should be generated." )
[docs] @classmethod def description(cls) -> str: return "Enumerate the tautomers of a molecule if possible, returning the input plus any new molecules."
[docs] @classmethod def fail_reason(cls) -> str: return "The molecule tautomers could not be enumerated."
[docs] @classmethod def properties(cls) -> ComponentProperties: return ComponentProperties(process_parallel=True, produces_duplicates=True)
def _apply( self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry ) -> ComponentResult: """ Enumerate tautomers of the input molecule. The input molecules tautomers are enumerated using the desired backend toolkit and are returned along with the input molecule. Parameters: molecules: The list of molecules the component should be applied on. toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits. Returns: A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules that passed and were filtered by the component and details about the component which generated the result. """ result = self._create_result(toolkit_registry=toolkit_registry) for molecule in molecules: try: tautomers = molecule.enumerate_tautomers( max_states=self.max_tautomers, toolkit_registry=toolkit_registry ) for tautomer in tautomers: result.add_molecule(tautomer) # add the input molecule result.add_molecule(molecule=molecule) except Exception: result.filter_molecule(molecule) return result
[docs]class EnumerateStereoisomers(ToolkitValidator, CustomWorkflowComponent): """ Enumerate the stereo centers and bonds of a molecule using the backend toolkits through the OFFTK, only well defined molecules are returned by this component, this is check via a OFFTK round trip. """ type: Literal["EnumerateStereoisomers"] = "EnumerateStereoisomers" undefined_only: bool = Field( False, description="If we should only enumerate parts of the molecule with undefined stereochemistry or all stereochemistry.", ) max_isomers: int = Field( 20, description="The maximum number of stereoisomers to be generated." ) rationalise: bool = Field( True, description="If we should check that the resulting molecules are physically possible by attempting to generate conformers for them.", )
[docs] @classmethod def description(cls) -> str: return "Enumerate the stereo centers and bonds of the molecule, returing the input molecule if valid and any new molecules."
[docs] @classmethod def fail_reason(cls) -> str: return "The molecules stereo centers or bonds could not be enumerated."
[docs] @classmethod def properties(cls) -> ComponentProperties: return ComponentProperties(process_parallel=True, produces_duplicates=True)
def _apply( self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry ) -> ComponentResult: """ Enumerate stereo centers and bonds of the input molecule Parameters: molecules: The list of molecules the component should be applied on. toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits. Returns: A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules that passed and were filtered by the component and details about the component which generated the result. """ result = self._create_result(toolkit_registry=toolkit_registry) for molecule in molecules: try: isomers = molecule.enumerate_stereoisomers( undefined_only=self.undefined_only, max_isomers=self.max_isomers, rationalise=self.rationalise, toolkit_registry=toolkit_registry, ) # now check that each molecule is well defined for isomer in isomers: if not check_missing_stereo(isomer): result.add_molecule(isomer) # now check the input # rationalise if needed if self.rationalise: molecule.generate_conformers(n_conformers=1) if not check_missing_stereo(molecule): result.add_molecule(molecule) except Exception: result.filter_molecule(molecule) return result
[docs]class EnumerateProtomers(ToolkitValidator, CustomWorkflowComponent): """ Enumerate the formal charges of the input molecule using the backend toolkits through the OFFTK. Important: Only Openeye is supported so far. """ type: Literal["EnumerateProtomers"] = "EnumerateProtomers" max_states: int = Field( 10, description="The maximum number of states that should be generated." )
[docs] @classmethod def is_available(cls) -> bool: tk = super().is_available() oe = which_import( ".oechem", return_bool=True, raise_error=True, package="openeye", raise_msg="Please install via `conda install openeye-toolkits -c openeye`", ) return tk and oe
[docs] @classmethod def description(cls) -> str: return "Enumerate the protomers of the molecule, returning the input molecule and any new molecules."
[docs] @classmethod def fail_reason(cls) -> str: return "The molecules formal charges could not be enumerated possibly due to a missing toolkit."
[docs] @classmethod def properties(cls) -> ComponentProperties: return ComponentProperties(process_parallel=True, produces_duplicates=True)
def _apply( self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry ) -> ComponentResult: """ Enumerate the formal charges of the molecule. Parameters: molecules: The list of molecules the component should be applied on. toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits. Returns: A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules that passed and were filtered by the component and details about the component which generated the result. Important: This is only possible using Openeye so far, if openeye is not available this step will fail. """ result = self._create_result(toolkit_registry=toolkit_registry) for molecule in molecules: try: protomers = molecule.enumerate_protomers(max_states=self.max_states) for protomer in protomers: result.add_molecule(protomer) result.add_molecule(molecule) except Exception: # if openeye is not available this will fail result.filter_molecule(molecule) return result
[docs]class ScanEnumerator(ToolkitValidator, CustomWorkflowComponent): """ This module will tag any matching substructures for scanning, useful for torsiondrive datasets. """ type: Literal["ScanEnumerator"] = "ScanEnumerator" torsion_scans: List[Scan1D] = Field( [], description="A list of scan objects which describes the scan range and scan increment" "that should be used with the associated smarts pattern.", ) double_torsion_scans: List[Scan2D] = Field( [], description="A list of double scan objects which describes the scan ranges and scan increments," "that should be used with each of the smarts patterns.", ) improper_scans: List[ImproperScan] = Field( [], description="A list of improper scan objects which describes the scan range and scan increment" "that should be used with the smarts pattern.", )
[docs] @classmethod def description(cls) -> str: return "Tag any matched substructures for scanning."
[docs] @classmethod def fail_reason(cls) -> str: return "The molecule contained no substructure matches."
[docs] @classmethod def properties(cls) -> ComponentProperties: return ComponentProperties(process_parallel=True, produces_duplicates=False)
[docs] def add_torsion_scan( self, smarts: str, scan_rage: Optional[Tuple[int, int]] = None, scan_increment: int = 15, ) -> None: """ Add a targeted 1D torsion scan to the scan enumerator. Args: smarts: The numerically tagged SMARTs pattern that should be used to identify the torsion atoms. scan_rage: The angle in degrees the torsion should be scanned between, from low to high scan_increment: The value in degrees between each grid point in the scan. """ self.torsion_scans.append( Scan1D( smarts1=smarts, scan_range1=scan_rage, scan_increment=[scan_increment] ) )
[docs] def add_double_torsion( self, smarts1: str, smarts2: str, scan_range1: Optional[Tuple[int, int]] = None, scan_range2: Optional[Tuple[int, int]] = None, scan_increments: List[int] = (15, 15), ) -> None: """ Add a targeted 2D torsion scan to the scan enumerator. Args: smarts1: The numerically tagged SMARTs pattern that should be used to identify the first torsion atoms. smarts2: The numerically tagged SMARTs pattern that should be used to identify the second torsion atoms. scan_range1: The angle in degrees the first torsion should be scanned between, from low to high scan_range2: The angle in degrees the second torsion should be scanned between, from low to high scan_increments: A list of the values in degrees between each grid point in the scans. """ self.double_torsion_scans.append( Scan2D( smarts1=smarts1, smarts2=smarts2, scan_range1=scan_range1, scan_range2=scan_range2, scan_increment=scan_increments, ) )
[docs] def add_improper_torsion( self, smarts: str, central_smarts: str, scan_range: Optional[Tuple[int, int]] = None, scan_increment: int = 15, ) -> None: """ Add a targeted Improper torsion to the scan enumerator. Args: smarts: The numerically tagged SMARTs pattern which describes the entire improper. central_smarts: The numerically tagged SMARTs pattern which identifies the central atom in the improper. scan_range: The angles in degrees the improper should be scanned between, from low to high. scan_increment: The value in degrees between each grid point in the scan. """ self.improper_scans.append( ImproperScan( smarts=smarts, central_smarts=central_smarts, scan_range=scan_range, scan_increment=[scan_increment], ) )
def _get_unique_torsions( self, matches: List[Tuple[int, int, int, int]], symmetry_classes: List[int] ) -> List[Tuple[int, int, int, int]]: """ Use the symmetry classes to condense the matches into unique torsions in the molecule by symmetry. """ torsions_by_symmetry = { tuple(sorted(symmetry_classes[idx] for idx in match[1:3])): match for match in matches } unique_torsions = [*torsions_by_symmetry.values()] return unique_torsions def _apply( self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry ) -> ComponentResult: """ Tag any dihedrals which match the defined set of targets in the enumerator. """ result = self._create_result(toolkit_registry=toolkit_registry) for molecule in molecules: symmetry_classes = get_symmetry_classes(molecule) molecule.properties["dihedrals"] = TorsionIndexer() self._tag_torsions( molecule, symmetry_classes, toolkit_registry=toolkit_registry ) self._tag_double_torsions( molecule, symmetry_classes, toolkit_registry=toolkit_registry ) self._tag_improper_torsions( molecule, symmetry_classes, toolkit_registry=toolkit_registry ) indexer = molecule.properties["dihedrals"] if len(indexer.get_dihedrals) == 0: result.filter_molecule(molecule) result.add_molecule(molecule) return result def _tag_torsions( self, molecule: Molecule, symmetry_classes: List[int], toolkit_registry: ToolkitRegistry, ) -> None: """ For each of the torsions in the torsion list try and tag only one in the molecule. """ indexer: TorsionIndexer = molecule.properties["dihedrals"] for torsion in self.torsion_scans: matches = molecule.chemical_environment_matches( query=torsion.smarts1, toolkit_registry=toolkit_registry ) unique_torsions = self._get_unique_torsions( matches=matches, symmetry_classes=symmetry_classes ) for tagged_torsion in unique_torsions: indexer.add_torsion( torsion=tagged_torsion, scan_range=torsion.scan_range1, scan_increment=torsion.scan_increment, symmetry_group=get_symmetry_group( atom_group=tagged_torsion[1:3], symmetry_classes=symmetry_classes, ), ) def _tag_double_torsions( self, molecule: Molecule, symmetry_classes: List[int], toolkit_registry: ToolkitRegistry, ) -> None: """ For each double torsion in the list try and tag the combination in the molecule. """ indexer: TorsionIndexer = molecule.properties["dihedrals"] for double_torsion in self.double_torsion_scans: matches1 = molecule.chemical_environment_matches( query=double_torsion.smarts1, toolkit_registry=toolkit_registry ) matches2 = molecule.chemical_environment_matches( query=double_torsion.smarts2, toolkit_registry=toolkit_registry ) unique_torsions1 = self._get_unique_torsions( matches=matches1, symmetry_classes=symmetry_classes ) unique_torsions2 = self._get_unique_torsions( matches=matches2, symmetry_classes=symmetry_classes ) for tagged_torsion1 in unique_torsions1: symmetry_group1 = get_symmetry_group( atom_group=tagged_torsion1[1:3], symmetry_classes=symmetry_classes ) for tagged_torsion2 in unique_torsions2: symmetry_group2 = get_symmetry_group( atom_group=tagged_torsion2[1:3], symmetry_classes=symmetry_classes, ) indexer.add_double_torsion( torsion1=tagged_torsion1, torsion2=tagged_torsion2, symmetry_group1=symmetry_group1, symmetry_group2=symmetry_group2, scan_range1=double_torsion.scan_range1, scan_range2=double_torsion.scan_range2, scan_increment=double_torsion.scan_increment, ) def _tag_improper_torsions( self, molecule: Molecule, symmetry_classes: List[int], toolkit_registry: ToolkitRegistry, ) -> None: """ For each improper torsion in the list try and tag the combination in the molecule. """ indexer: TorsionIndexer = molecule.properties["dihedrals"] for improper in self.improper_scans: matches = molecule.chemical_environment_matches( query=improper.smarts, toolkit_registry=toolkit_registry ) unique_torsions = self._get_unique_torsions( matches=matches, symmetry_classes=symmetry_classes ) central_atoms = molecule.chemical_environment_matches( improper.central_smarts ) for tagged_torsion in unique_torsions: symmetry_group = get_symmetry_group( atom_group=tagged_torsion, symmetry_classes=symmetry_classes ) for atom in central_atoms: if atom[0] in tagged_torsion: indexer.add_improper( central_atom=atom[0], improper=tagged_torsion, symmetry_group=symmetry_group, scan_range=improper.scan_range, scan_increment=improper.scan_increment, ) break