Source code for openff.qcsubmit.factories

import abc
import os
from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union

import tqdm
from openff.toolkit import topology as off
from openff.toolkit.utils import GLOBAL_TOOLKIT_REGISTRY, ToolkitRegistry
from pydantic import Field, validator
from qcportal.models.common_models import DriverEnum
from typing_extensions import Literal

from openff.qcsubmit.common_structures import CommonBase, Metadata
from openff.qcsubmit.datasets import (
    BasicDataset,
    OptimizationDataset,
    TorsiondriveDataset,
)
from openff.qcsubmit.exceptions import (
    ComponentRequirementError,
    DihedralConnectionError,
    DriverError,
    InvalidWorkflowComponentError,
    LinearTorsionError,
    MissingWorkflowComponentError,
    MolecularComplexError,
)
from openff.qcsubmit.procedures import GeometricProcedure
from openff.qcsubmit.serializers import deserialize, serialize
from openff.qcsubmit.workflow_components import (
    ComponentResult,
    Components,
    CustomWorkflowComponent,
    get_component,
)

T = TypeVar("T", bound=BasicDataset)


[docs]class BaseDatasetFactory(CommonBase, abc.ABC): """ The Base factory which all other dataset factories should inherit from. """ type: Literal["BaseDatasetFactory"] = Field( "BaseDatasetFactory", description="The type of dataset factory which corresponds to the dataset made.", ) workflow: List[Components] = Field( [], description="The set of workflow components and their settings which will be executed in order on the input molecules to make the dataset.", )
[docs] @classmethod def from_file(cls, file_name: str): """Create a factory from the serialised model file.""" data = deserialize(file_name=file_name) return cls(**data)
@classmethod @abc.abstractmethod def _dataset_type(cls) -> Type[BasicDataset]: """Get the type of dataset made by this factory.""" ... def _molecular_complex_filter( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: """ Make a molecular complex dummy filter and filter a molecule. """ try: dataset.filtered_molecules["MolecularComplexRemoval"].add_molecule( molecule=molecule ) except KeyError: dataset.filter_molecules( molecules=[ molecule, ], component="MolecularComplexRemoval", component_settings={}, component_provenance=self.provenance(toolkit_registry=toolkit_registry), ) def _no_dihedrals_filter( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: """ Fail a molecule for having no tagged torsions. """ try: dataset.filtered_molecules["NoDihedralRemoval"].add_molecule( molecule=molecule ) except KeyError: dataset.filter_molecules( molecules=[ molecule, ], component="NoDihedralRemoval", component_settings={}, component_provenance=self.provenance(toolkit_registry=toolkit_registry), )
[docs] def provenance(self, toolkit_registry: ToolkitRegistry) -> Dict[str, str]: """ Create the provenance of openff-qcsubmit that created the molecule input data. Returns: A dict of the provenance information. Important: We can not check which toolkit was used to generate the Cmiles data but we know that openeye will always be used first when available. """ from openff import qcsubmit, toolkit provenance = { "openff-qcsubmit": qcsubmit.__version__, "openff-toolkit": toolkit.__version__, } # add all toolkit wrappers in the same order they are registered for tk in toolkit_registry.registered_toolkits: if tk.__class__.__name__ != "BuiltInToolkitWrapper": provenance[tk.__class__.__name__] = tk.toolkit_version return provenance
[docs] def clear_workflow(self) -> None: """ Reset the workflow to be empty. """ self.workflow = []
[docs] def add_workflow_components(self, *components: Components) -> None: """ Take the workflow components validate them then insert them into the workflow. Args: components: A list of or an individual workflow component which is to be validated and added to the current workflow. Raises: InvalidWorkflowComponentError: If an invalid workflow component is attempted to be added to the workflow. """ for component in components: if issubclass(type(component), CustomWorkflowComponent): try: component.is_available() self.workflow.append(component) except ModuleNotFoundError as e: raise ComponentRequirementError( f"The component {component.type} could not be added to " f"the workflow due to missing requirements" ) from e else: raise InvalidWorkflowComponentError( f"Component {component} rejected as it is not a sub " f"class of CustomWorkflowComponent." )
[docs] def get_workflow_components(self, component_name: str) -> List[Components]: """ Find any workflow components with this component name. Args: component_name: The name of the component to be gathered from the workflow. Returns: A list of instances of the requested component from the workflow. Raises: MissingWorkflowComponentError: If the component could not be found by its component name in the workflow. """ components = [ component for component in self.workflow if component.type.lower() == component_name.lower() ] if not components: raise MissingWorkflowComponentError( f"The requested component {component_name} " f"was not registered into the workflow." ) return components
[docs] def remove_workflow_component(self, component_name: str) -> None: """ Find and remove any components via its type attribute. Args: component_name: The name of the component to be gathered from the workflow. Raises: MissingWorkflowComponentError: If the component could not be found by its component name in the workflow. """ components = self.get_workflow_components(component_name=component_name) if not components: raise MissingWorkflowComponentError( f"The requested component {component_name} " f"could not be removed as it was not registered." ) for component in components: self.workflow.remove(component)
[docs] def import_workflow( self, workflow: Union[str, Dict], clear_existing: bool = True ) -> None: """ Instance the workflow from a workflow object or from an input file. Args: workflow: The name of the file the workflow should be created from or a workflow dictionary. clear_existing: If the current workflow should be deleted and replaced or extended. """ if clear_existing: self.clear_workflow() if isinstance(workflow, str): workflow = deserialize(workflow) if isinstance(workflow, dict): # this should be a workflow dict that we can just load # if this is from the settings file make sure to unpack the dict first. workflow = workflow.get("workflow", workflow) # load in the workflow for value in workflow.values(): # check if this is not the first instance of the component component = get_component(value["type"]) self.add_workflow_components(component.parse_obj(value))
[docs] def export_workflow(self, file_name: str) -> None: """ Export the workflow components and their settings to file so that they can be loaded later. Args: file_name: The name of the file the workflow should be exported to. Raises: UnsupportedFiletypeError: If the file type is not supported. """ # grab only the workflow workflow = self.dict(include={"workflow"}) serialize(serializable=workflow, file_name=file_name)
[docs] def export(self, file_name: str) -> None: """ Export the whole factory to file including settings and workflow. Args: file_name: The name of the file the factory should be exported to. """ serialize(serializable=self.dict(), file_name=file_name)
[docs] def export_settings(self, file_name: str) -> None: """ Export the current model to file this will include the workflow as well along with each components settings. Args: file_name: The name of the file the settings and workflow should be exported to. Raises: UnsupportedFiletypeError: When the file type requested is not supported. """ serialize(serializable=self, file_name=file_name)
[docs] def import_settings( self, settings: Union[str, Dict], clear_workflow: bool = True ) -> None: """ Import settings and workflow from a file. Args: settings: The name of the file the settings should be extracted from or the reference to a settings dictionary. clear_workflow: If the current workflow should be extended or replaced. """ if isinstance(settings, str): data = deserialize(settings) # take the workflow out and import the settings workflow = data.pop("workflow") elif isinstance(settings, dict): workflow = settings.pop("workflow") data = settings else: raise RuntimeError( "The input type could not be converted into a settings dictionary." ) # now set the factory meta settings for key, value in data.items(): if hasattr(self, key): setattr(self, key, value) else: continue # now we want to add the workflow back in self.import_workflow(workflow=workflow, clear_existing=clear_workflow)
def _create_initial_component_result( self, molecules: Union[str, off.Molecule, List[off.Molecule]], toolkit_registry: ToolkitRegistry, ) -> ComponentResult: """ Create the initial component result which is used for de-duplication. Args: molecules: The input molecules which can be a file name or list of molecule instances Returns: The initial component result used to start the workflow. """ # check if we have been given an input file with molecules inside if isinstance(molecules, str): if os.path.isfile(molecules): workflow_molecules = ComponentResult( component_name=self.type, component_description={"type": self.type}, component_provenance=self.provenance( toolkit_registry=toolkit_registry ), input_file=molecules, ) elif os.path.isdir(molecules): workflow_molecules = ComponentResult( component_name=self.type, component_description={"type": self.type}, component_provenance=self.provenance( toolkit_registry=toolkit_registry ), input_directory=molecules, ) else: raise FileNotFoundError(f"The input {molecules} could not be found.") elif isinstance(molecules, off.Molecule): workflow_molecules = ComponentResult( component_name=self.type, component_description={"type": self.type}, component_provenance=self.provenance(toolkit_registry=toolkit_registry), molecules=[ molecules, ], ) else: workflow_molecules = ComponentResult( component_name=self.type, component_description={"type": self.type}, component_provenance=self.provenance(toolkit_registry=toolkit_registry), molecules=molecules, ) return workflow_molecules @abc.abstractmethod def _process_molecule( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: """ Process the molecules returned from running the workflow into a new dataset. """ ...
[docs] def create_dataset( self, dataset_name: str, molecules: Union[str, List[off.Molecule], off.Molecule], description: str, tagline: str, metadata: Optional[Metadata] = None, processors: Optional[int] = None, toolkit_registry: Optional[ToolkitRegistry] = None, verbose: bool = True, ) -> T: """ Process the input molecules through the given workflow then create and populate the corresponding dataset class which acts as a local representation for the collection and tasks to be performed in qcarchive. Args: dataset_name: The name that will be given to the collection on submission to an archive instance. molecules: The list of molecules which should be processed by the workflow and added to the dataset, this can also be a file name which is to be unpacked by the openforcefield toolkit. description: A string describing the dataset this should be detail the purpose of the dataset and outline the selection method of the molecules. tagline: A short tagline description which will be displayed with collection name in the QCArchive. metadata: Any metadata which should be associated with this dataset this can be changed from the default after making the dataset. processors: The number of processors available to the workflow, note None will use all available processors. toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits and the order in which they should be queried for functionality.If ``None`` is passed the default global registry will be used with all installed toolkits. verbose: If True a progress bar for each workflow component will be shown. Returns: A dataset instance populated with the molecules that have passed through the workflow. """ # TODO set up a logging system to report the components if toolkit_registry is None: toolkit_registry = GLOBAL_TOOLKIT_REGISTRY # create an initial component result workflow_molecules = self._create_initial_component_result( molecules=molecules, toolkit_registry=toolkit_registry ) # create the dataset # first we need to instance the dataset and assign the metadata object_meta = self.dict(exclude={"workflow", "type"}) # the only data missing is the collection name so add it here. object_meta["dataset_name"] = dataset_name object_meta["description"] = description object_meta["provenance"] = self.provenance(toolkit_registry=toolkit_registry) object_meta["dataset_tagline"] = tagline if metadata is not None: object_meta["metadata"] = metadata.dict() dataset = self._dataset_type().parse_obj(object_meta) # if the workflow has components run it if self.workflow: for component in self.workflow: workflow_molecules = component.apply( molecules=workflow_molecules.molecules, processors=processors, toolkit_registry=toolkit_registry, verbose=verbose, ) dataset.filter_molecules( molecules=workflow_molecules.filtered, component=workflow_molecules.component_name, component_settings=workflow_molecules.component_description, component_provenance=workflow_molecules.component_provenance, ) # now add the molecules to the correct attributes for molecule in tqdm.tqdm( workflow_molecules.molecules, total=len(workflow_molecules.molecules), ncols=80, desc="{:30s}".format("Preparation"), disable=not verbose, ): self._process_molecule( dataset=dataset, molecule=molecule, toolkit_registry=toolkit_registry ) return dataset
[docs] def create_index(self, molecule: off.Molecule) -> str: """ Create an index for the current molecule. Args: molecule: The molecule for which the dataset index will be generated. Returns: The molecule name or the canonical isomeric smiles for the molecule if the name is not assigned or is blank. Important: Each dataset can have a different indexing system depending on the data, in this basic dataset each conformer of a molecule is expanded into its own entry separately indexed entry. This is handled by the dataset however so we just generate a general index for the molecule before adding to the dataset. """ name = molecule.name.lower() if name and name != "unnamed": return name else: return molecule.to_smiles( isomeric=True, explicit_hydrogens=False, mapped=False )
[docs]class BasicDatasetFactory(BaseDatasetFactory): """ Basic dataset generator factory used to build work flows using workflow components before executing them to generate a dataset. The basic dataset is ideal for large collections of single point calculations using any of the energy, gradient or hessian drivers. """ type: Literal["BasicDatasetFactory"] = "BasicDatasetFactory" @classmethod def _dataset_type(cls) -> Type[BasicDataset]: return BasicDataset def _process_molecule( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: # always put the cmiles in the extras from what we have just calculated to ensure correct order extras = molecule.properties.get("extras", {}) keywords = molecule.properties.get("keywords", None) # now submit the molecule try: dataset.add_molecule( index=self.create_index(molecule=molecule), molecule=molecule, extras=extras if bool(extras) else None, keywords=keywords, ) except MolecularComplexError: self._molecular_complex_filter( dataset=dataset, molecule=molecule, toolkit_registry=toolkit_registry )
[docs]class OptimizationDatasetFactory(BasicDatasetFactory): """ This factory produces OptimisationDatasets which include settings associated with geometric which is used to run the optimisation. """ type: Literal["OptimizationDatasetFactory"] = Field( "OptimizationDatasetFactory", description="The type of dataset factory which corresponds to the dataset made.", ) # set the driver to be gradient this should not be changed when running driver: DriverEnum = DriverEnum.gradient # use the default geometric settings during optimisation optimization_program: GeometricProcedure = Field( GeometricProcedure(), description="The optimization program and settings that should be used.", ) @classmethod def _dataset_type(cls) -> Type[OptimizationDataset]: return OptimizationDataset @validator("driver") def _check_driver(cls, driver): """Make sure that the driver is set to gradient only and not changed.""" available_drivers = ["gradient"] if driver not in available_drivers: raise DriverError( f"The requested driver ({driver}) is not in the list of available " f"drivers: {available_drivers}" ) return driver
[docs]class TorsiondriveDatasetFactory(OptimizationDatasetFactory): """ This factory produces TorsiondriveDatasets which include settings associated with geometric which is used to run the optimisation. """ type: Literal["TorsiondriveDatasetFactory"] = Field( "TorsiondriveDatasetFactory", description="The type of dataset factory which corresponds to the dataset made.", ) grid_spacing: List[int] = Field( [15], description="The grid spcaing that should be used for all torsiondrives, this can be overwriten on a per entry basis.", ) energy_upper_limit: float = Field( 0.05, description="The upper energy limit to spawn new optimizations in the torsiondrive.", ) dihedral_ranges: Optional[List[Tuple[int, int]]] = Field( None, description="The scan range that should be used for each torsiondrive, this can be overwriten on a per entry basis.", ) energy_decrease_thresh: Optional[float] = Field( None, description="The energy lower threshold to trigger new optimizations in the torsiondrive.", ) # set the default settings for a torsiondrive calculation. optimization_program = GeometricProcedure.parse_obj( {"enforce": 0.1, "reset": True, "qccnv": True, "epsilon": 0.0} ) @classmethod def _dataset_type(cls) -> Type[TorsiondriveDataset]: return TorsiondriveDataset def _linear_torsion_filter( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: """ Mock a linear torsion filtering component and filter the molecule. """ try: dataset.filtered_molecules["LinearTorsionRemoval"].add_molecule( molecule=molecule ) except KeyError: dataset.filter_molecules( molecules=[ molecule, ], component="LinearTorsionRemoval", component_settings={}, component_provenance=self.provenance(toolkit_registry=toolkit_registry), ) def _unconnected_torsion_filter( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: """Mock a unconnected torsion filtering component and filter the molecule.""" try: dataset.filtered_molecules["UnconnectedTorsionRemoval"].add_molecule( molecule=molecule ) except KeyError: dataset.filter_molecules( molecules=[ molecule, ], component="UnconnectedTorsionRemoval", component_settings={}, component_provenance=self.provenance(toolkit_registry=toolkit_registry), ) def _process_molecule( self, dataset: T, molecule: off.Molecule, toolkit_registry: ToolkitRegistry ) -> None: # check for extras and keywords extras = molecule.properties.get("extras", {}) keywords = molecule.properties.get("keywords", {}) # now check for the dihedrals if "dihedrals" in molecule.properties: # first do 1-D torsions for dihedral in molecule.properties["dihedrals"].get_dihedrals: # create the index molecule.properties["atom_map"] = dihedral.get_atom_map index = self.create_index(molecule=molecule) del molecule.properties["atom_map"] # get the dihedrals to scan dihedrals = dihedral.get_dihedrals keywords["dihedral_ranges"] = dihedral.get_scan_range keywords["grid_spacing"] = dihedral.scan_increment try: dataset.add_molecule( index=index, molecule=molecule, dihedrals=dihedrals, keywords=keywords, extras=extras, ) except DihedralConnectionError: self._unconnected_torsion_filter( dataset=dataset, molecule=molecule, toolkit_registry=toolkit_registry, ) except LinearTorsionError: self._linear_torsion_filter( dataset=dataset, molecule=molecule, toolkit_registry=toolkit_registry, ) except MolecularComplexError: self._molecular_complex_filter( dataset=dataset, molecule=molecule, toolkit_registry=toolkit_registry, ) else: # if no dihedrals have been tagged we need to fail the molecule self._no_dihedrals_filter( dataset=dataset, molecule=molecule, toolkit_registry=toolkit_registry )
[docs] def create_index(self, molecule: off.Molecule) -> str: """ Create a specific torsion index for the molecule, this will use the atom map on the molecule. Args: molecule: The molecule for which the dataset index will be generated. Returns: The canonical mapped isomeric smiles, where the mapped indices are on the atoms in the torsion. Important: This dataset uses a non-standard indexing with 4 atom mapped indices representing the atoms in the torsion to be rotated. """ assert "atom_map" in molecule.properties.keys() assert ( len(molecule.properties["atom_map"]) == 4 or len(molecule.properties["atom_map"]) == 8 ) index = molecule.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True) return index
def _detect_linear_torsions(self, molecule: off.Molecule) -> List: """ Try and find any linear bonds in the molecule with torsions that should not be driven. Parameters: molecule: An openforcefield molecule instance Returns: A list of the central bond tuples in the molecule which should not be driven, this can then be compared against the torsions which have been selected. """ # this is based on the past submissions to QCarchive which have failed # highlight the central bond of a linear torsion linear_smarts = "[*!D1:1]~[$(*#*)&D2,$(C=*)&D2:2]" matches = molecule.chemical_environment_matches(linear_smarts) return matches