Source code for openff.qcsubmit.datasets.entries

"""
All of the individual dataset entry types are defined here.
"""

from typing import Any, Dict, List, Optional, Tuple

import networkx as nx
import numpy as np
import openff.toolkit.topology as off
import qcelemental as qcel
import qcelemental.models
from pydantic import Field, validator
from typing_extensions import Literal

try:
    from openmm import unit
except ImportError:
    from simtk import unit

from openff.qcsubmit.common_structures import (
    DatasetConfig,
    MoleculeAttributes,
    TDSettings,
)
from openff.qcsubmit.constraints import Constraints, ConstraintType
from openff.qcsubmit.exceptions import ConstraintError, DihedralConnectionError
from openff.qcsubmit.utils.smirnoff import (
    combine_openff_molecules,
    split_openff_molecule,
)
from openff.qcsubmit.validators import (
    check_connectivity,
    check_constraints,
    check_improper_connection,
    check_linear_torsions,
    check_torsion_connection,
)


[docs]class DatasetEntry(DatasetConfig): """ A basic data class to construct the datasets which holds any information about the molecule and options used in the qcarchive calculation. Note: * ``extras`` are passed into the qcelemental.models.Molecule on creation. * any extras that should passed to the calculation like extra constrains should be passed to ``keywords``. """ index: str = Field( ..., description="The index name the molecule will be stored under in QCArchive. Note that if multipule geometries are provided the index will be augmented with a value indecating the conformer number so -0, -1.", ) initial_molecules: List[qcel.models.Molecule] = Field( ..., description="A list of QCElemental Molecule objects which contain the geometries to be used as inputs for the calculation.", ) attributes: MoleculeAttributes = Field( ..., description="The complete set of required cmiles attributes for the molecule.", ) extras: Optional[Dict[str, Any]] = Field( {}, description="Any extra information that should be injected into the QCElemental models before being submited like the cmiles information.", ) keywords: Optional[Dict[str, Any]] = Field( {}, description="Any extra keywords that should be used in the QCArchive calculation should be passed here.", ) def __init__(self, off_molecule: Optional[off.Molecule] = None, **kwargs): """ Init the dataclass handling conversions of the molecule first. This is needed to make sure the extras are passed into the qcschema molecule. """ extras = kwargs.get("extras", {}) # a place holder to check for multiple components molecule_ids, charges = None, None # if we get an off_molecule we need to convert it if off_molecule is not None: split_molecules = split_openff_molecule(molecule=off_molecule) if len(split_molecules) > 1: # recombine the molecule in the correct ordering off_molecule = combine_openff_molecules(split_molecules) # we need to remake the attributes for the new ordering kwargs["attributes"] = MoleculeAttributes.from_openff_molecule( molecule=off_molecule ) molecule_ids = [ list(sorted(ids)) for ids in nx.connected_components(off_molecule.to_networkx()) ] charges = [ sum( [ off_molecule.atoms[atom].formal_charge.value_in_unit( unit.elementary_charge ) for atom in graph ] ) for graph in molecule_ids ] if off_molecule.n_conformers == 0: off_molecule.generate_conformers(n_conformers=1) schema_mols = [ off_molecule.to_qcschema(conformer=conformer, extras=extras) for conformer in range(off_molecule.n_conformers) ] kwargs["initial_molecules"] = schema_mols super().__init__(**kwargs) # now we need to process all of the initial molecules to make sure the cmiles is present # and force c1 symmetry, we also need to split the molecule into fragments if there are multiple present initial_molecules = [] for mol in self.initial_molecules: extras = mol.extras or {} extras[ "canonical_isomeric_explicit_hydrogen_mapped_smiles" ] = self.attributes.canonical_isomeric_explicit_hydrogen_mapped_smiles mol_data = mol.dict() mol_data["extras"] = extras # put into strict c1 symmetry mol_data["fix_symmetry"] = "c1" # add fragment information if we have multiple components if molecule_ids is not None and len(molecule_ids) > 1: mol_data["fragments"] = molecule_ids mol_data["fragment_charges"] = charges initial_molecules.append(qcel.models.Molecule.parse_obj(mol_data)) # try and parse the schema to make sure the fragment order is correct qcel.molparse.from_schema(molschema=initial_molecules[0].dict(), verbose=True) # now assign the new molecules self.initial_molecules = initial_molecules
[docs] def get_off_molecule(self, include_conformers: bool = True) -> off.Molecule: """Build and openforcefield.topology.Molecule representation of the input molecule. Parameters: include_conformers: If `True` all of the input conformers are included else they are dropped. """ molecule = self.attributes.to_openff_molecule() molecule.name = self.index if include_conformers: for conformer in self.initial_molecules: geometry = unit.Quantity(np.array(conformer.geometry), unit=unit.bohr) molecule.add_conformer(geometry.in_units_of(unit.angstrom)) return molecule
[docs]class OptimizationEntry(DatasetEntry): """ An optimization dataset specific entry class which can handle constraints. """ constraints: Constraints = Field( Constraints(), description="Any constraints which should be used during an optimization.", ) def __init__(self, off_molecule: Optional[off.Molecule] = None, **kwargs): """ Here we handle the constraints before calling the super. """ # if the constraints are in the keywords move them out for validation if "constraints" in kwargs["keywords"]: constraint_dict = kwargs["keywords"].pop("constraints") constraints = Constraints(**constraint_dict) kwargs["constraints"] = constraints super().__init__(off_molecule, **kwargs) # validate any constraints being added check_constraints( constraints=self.constraints, molecule=self.get_off_molecule(include_conformers=False), )
[docs] def add_constraint( self, constraint: Literal["set", "freeze"], constraint_type: ConstraintType, indices: List[int], bonded: bool = True, **kwargs, ) -> None: """ Add new constraint of the given type. Parameters: constraint: The major type of constraint, freeze or set constraint_type: the constraint sub type, angle, distance etc indices: The atom indices the constraint should be placed on bonded: If the constraint is intended to be put a bonded set of atoms kwargs: Any extra information needed by the constraint, for the set class they need a value `value=float` """ if constraint.lower() == "freeze": self.constraints.add_freeze_constraint( constraint_type=constraint_type, indices=indices, bonded=bonded ) elif constraint.lower() == "set": self.constraints.add_set_constraint( constraint_type=constraint_type, indices=indices, bonded=bonded, **kwargs, ) else: raise ConstraintError( f"The constraint {constraint} is not available please chose from freeze or set." ) # run the constraint check check_constraints( constraints=self.constraints, molecule=self.get_off_molecule(include_conformers=False), )
@property def formatted_keywords(self) -> Dict[str, Any]: """ Format the keywords with the constraints values. """ import copy if self.constraints.has_constraints: constraints = self.constraints.dict() keywords = copy.deepcopy(self.keywords) keywords["constraints"] = constraints return keywords else: return self.keywords
[docs]class TorsionDriveEntry(OptimizationEntry): """ A Torsiondrive dataset specific class which can check dihedral indices and store torsiondrive specific settings with built in validation. """ dihedrals: List[Tuple[int, int, int, int]] = Field( ..., description="The list of dihedrals that should be driven, currently only 1D or 2D torsions are supported.", ) keywords: Optional[TDSettings] = Field( TDSettings(), description="The torsiondrive keyword settings which can be used to overwrite the general global settings used in the dataset allowing for finner control.", ) # we do not yet support multi component torsion drives so validate # we have to define the validation this way due to pydantic # <https://pydantic-docs.helpmanual.io/usage/validators/#subclass-validators-and-each_item> @validator("initial_molecules") def _check_conectivity( cls, molecules: List[qcelemental.models.Molecule] ) -> List[qcelemental.models.Molecule]: for mol in molecules: check_connectivity(mol) return molecules @property def formatted_keywords(self) -> Dict[str, Any]: """Format the keywords with constraints.""" if self.constraints.has_constraints: keywords = self.keywords.additional_keywords keywords["constraints"] = self.constraints.dict() return keywords else: return self.keywords.additional_keywords def __init__(self, off_molecule: Optional[off.Molecule] = None, **kwargs): super().__init__(off_molecule, **kwargs) # now validate the torsions check proper first off_molecule = self.get_off_molecule(include_conformers=False) # now validate the dihedrals for torsion in self.dihedrals: # check for linear torsions check_linear_torsions(torsion, off_molecule) try: check_torsion_connection(torsion=torsion, molecule=off_molecule) except DihedralConnectionError: # if this fails as well raise try: check_improper_connection(improper=torsion, molecule=off_molecule) except DihedralConnectionError: raise DihedralConnectionError( f"The dihedral {torsion} for molecule {off_molecule} is not a valid" f" proper/improper torsion." )
[docs]class FilterEntry(DatasetConfig): """ A basic data class that contains information on components run in a workflow and the associated molecules which were removed by it. """ component: str = Field( ..., description="The name of the component ran, this should be one of the components registered with qcsubmit.", ) component_settings: Dict[str, Any] = Field( ..., description="The run time settings of the component used to filter the molecules.", ) component_provenance: Dict[str, str] = Field( ..., description="A dictionary of the version information of all dependencies of the component.", ) molecules: List[str] def __init__(self, off_molecules: List[off.Molecule] = None, **kwargs): """ Init the dataclass handling conversions of the molecule first. """ if off_molecules is not None: molecules = [ molecule.to_smiles(isomeric=True, explicit_hydrogens=True) for molecule in off_molecules ] kwargs["molecules"] = molecules super().__init__(**kwargs)
[docs] def add_molecule(self, molecule: off.Molecule) -> None: """ Add a molecule to this filter. """ self.molecules.append( molecule.to_smiles(isomeric=True, explicit_hydrogens=True) )