Source code for openff.nagl.toolkits.rdkit

import copy
import functools
from typing import Tuple, TYPE_CHECKING, List, Union

import numpy as np

from openff.units import unit


from openff.nagl.toolkits._base import NAGLToolkitWrapperBase
from openff.toolkit.utils.rdkit_wrapper import RDKitToolkitWrapper
from openff.nagl.utils._types import HybridizationType

if TYPE_CHECKING:
    from openff.toolkit.topology import Molecule


[docs]class NAGLRDKitToolkitWrapper(NAGLToolkitWrapperBase, RDKitToolkitWrapper): name = "rdkit" def _run_normalization_reactions( self, molecule, normalization_reactions: Tuple[str, ...] = tuple(), max_iter: int = 200, ): """ Normalize the bond orders and charges of a molecule by applying a series of transformations to it. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to normalize normalization_reactions: Tuple[str, ...], default=tuple() A tuple of SMARTS reaction strings representing the reactions to apply to the molecule. max_iter: int, default=200 The maximum number of iterations to perform for each transformation. Returns ------- normalized_molecule: openff.toolkit.topology.Molecule The normalized molecule. This is a new molecule object, not the same as the input molecule. """ from rdkit import Chem from rdkit.Chem import rdChemReactions rdmol = self.to_rdkit(molecule=molecule) original_smiles = new_smiles = Chem.MolToSmiles(rdmol) # track atoms for i, atom in enumerate(rdmol.GetAtoms(), 1): atom.SetAtomMapNum(i) for reaction_smarts in normalization_reactions: reaction = rdChemReactions.ReactionFromSmarts(reaction_smarts) n_iter = 0 keep_changing = True while n_iter < max_iter and keep_changing: n_iter += 1 old_smiles = new_smiles products = reaction.RunReactants((rdmol,), maxProducts=1) if not products: break try: ((rdmol,),) = products except ValueError: raise ValueError( f"Reaction produced multiple products: {reaction_smarts}" ) for atom in rdmol.GetAtoms(): atom.SetAtomMapNum(atom.GetIntProp("react_atom_idx") + 1) new_smiles = Chem.MolToSmiles(Chem.AddHs(rdmol)) # stop changing when smiles converges to same product keep_changing = new_smiles != old_smiles if n_iter >= max_iter and keep_changing: raise ValueError( f"Reaction {reaction_smarts} did not converge after " f"{max_iter} iterations for molecule {original_smiles}" ) new_mol = self.from_rdkit( rdmol, allow_undefined_stereo=True, _cls=molecule.__class__, ) mapping = new_mol.properties.pop("atom_map") adjusted_mapping = dict((current, new - 1) for current, new in mapping.items()) return new_mol.remap(adjusted_mapping, current_to_new=True)
[docs] def get_molecule_hybridizations( self, molecule: "Molecule" ) -> List[HybridizationType]: """ Get the hybridization of each atom in a molecule. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to get the hybridizations of. Returns ------- hybridizations: List[HybridizationType] The hybridization of each atom in the molecule. """ from rdkit.Chem import rdchem conversions = { rdchem.HybridizationType.S: HybridizationType.OTHER, rdchem.HybridizationType.SP: HybridizationType.SP, rdchem.HybridizationType.SP2: HybridizationType.SP2, rdchem.HybridizationType.SP3: HybridizationType.SP3, rdchem.HybridizationType.SP3D: HybridizationType.SP3D, rdchem.HybridizationType.SP3D2: HybridizationType.SP3D2, rdchem.HybridizationType.OTHER: HybridizationType.OTHER, rdchem.HybridizationType.UNSPECIFIED: HybridizationType.OTHER, } hybridizations = [] rdmol = self.to_rdkit(molecule) for atom in rdmol.GetAtoms(): hybridization = atom.GetHybridization() try: hybridizations.append(conversions[hybridization]) except KeyError: raise ValueError(f"Unknown hybridization {hybridization}") return hybridizations
[docs] def to_rdkit(self, molecule: "Molecule"): try: return super().to_rdkit(molecule) except AssertionError: # OpenEye just accepts all stereochemistry # unlike RDKit which e.g. does not allow stereogenic bonds in a ring < 8 # try patching via smiles # smiles = "C1CC/C=C/(CC1)Cl" mapped_smiles = molecule.to_smiles(mapped=True) mol2 = Molecule.from_mapped_smiles( mapped_smiles, allow_undefined_stereo=True, toolkit_registry=RDKitToolkitWrapper(), ) mol2_bonds = { (bd.atom1_index, bd.atom2_index): bd.stereochemistry for bd in mol2.bonds } molecule = copy.deepcopy(molecule) for bond in molecule.bonds: bond._stereochemistry = mol2_bonds[bond.atom1_index, bond.atom2_index] return molecule.to_rdkit()
[docs] @classmethod def smiles_to_mapped_smiles(cls, smiles: str) -> str: """Convert a SMILES string to a mapped SMILES string. Parameters ---------- smiles: str The SMILES string to convert. Returns ------- mapped_smiles: str The mapped SMILES string. """ molecule = cls.from_smiles(smiles, allow_undefined_stereo=True) return cls.to_smiles(molecule, mapped=True)
[docs] @classmethod def mapped_smiles_to_smiles(cls, mapped_smiles: str) -> str: """Convert a mapped SMILES string to a SMILES string. Parameters ---------- mapped_smiles: str The mapped SMILES string to convert. Returns ------- smiles: str The SMILES string. """ molecule = cls.from_smiles(mapped_smiles, allow_undefined_stereo=True) return cls.to_smiles(molecule)
[docs] @classmethod def stream_molecules_from_sdf_file( cls, file: str, as_smiles: bool = False, mapped_smiles: bool = False, explicit_hydrogens: bool = True, **kwargs, ): """ Stream molecules from an SDF file. Parameters ---------- file: str The path to the SDF file to stream molecules from. as_smiles: bool, default=False If True, return a SMILES string instead of an OpenFF Molecule mapped_smiles: bool, default=False If True, return a SMILES string with atom indices as atom map numbers. explicit_hydrogens: bool, default=True If True, keep explicit hydrogens in SMILES outputs. Returns ------- molecules: Generator[openff.toolkit.topology.Molecule or str] """ from rdkit import Chem wrapper = cls() if as_smiles: if mapped_smiles: converter = cls.smiles_to_mapped_smiles else: converter = functools.partial( Chem.MolToSmiles, allHsExplicit=explicit_hydrogens ) else: converter = functools.partial( wrapper.from_rdkit, allow_undefined_stereo=True, _cls=Molecule ) if file.endswith(".gz"): file = file[:-3] for rdmol in Chem.SupplierFromFilename( file, removeHs=False, sanitize=True, strictParsing=True ): if rdmol is not None: yield converter(rdmol)
[docs] def stream_molecules_to_file(self, file: str): """ Stream molecules to an SDF file using a context manager. Parameters ---------- file: str The path to the SDF file to stream molecules to. Examples -------- >>> from openff.toolkit.topology import Molecule >>> from openff.toolkit.utils.toolkits import RDKitToolkitWrapper >>> toolkit_wrapper = RDKitToolkitWrapper() >>> molecule1 = Molecule.from_smiles("CCO") >>> molecule2 = Molecule.from_smiles("CCC") >>> with toolkit_wrapper.stream_molecules_to_file("molecules.sdf") as writer: ... writer(molecule1) ... writer(molecule2) """ from rdkit import Chem stream = Chem.SDWriter(file) def writer(molecule): rdmol = self.to_rdkit(molecule) n_conf = rdmol.GetNumConformers() if not n_conf: stream.write(rdmol) else: for i in range(n_conf): stream.write(rdmol, confId=i) stream.flush() yield writer stream.close()
[docs] def get_best_rmsd( self, molecule: "Molecule", reference_conformer: Union[np.ndarray, unit.Quantity], target_conformer: Union[np.ndarray, unit.Quantity], ) -> unit.Quantity: """ Compute the lowest all-atom RMSD between a reference and target conformer, allowing for symmetry-equivalent atoms to be permuted. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute the RMSD for reference_conformer: np.ndarray or openff.units.unit.Quantity The reference conformer to compare to the target conformer. If a numpy array, it is assumed to be in units of angstrom. target_conformer: np.ndarray or openff.units.unit.Quantity The target conformer to compare to the reference conformer. If a numpy array, it is assumed to be in units of angstrom. Returns ------- rmsd: unit.Quantity Examples -------- >>> from openff.units import unit >>> from openff.toolkit.topology import Molecule >>> from openff.toolkit.utils.toolkits import RDKitToolkitWrapper >>> toolkit_wrapper = RDKitToolkitWrapper() >>> molecule = Molecule.from_smiles("CCCCO") >>> molecule.generate_conformers(n_conformers=2) >>> rmsd = toolkit_wrapper.get_best_rmsd(molecule, molecule.conformers[0], molecule.conformers[1]) >>> print(f"RMSD in angstrom: {rmsd.m_as(unit.angstrom)}") """ from rdkit.Chem import rdMolAlign if not isinstance(reference_conformer, unit.Quantity): reference_conformer = reference_conformer * unit.angstrom if not isinstance(target_conformer, unit.Quantity): target_conformer = target_conformer * unit.angstrom mol1 = copy.deepcopy(molecule) mol1._conformers = [reference_conformer] mol2 = copy.deepcopy(molecule) mol2._conformers = [target_conformer] rdmol1 = self.to_rdkit(mol1) rdmol2 = self.to_rdkit(mol2) rmsd = rdMolAlign.GetBestRMS(rdmol1, rdmol2) return rmsd * unit.angstrom
[docs] def get_atoms_are_in_ring_size( self, molecule: "Molecule", ring_size: int, ) -> List[bool]: """ Determine whether each atom in a molecule is in a ring of a given size. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute ring perception for ring_size: int The size of the ring to check for. Returns ------- in_ring_size: List[bool] """ rdmol = self.to_rdkit(molecule) in_ring_size = [atom.IsInRingSize(ring_size) for atom in rdmol.GetAtoms()] return in_ring_size
[docs] def get_bonds_are_in_ring_size( self, molecule: "Molecule", ring_size: int, ) -> List[bool]: """ Determine whether each bond in a molecule is in a ring of a given size. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute ring perception for ring_size: int The size of the ring to check for. Returns ------- in_ring_size: List[bool] Bonds are in the same order as the molecule's ``bonds`` attribute. """ rdmol = self.to_rdkit(molecule) in_ring_size = [] for bond in molecule.bonds: rdbond = rdmol.GetBondBetweenAtoms(bond.atom1_index, bond.atom2_index) in_ring_size.append(rdbond.IsInRingSize(ring_size)) return in_ring_size
[docs] def assign_partial_charges( self, molecule, partial_charge_method=None, use_conformers=None, strict_n_conformers=False, normalize_partial_charges=True, _cls=None, ): """ Compute partial charges with RDKit, and assign the new values to the partial_charges attribute. .. warning :: This API is experimental and subject to change. Parameters ---------- molecule : openff.toolkit.topology.Molecule Molecule for which partial charges are to be computed partial_charge_method : str, optional, default=None The charge model to use. One of ['mmff94', 'gasteiger']. If None, 'mmff94' will be used. * 'mmff94': Applies partial charges using the Merck Molecular Force Field (MMFF). This method does not make use of conformers, and hence ``use_conformers`` and ``strict_n_conformers`` will not impact the partial charges produced. use_conformers : iterable of unit-wrapped numpy arrays, each with shape (n_atoms, 3) and dimension of distance. Optional, default = None Coordinates to use for partial charge calculation. If None, an appropriate number of conformers will be generated. strict_n_conformers : bool, default=False Whether to raise an exception if an invalid number of conformers is provided for the given charge method. If this is False and an invalid number of conformers is found, a warning will be raised. normalize_partial_charges : bool, default=True Whether to offset partial charges so that they sum to the total formal charge of the molecule. This is used to prevent accumulation of rounding errors when the partial charge generation method has low precision. _cls : class Molecule constructor Raises ------ ChargeMethodUnavailableError if the requested charge method can not be handled by this toolkit ChargeCalculationError if the charge method is supported by this toolkit, but fails """ # TODO: Remove when superseded by toolkit update import numpy as np from rdkit.Chem import AllChem if not partial_charge_method == "gasteiger": return super().assign_partial_charges( molecule, partial_charge_method=partial_charge_method, use_conformers=use_conformers, strict_n_conformers=strict_n_conformers, normalize_partial_charges=normalize_partial_charges, _cls=_cls, ) rdkit_molecule = self.to_rdkit(molecule) AllChem.ComputeGasteigerCharges(rdkit_molecule) charges = [ float(rdatom.GetProp("_GasteigerCharge")) for rdatom in rdkit_molecule.GetAtoms() ] charges = np.asarray(charges) molecule.partial_charges = unit.Quantity(charges, unit.elementary_charge) if normalize_partial_charges: molecule._normalize_partial_charges()
[docs] def calculate_circular_fingerprint_similarity( self, molecule: "Molecule", reference_molecule: "Molecule", radius: int = 3, nbits: int = 2048, ) -> float: """ Compute the similarity between two molecules using a fingerprinting method. Uses a Morgan fingerprint with RDKit and a Circular fingerprint with OpenEye. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute the fingerprint for. reference_molecule: openff.toolkit.topology.Molecule The molecule to compute the fingerprint for. radius: int, default 3 The radius of the fingerprint to use. nbits: int, default 2048 The length of the fingerprint to use. Not used in RDKit. Returns ------- similarity: float The Dice similarity between the two molecules. """ from rdkit import Chem from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint from rdkit.DataStructs import DiceSimilarity rdmol1 = self.to_rdkit(molecule) rdmol2 = self.to_rdkit(reference_molecule) fp1 = GetMorganFingerprint(rdmol1, radius) fp2 = GetMorganFingerprint(rdmol2, radius) return DiceSimilarity(fp1, fp2)