Source code for openff.nagl.toolkits.rdkit
import copy
import functools
from typing import Tuple, TYPE_CHECKING, List, Union
import numpy as np
from openff.units import unit
from openff.nagl.toolkits._base import NAGLToolkitWrapperBase
from openff.toolkit.utils.rdkit_wrapper import RDKitToolkitWrapper
from openff.nagl.utils._types import HybridizationType
if TYPE_CHECKING:
from openff.toolkit.topology import Molecule
[docs]class NAGLRDKitToolkitWrapper(NAGLToolkitWrapperBase, RDKitToolkitWrapper):
name = "rdkit"
def _run_normalization_reactions(
self,
molecule,
normalization_reactions: Tuple[str, ...] = tuple(),
max_iter: int = 200,
):
"""
Normalize the bond orders and charges of a molecule by applying a series of transformations to it.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to normalize
normalization_reactions: Tuple[str, ...], default=tuple()
A tuple of SMARTS reaction strings representing the reactions to apply to the molecule.
max_iter: int, default=200
The maximum number of iterations to perform for each transformation.
Returns
-------
normalized_molecule: openff.toolkit.topology.Molecule
The normalized molecule. This is a new molecule object, not the same as the input molecule.
"""
from rdkit import Chem
from rdkit.Chem import rdChemReactions
rdmol = self.to_rdkit(molecule=molecule)
original_smiles = new_smiles = Chem.MolToSmiles(rdmol)
# track atoms
for i, atom in enumerate(rdmol.GetAtoms(), 1):
atom.SetAtomMapNum(i)
for reaction_smarts in normalization_reactions:
reaction = rdChemReactions.ReactionFromSmarts(reaction_smarts)
n_iter = 0
keep_changing = True
while n_iter < max_iter and keep_changing:
n_iter += 1
old_smiles = new_smiles
products = reaction.RunReactants((rdmol,), maxProducts=1)
if not products:
break
try:
((rdmol,),) = products
except ValueError:
raise ValueError(
f"Reaction produced multiple products: {reaction_smarts}"
)
for atom in rdmol.GetAtoms():
atom.SetAtomMapNum(atom.GetIntProp("react_atom_idx") + 1)
new_smiles = Chem.MolToSmiles(Chem.AddHs(rdmol))
# stop changing when smiles converges to same product
keep_changing = new_smiles != old_smiles
if n_iter >= max_iter and keep_changing:
raise ValueError(
f"Reaction {reaction_smarts} did not converge after "
f"{max_iter} iterations for molecule {original_smiles}"
)
new_mol = self.from_rdkit(
rdmol,
allow_undefined_stereo=True,
_cls=molecule.__class__,
)
mapping = new_mol.properties.pop("atom_map")
adjusted_mapping = dict((current, new - 1) for current, new in mapping.items())
return new_mol.remap(adjusted_mapping, current_to_new=True)
[docs] def get_molecule_hybridizations(
self, molecule: "Molecule"
) -> List[HybridizationType]:
"""
Get the hybridization of each atom in a molecule.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to get the hybridizations of.
Returns
-------
hybridizations: List[HybridizationType]
The hybridization of each atom in the molecule.
"""
from rdkit.Chem import rdchem
conversions = {
rdchem.HybridizationType.S: HybridizationType.OTHER,
rdchem.HybridizationType.SP: HybridizationType.SP,
rdchem.HybridizationType.SP2: HybridizationType.SP2,
rdchem.HybridizationType.SP3: HybridizationType.SP3,
rdchem.HybridizationType.SP3D: HybridizationType.SP3D,
rdchem.HybridizationType.SP3D2: HybridizationType.SP3D2,
rdchem.HybridizationType.OTHER: HybridizationType.OTHER,
rdchem.HybridizationType.UNSPECIFIED: HybridizationType.OTHER,
}
hybridizations = []
rdmol = self.to_rdkit(molecule)
for atom in rdmol.GetAtoms():
hybridization = atom.GetHybridization()
try:
hybridizations.append(conversions[hybridization])
except KeyError:
raise ValueError(f"Unknown hybridization {hybridization}")
return hybridizations
[docs] def to_rdkit(self, molecule: "Molecule"):
try:
return super().to_rdkit(molecule)
except AssertionError:
# OpenEye just accepts all stereochemistry
# unlike RDKit which e.g. does not allow stereogenic bonds in a ring < 8
# try patching via smiles
# smiles = "C1CC/C=C/(CC1)Cl"
mapped_smiles = molecule.to_smiles(mapped=True)
mol2 = Molecule.from_mapped_smiles(
mapped_smiles,
allow_undefined_stereo=True,
toolkit_registry=RDKitToolkitWrapper(),
)
mol2_bonds = {
(bd.atom1_index, bd.atom2_index): bd.stereochemistry
for bd in mol2.bonds
}
molecule = copy.deepcopy(molecule)
for bond in molecule.bonds:
bond._stereochemistry = mol2_bonds[bond.atom1_index, bond.atom2_index]
return molecule.to_rdkit()
[docs] @classmethod
def smiles_to_mapped_smiles(cls, smiles: str) -> str:
"""Convert a SMILES string to a mapped SMILES string.
Parameters
----------
smiles: str
The SMILES string to convert.
Returns
-------
mapped_smiles: str
The mapped SMILES string.
"""
molecule = cls.from_smiles(smiles, allow_undefined_stereo=True)
return cls.to_smiles(molecule, mapped=True)
[docs] @classmethod
def mapped_smiles_to_smiles(cls, mapped_smiles: str) -> str:
"""Convert a mapped SMILES string to a SMILES string.
Parameters
----------
mapped_smiles: str
The mapped SMILES string to convert.
Returns
-------
smiles: str
The SMILES string.
"""
molecule = cls.from_smiles(mapped_smiles, allow_undefined_stereo=True)
return cls.to_smiles(molecule)
[docs] @classmethod
def stream_molecules_from_sdf_file(
cls,
file: str,
as_smiles: bool = False,
mapped_smiles: bool = False,
explicit_hydrogens: bool = True,
**kwargs,
):
"""
Stream molecules from an SDF file.
Parameters
----------
file: str
The path to the SDF file to stream molecules from.
as_smiles: bool, default=False
If True, return a SMILES string instead of an OpenFF Molecule
mapped_smiles: bool, default=False
If True, return a SMILES string with atom indices as atom map numbers.
explicit_hydrogens: bool, default=True
If True, keep explicit hydrogens in SMILES outputs.
Returns
-------
molecules: Generator[openff.toolkit.topology.Molecule or str]
"""
from rdkit import Chem
wrapper = cls()
if as_smiles:
if mapped_smiles:
converter = cls.smiles_to_mapped_smiles
else:
converter = functools.partial(
Chem.MolToSmiles, allHsExplicit=explicit_hydrogens
)
else:
converter = functools.partial(
wrapper.from_rdkit,
allow_undefined_stereo=True,
_cls=Molecule
)
if file.endswith(".gz"):
file = file[:-3]
for rdmol in Chem.SupplierFromFilename(
file, removeHs=False, sanitize=True, strictParsing=True
):
if rdmol is not None:
yield converter(rdmol)
[docs] def stream_molecules_to_file(self, file: str):
"""
Stream molecules to an SDF file using a context manager.
Parameters
----------
file: str
The path to the SDF file to stream molecules to.
Examples
--------
>>> from openff.toolkit.topology import Molecule
>>> from openff.toolkit.utils.toolkits import RDKitToolkitWrapper
>>> toolkit_wrapper = RDKitToolkitWrapper()
>>> molecule1 = Molecule.from_smiles("CCO")
>>> molecule2 = Molecule.from_smiles("CCC")
>>> with toolkit_wrapper.stream_molecules_to_file("molecules.sdf") as writer:
... writer(molecule1)
... writer(molecule2)
"""
from rdkit import Chem
stream = Chem.SDWriter(file)
def writer(molecule):
rdmol = self.to_rdkit(molecule)
n_conf = rdmol.GetNumConformers()
if not n_conf:
stream.write(rdmol)
else:
for i in range(n_conf):
stream.write(rdmol, confId=i)
stream.flush()
yield writer
stream.close()
[docs] def get_best_rmsd(
self,
molecule: "Molecule",
reference_conformer: Union[np.ndarray, unit.Quantity],
target_conformer: Union[np.ndarray, unit.Quantity],
) -> unit.Quantity:
"""
Compute the lowest all-atom RMSD between a reference and target conformer,
allowing for symmetry-equivalent atoms to be permuted.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute the RMSD for
reference_conformer: np.ndarray or openff.units.unit.Quantity
The reference conformer to compare to the target conformer.
If a numpy array, it is assumed to be in units of angstrom.
target_conformer: np.ndarray or openff.units.unit.Quantity
The target conformer to compare to the reference conformer.
If a numpy array, it is assumed to be in units of angstrom.
Returns
-------
rmsd: unit.Quantity
Examples
--------
>>> from openff.units import unit
>>> from openff.toolkit.topology import Molecule
>>> from openff.toolkit.utils.toolkits import RDKitToolkitWrapper
>>> toolkit_wrapper = RDKitToolkitWrapper()
>>> molecule = Molecule.from_smiles("CCCCO")
>>> molecule.generate_conformers(n_conformers=2)
>>> rmsd = toolkit_wrapper.get_best_rmsd(molecule, molecule.conformers[0], molecule.conformers[1])
>>> print(f"RMSD in angstrom: {rmsd.m_as(unit.angstrom)}")
"""
from rdkit.Chem import rdMolAlign
if not isinstance(reference_conformer, unit.Quantity):
reference_conformer = reference_conformer * unit.angstrom
if not isinstance(target_conformer, unit.Quantity):
target_conformer = target_conformer * unit.angstrom
mol1 = copy.deepcopy(molecule)
mol1._conformers = [reference_conformer]
mol2 = copy.deepcopy(molecule)
mol2._conformers = [target_conformer]
rdmol1 = self.to_rdkit(mol1)
rdmol2 = self.to_rdkit(mol2)
rmsd = rdMolAlign.GetBestRMS(rdmol1, rdmol2)
return rmsd * unit.angstrom
[docs] def get_atoms_are_in_ring_size(
self,
molecule: "Molecule",
ring_size: int,
) -> List[bool]:
"""
Determine whether each atom in a molecule is in a ring of a given size.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute ring perception for
ring_size: int
The size of the ring to check for.
Returns
-------
in_ring_size: List[bool]
"""
rdmol = self.to_rdkit(molecule)
in_ring_size = [atom.IsInRingSize(ring_size) for atom in rdmol.GetAtoms()]
return in_ring_size
[docs] def get_bonds_are_in_ring_size(
self,
molecule: "Molecule",
ring_size: int,
) -> List[bool]:
"""
Determine whether each bond in a molecule is in a ring of a given size.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute ring perception for
ring_size: int
The size of the ring to check for.
Returns
-------
in_ring_size: List[bool]
Bonds are in the same order as the molecule's ``bonds`` attribute.
"""
rdmol = self.to_rdkit(molecule)
in_ring_size = []
for bond in molecule.bonds:
rdbond = rdmol.GetBondBetweenAtoms(bond.atom1_index, bond.atom2_index)
in_ring_size.append(rdbond.IsInRingSize(ring_size))
return in_ring_size
[docs] def assign_partial_charges(
self,
molecule,
partial_charge_method=None,
use_conformers=None,
strict_n_conformers=False,
normalize_partial_charges=True,
_cls=None,
):
"""
Compute partial charges with RDKit, and assign
the new values to the partial_charges attribute.
.. warning :: This API is experimental and subject to change.
Parameters
----------
molecule : openff.toolkit.topology.Molecule
Molecule for which partial charges are to be computed
partial_charge_method : str, optional, default=None
The charge model to use. One of ['mmff94', 'gasteiger']. If None, 'mmff94' will be used.
* 'mmff94': Applies partial charges using the Merck Molecular Force Field
(MMFF). This method does not make use of conformers, and hence
``use_conformers`` and ``strict_n_conformers`` will not impact
the partial charges produced.
use_conformers : iterable of unit-wrapped numpy arrays, each with
shape (n_atoms, 3) and dimension of distance. Optional, default = None
Coordinates to use for partial charge calculation. If None, an appropriate number of
conformers will be generated.
strict_n_conformers : bool, default=False
Whether to raise an exception if an invalid number of conformers is provided for
the given charge method.
If this is False and an invalid number of conformers is found, a warning will be raised.
normalize_partial_charges : bool, default=True
Whether to offset partial charges so that they sum to the total formal charge of the molecule.
This is used to prevent accumulation of rounding errors when the partial charge generation method has
low precision.
_cls : class
Molecule constructor
Raises
------
ChargeMethodUnavailableError
if the requested charge method can not be handled by this toolkit
ChargeCalculationError
if the charge method is supported by this toolkit, but fails
"""
# TODO: Remove when superseded by toolkit update
import numpy as np
from rdkit.Chem import AllChem
if not partial_charge_method == "gasteiger":
return super().assign_partial_charges(
molecule,
partial_charge_method=partial_charge_method,
use_conformers=use_conformers,
strict_n_conformers=strict_n_conformers,
normalize_partial_charges=normalize_partial_charges,
_cls=_cls,
)
rdkit_molecule = self.to_rdkit(molecule)
AllChem.ComputeGasteigerCharges(rdkit_molecule)
charges = [
float(rdatom.GetProp("_GasteigerCharge"))
for rdatom in rdkit_molecule.GetAtoms()
]
charges = np.asarray(charges)
molecule.partial_charges = unit.Quantity(charges, unit.elementary_charge)
if normalize_partial_charges:
molecule._normalize_partial_charges()
[docs] def calculate_circular_fingerprint_similarity(
self,
molecule: "Molecule",
reference_molecule: "Molecule",
radius: int = 3,
nbits: int = 2048,
) -> float:
"""
Compute the similarity between two molecules using a fingerprinting method.
Uses a Morgan fingerprint with RDKit and a Circular fingerprint with OpenEye.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute the fingerprint for.
reference_molecule: openff.toolkit.topology.Molecule
The molecule to compute the fingerprint for.
radius: int, default 3
The radius of the fingerprint to use.
nbits: int, default 2048
The length of the fingerprint to use. Not used in RDKit.
Returns
-------
similarity: float
The Dice similarity between the two molecules.
"""
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
from rdkit.DataStructs import DiceSimilarity
rdmol1 = self.to_rdkit(molecule)
rdmol2 = self.to_rdkit(reference_molecule)
fp1 = GetMorganFingerprint(rdmol1, radius)
fp2 = GetMorganFingerprint(rdmol2, radius)
return DiceSimilarity(fp1, fp2)