import contextlib
import copy
import functools
from typing import TYPE_CHECKING, Tuple, List, Union, Dict, NamedTuple, Any, Optional
import numpy as np
from openff.units import unit
from openff.utilities import requires_package
from openff.nagl.toolkits import NAGL_TOOLKIT_REGISTRY
from openff.utilities.exceptions import MissingOptionalDependencyError
if TYPE_CHECKING:
from openff.toolkit.topology import Molecule
from openff.nagl.utils._types import HybridizationType
[docs]@contextlib.contextmanager
@requires_package("openeye.oechem")
def capture_oechem_warnings(): # pragma: no cover
from openeye import oechem
output_stream = oechem.oeosstream()
oechem.OEThrow.SetOutputStream(output_stream)
oechem.OEThrow.Clear()
yield
oechem.OEThrow.SetOutputStream(oechem.oeerr)
[docs]@contextlib.contextmanager
@toolkit_registry_function
def stream_molecules_to_file(
file: str,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
):
"""
Stream molecules to an SDF file using a context manager.
Parameters
----------
file: str
The path to the SDF file to stream molecules to.
toolkit_registry:
The toolkit registry to use to write the molecules.
Examples
--------
>>> from openff.toolkit.topology import Molecule
>>> from openff.toolkit.utils.openff import stream_molecules_to_file
>>> molecule1 = Molecule.from_smiles("CCO")
>>> molecule2 = Molecule.from_smiles("CCC")
>>> with stream_molecules_to_file("molecules.sdf") as writer:
... writer(molecule1)
... writer(molecule2)
"""
pass
[docs]@toolkit_registry_function
def get_molecule_hybridizations(
molecule: "Molecule",
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> List["HybridizationType"]:
"""
Get the hybridization of each atom in a molecule.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to get the hybridizations of.
toolkit_registry:
The toolkit registry to use
Returns
-------
hybridizations: List[HybridizationType]
The hybridization of each atom in the molecule.
"""
pass
[docs]@toolkit_registry_function
def get_atoms_are_in_ring_size(
molecule: "Molecule",
ring_size: int,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> List[bool]:
"""
Determine whether each atom in a molecule is in a ring of a given size.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute ring perception for
ring_size: int
The size of the ring to check for.
toolkit_registry:
The toolkit registry to use
Returns
-------
in_ring_size: List[bool]
"""
pass
[docs]@toolkit_registry_function
def get_bonds_are_in_ring_size(
molecule: "Molecule",
ring_size: int,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> List[bool]:
"""
Determine whether each bond in a molecule is in a ring of a given size.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute ring perception for
ring_size: int
The size of the ring to check for.
toolkit_registry:
The toolkit registry to use
Returns
-------
in_ring_size: List[bool]
Bonds are in the same order as the molecule's ``bonds`` attribute.
"""
pass
[docs]@toolkit_registry_function
def get_best_rmsd(
molecule: "Molecule",
reference_conformer: Union[np.ndarray, unit.Quantity],
target_conformer: Union[np.ndarray, unit.Quantity],
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> unit.Quantity:
"""
Compute the lowest all-atom RMSD between a reference and target conformer,
allowing for symmetry-equivalent atoms to be permuted.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute the RMSD for
reference_conformer: np.ndarray or openff.units.unit.Quantity
The reference conformer to compare to the target conformer.
If a numpy array, it is assumed to be in units of angstrom.
target_conformer: np.ndarray or openff.units.unit.Quantity
The target conformer to compare to the reference conformer.
If a numpy array, it is assumed to be in units of angstrom.
toolkit_registry:
The toolkit registry to use
Returns
-------
rmsd: unit.Quantity
Examples
--------
>>> from openff.units import unit
>>> from openff.toolkit.topology import Molecule
>>> from openff.toolkit.utils.openff import get_best_rmsd
>>> molecule = Molecule.from_smiles("CCCCO")
>>> molecule.generate_conformers(n_conformers=2)
>>> rmsd = get_best_rmsd(molecule, molecule.conformers[0], molecule.conformers[1])
>>> print(f"RMSD in angstrom: {rmsd.m_as(unit.angstrom)}")
"""
[docs]@toolkit_registry_function
def calculate_circular_fingerprint_similarity(
molecule: "Molecule",
reference_molecule: "Molecule",
radius: int = 3,
nbits: int = 2048,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> float:
"""
Compute the similarity between two molecules using a fingerprinting method.
Uses a Morgan fingerprint with RDKit and a Circular fingerprint with OpenEye.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to compute the fingerprint for.
reference_molecule: openff.toolkit.topology.Molecule
The molecule to compute the fingerprint for.
radius: int, default 3
The radius of the fingerprint to use.
nbits: int, default 2048
The length of the fingerprint to use. Not used in RDKit.
toolkit_registry:
The toolkit registry to use
Returns
-------
similarity: float
The Dice similarity between the two molecules.
"""
[docs]def normalize_molecule(
molecule: "Molecule",
max_iter: int = 200,
inplace: bool = False,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> "Molecule":
"""
Normalize the bond orders and charges of a molecule by applying a series of transformations to it.
Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to normalize.
max_iter: int, default=200
The maximum number of iterations to perform for each transformation.
This parameter is only used with the RDKit ToolkitWrapper,
as the OpenEyeToolkitWrapper applies each normalization reaction exhaustively.
inplace: bool, default=False
If the molecule should be normalized in place or a new molecule should be returned.
If a new molecule is returned, atoms remain ordered like the original molecule.
toolkit_registry: openff.toolkit.utils.toolkits.ToolkitRegistry or
lopenff.toolkit.utils.toolkits.ToolkitWrapper, default=GLOBAL_TOOLKIT_REGISTRY
:class:`ToolkitRegistry` or :class:`ToolkitWrapper` to use to perform normalization reactions.
Returns
-------
normalized_molecule: openff.toolkit.topology.Molecule
"""
# normalizations from RDKit's Code/GraphMol/MolStandardize/TransformCatalog/normalizations.in
normalizations = (
"[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1](-[*-1:2])=[*:3]", # Nitro to N+(O-)=O
"[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])", # Sulfone to S(=O)(=O)
"[nH0+0:1]=[OH0+0:2]>>[n+:1]-[O-:2]", # Pyridine oxide to n+O-
"[*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]", # Azide to N=N+=N-
"[*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]", # Diazo/azo to =N+=N-
"[!O:1][S+0;X3:2](=[O:3])[!O:4]>>[*:1][S+1:2]([O-:3])[*:4]", # Sulfoxide to -S+(O-)-
"[O,S,Se,Te;-1:1][P+;D4:2][O,S,Se,Te;-1:3]>>[*+0:1]=[P+0;D5:2][*-1:3]", # Phosphate to P(O-)=O
"[C,S&!$([S+]-[O-]);X3+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]", # C/S+N to C/S=N+
"[P;X4+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]", # P+N to P=N+
# Normalize hydrazine-diazonium
"[CX4:1][NX3H:2]-[NX3H:3][CX4:4][NX2+:5]#[NX1:6]>>[CX4:1][NH0:2]=[NH+:3][C:4][N+0:5]=[NH:6]",
# Recombine 1,3-separated charges
"[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[N,P,As,Sb,O,S,Se,Te;+1:3]>>[*-0:1]=[*:2]-[*+0:3]",
"[n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3]", # Recombine 1,3-separated charges
"[N,O,P,S;-1:1]-[a:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3]", # Recombine 1,3-separated charges
# Recombine 1,5-separated charges
"[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,P,As,Sb,O,S,Se,Te;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5]", # noqa: E501
# Recombine 1,5-separated charges
"[n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5]",
# Recombine 1,5-separated charges
"[N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5]",
"[N,O;+0!H0:1]-[A:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]", # Normalize 1,3 conjugated cation
"[n;+0!H0:1]:[c:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3]", # Normalize 1,3 conjugated cation
# Normalize 1,5 conjugated cation
"[N,O;+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*[O-]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]",
# Normalize 1,5 conjugated cation
"[n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*[O-]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5]",
"[F,Cl,Br,I,At;-1:1]=[O:2]>>[*-0:1]-[O-:2]", # Charge normalization
"[N,P,As,Sb;-1:1]=[C+;v3:2]>>[*+0:1]#[C+0:2]", # Charge recombination
)
molecule_ = type(molecule)(molecule)
molecule_._conformers = None
normalized = call_toolkit_function(
"_run_normalization_reactions",
molecule=molecule_,
normalization_reactions=normalizations,
max_iter=max_iter,
toolkit_registry=toolkit_registry,
) # type: ignore
if not inplace:
molecule = copy.deepcopy(molecule)
for self_atom, norm_atom in zip(molecule.atoms, normalized.atoms):
self_atom.formal_charge = norm_atom.formal_charge
for norm_bond in normalized.bonds:
self_bond = molecule.get_bond_between(
norm_bond.atom1_index, norm_bond.atom2_index
)
self_bond._bond_order = norm_bond.bond_order
return molecule
[docs]def enumerate_stereoisomers(
molecule: "Molecule",
undefined_only: bool = True,
max_isomers: int = 20,
rationalize: bool = True,
include_self: bool = False,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
) -> List["Molecule"]:
"""Enumerate stereoisomers for a molecule.
Parameters
----------
molecule
The molecule to enumerate stereoisomers for.
undefined_only: bool, optional, default=True
If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry
max_isomers: int optional, default=20
The maximum amount of molecules that should be returned
rationalize
If we should try to build and rationalise the molecule to ensure it can exist
Returns
-------
A list of stereoisomers.
"""
from openff.toolkit.topology import Molecule
stereoisomers = molecule.enumerate_stereoisomers(
undefined_only=undefined_only,
max_isomers=max_isomers,
rationalise=rationalize,
toolkit_registry=toolkit_registry,
)
if include_self:
if not any(molecule == isomol for isomol in stereoisomers):
stereoisomers.append(copy.deepcopy(molecule))
return stereoisomers
[docs]@toolkit_registry_function
def stream_molecules_from_sdf_file(
file: str,
explicit_hydrogens: bool = True,
as_smiles: bool = False,
mapped_smiles: bool = False,
include_sdf_data: bool = True,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
):
"""Stream molecules from an SDF file.
Parameters
----------
file
The path to the SDF file.
explicit_hydrogens
Leave explicit hydrogens in the molecules.
as_smiles
If True, return the molecules as SMILES strings.
mapped_smiles
If True, return mapped smiles with atom indices
include_sdf_data
If SDF tag data (e.g. charges) should be included in the molecule properties
toolkit_registry
The toolkit registry to use.
Returns
-------
A generator of openff.toolkit.topology.Molecule objects or SMILES strings
"""
pass
[docs]def validate_smiles(smiles: str, toolkit_registry=NAGL_TOOLKIT_REGISTRY):
from openff.toolkit.topology import Molecule
offmol = Molecule.from_smiles(smiles, toolkit_registry=toolkit_registry)
return offmol.to_smiles(
mapped=False, isomeric=True, toolkit_registry=toolkit_registry
)
[docs]def stream_molecules_from_smiles_file(
file: str,
as_smiles: bool = False,
mapped_smiles: bool = False,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
):
"""Stream molecules from a SMILES file.
Parameters
----------
file
The path to the SMILES file.
as_smiles
If True, return the molecules as SMILES strings.
mapped_smiles
If True, return mapped smiles with atom indices
validate_smiles
If True, validate SMILES by converting to and from OpenFF Molecule.
If False, SMILES are assumed to be valid, and `mapped_smiles`
is ignored.
toolkit_registry
The toolkit registry to use.
Returns
-------
A generator of openff.toolkit.topology.Molecule objects or SMILES strings
"""
from openff.toolkit.topology.molecule import SmilesParsingError
from openff.toolkit.topology import Molecule
with open(file, "r") as f:
smiles = [x.strip() for x in f.readlines()]
for line in smiles:
for field in line.split():
try:
offmol = Molecule.from_mapped_smiles(
field, toolkit_registry=toolkit_registry
)
except (ValueError, SmilesParsingError):
offmol = Molecule.from_smiles(field, allow_undefined_stereo=True)
if as_smiles:
offmol = offmol.to_smiles(
mapped=mapped_smiles, toolkit_registry=toolkit_registry
)
yield offmol
[docs]def stream_molecules_from_file(
file: str,
file_format: str = None,
explicit_hydrogens: bool = True,
as_smiles: bool = False,
mapped_smiles: bool = False,
include_sdf_data: bool = True,
toolkit_registry=NAGL_TOOLKIT_REGISTRY,
):
"""Stream molecules from a file.
Parameters
----------
file: str
The path to the file.
file_format: str, optional
The file format. If not provided, the format will be guessed from the file extension.
explicit_hydrogens: bool, optional
Keep explicit hydrogens if molecule are output as SMILES.
as_smiles: bool, optional
If True, return the molecules as SMILES strings.
mapped_smiles: bool, optional
If True, return mapped smiles with atom indices
include_sdf_data: bool, optional
If SDF tag data (e.g. charges) should be included in the molecule properties
toolkit_registry
The toolkit registry to use.
Returns
-------
molecules: Generator[openff.toolkit.topology.Molecule or str]
A generator of openff.toolkit.topology.Molecule objects or SMILES strings
"""
if file_format is None:
file_format = guess_file_format(file)
if file_format in ("sdf", "sdf.gz"):
func = functools.partial(
stream_molecules_from_sdf_file,
explicit_hydrogens=explicit_hydrogens,
as_smiles=as_smiles,
mapped_smiles=mapped_smiles,
include_sdf_data=include_sdf_data,
toolkit_registry=toolkit_registry,
)
elif file_format == "smiles":
func = functools.partial(
stream_molecules_from_smiles_file,
as_smiles=as_smiles,
mapped_smiles=mapped_smiles,
toolkit_registry=toolkit_registry,
)
for mol in func(file):
yield mol
[docs]def smiles_to_inchi_key(smiles: str) -> str:
"""Convert a SMILES string to an InChI key.
Parameters
----------
smiles
The SMILES string to convert.
Returns
-------
inchi_key
The InChI key corresponding to the SMILES string.
"""
from openff.toolkit.topology import Molecule
offmol = Molecule.from_smiles(smiles, allow_undefined_stereo=True)
return offmol.to_inchikey(fixed_hydrogens=True)
[docs]def get_openff_molecule_bond_indices(molecule: "Molecule") -> List[Tuple[int, int]]:
"""Get the atom indices of each bond in an OpenFF molecule.
Parameters
----------
molecule
The molecule to get the bond indices for.
Returns
-------
bond_indices
The atom indices of each bond in the molecule.
The indices are sorted in ascending order for each bond.
The bonds are ordered by the molecule bond order.
"""
return [
tuple(sorted((bond.atom1_index, bond.atom2_index))) for bond in molecule.bonds
]
[docs]def map_indexed_smiles(reference_smiles: str, target_smiles: str) -> Dict[int, int]:
"""
Map the indices of the target SMILES to the indices of the reference SMILES.
Parameters
----------
reference_smiles
The reference SMILES string, mapped with atom indices.
target_smiles
The target SMILES string, mapped with atom indices.
Returns
-------
atom_map
A dictionary in the form of {reference_atom_index: target_atom_index}
"""
from openff.toolkit.topology import Molecule
reference_molecule = Molecule.from_mapped_smiles(
reference_smiles, allow_undefined_stereo=True
)
target_molecule = Molecule.from_mapped_smiles(
target_smiles, allow_undefined_stereo=True
)
_, atom_map = Molecule.are_isomorphic(
reference_molecule,
target_molecule,
return_atom_map=True,
)
return atom_map
[docs]def molecule_from_networkx(graph):
from openff.toolkit.topology import Molecule
molecule = Molecule()
for _, info in graph.nodes(data=True):
molecule.add_atom(
atomic_number=info["atomic_number"],
formal_charge=info["formal_charge"],
is_aromatic=info["is_aromatic"],
stereochemistry=info.get("stereochemistry", None),
)
for u, v, info in graph.edges(data=True):
molecule.add_bond(
u,
v,
bond_order=info["bond_order"],
is_aromatic=info["is_aromatic"],
stereochemistry=info.get("stereochemistry", None),
)
return molecule
def _molecule_to_dict(molecule: "Molecule") -> dict[str, dict]:
"""
Convert an OpenFF molecule to a graph representation.
Parameters
----------
molecule
The molecule to convert.
Returns
-------
graph: dict[str, dict]
This is a dictionary with the keys "atoms" and "bonds".
The "atoms" key maps to a dictionary of atom indices to atom information.
Each atom information dictionary contains the following keys:
atomic_number, formal_charge, is_aromatic, stereochemistry.
The "bonds" key maps to a dictionary of bond indices as a tuple of integers.
The bond indices are sorted so the lowest value is first.
Each bond indices tuple is mapped to bond information.
Each bond information dictionary contains the following keys:
bond_order, is_aromatic, stereochemistry.
"""
atoms = {}
for i, atom in enumerate(molecule.atoms):
atoms[i] = {
"atomic_number": atom.atomic_number,
"formal_charge": atom.formal_charge,
"is_aromatic": atom.is_aromatic,
"stereochemistry": atom.stereochemistry,
}
bonds = {}
for bond in molecule.bonds:
indices = tuple(sorted((bond.atom1_index, bond.atom2_index)))
bonds[indices] = {
"bond_order": bond.bond_order,
"is_aromatic": bond.is_aromatic,
"stereochemistry": bond.stereochemistry,
}
return {"atoms": atoms, "bonds": bonds}
def _molecule_from_dict(graph: dict[str, dict]) -> "Molecule":
"""
Convert a graph representation to an OpenFF molecule.
Parameters
----------
graph
The graph representation to convert.
This is a dictionary with the keys "atoms" and "bonds".
The "atoms" key maps to a dictionary of atom indices to atom information.
Each atom information dictionary contains the following keys:
atomic_number, formal_charge, is_aromatic, stereochemistry.
The "bonds" key maps to a dictionary of bond indices as a tuple of integers.
The bond indices are sorted so the lowest value is first.
Each bond indices tuple is mapped to bond information.
Each bond information dictionary contains the following keys:
bond_order, is_aromatic, stereochemistry.
Returns
-------
molecule
The OpenFF molecule representation.
"""
from openff.toolkit.topology import Molecule
molecule = Molecule()
for atom_index in sorted(graph["atoms"]):
atom = graph["atoms"][atom_index]
molecule.add_atom(
atomic_number=atom["atomic_number"],
formal_charge=atom["formal_charge"],
is_aromatic=atom["is_aromatic"],
stereochemistry=atom.get("stereochemistry", None),
)
for (u, v), info in graph["bonds"].items():
molecule.add_bond(
u,
v,
bond_order=info["bond_order"],
is_aromatic=info["is_aromatic"],
stereochemistry=info.get("stereochemistry", None),
)
return molecule