Source code for openff.nagl.toolkits.openff

import contextlib
import copy
import functools
from typing import TYPE_CHECKING, Tuple, List, Union, Dict, NamedTuple, Any, Optional

import numpy as np

from openff.units import unit

from openff.utilities import requires_package
from openff.nagl.toolkits import NAGL_TOOLKIT_REGISTRY
from openff.utilities.exceptions import MissingOptionalDependencyError

if TYPE_CHECKING:
    from openff.toolkit.topology import Molecule
    from openff.nagl.utils._types import HybridizationType


[docs]def call_toolkit_function(function_name, toolkit_registry, *args, **kwargs): """ Call a function from a toolkit wrapper or toolkit registry. Parameters ---------- function: function The function to call. toolkit_registry: ToolkitWrapperBase or ToolkitRegistry The toolkit wrapper or registry to call the function from. *args: The positional arguments to pass to the function. **kwargs: The keyword arguments to pass to the function. """ from openff.nagl.toolkits.registry import NAGLToolkitRegistry from openff.nagl.toolkits._base import ( NAGLToolkitWrapperMeta, NAGLToolkitWrapperBase, ) from openff.toolkit.utils.exceptions import InvalidToolkitRegistryError if isinstance(toolkit_registry, NAGLToolkitWrapperMeta): toolkit_registry = toolkit_registry() if isinstance(toolkit_registry, NAGLToolkitWrapperBase): toolkit_function = getattr(toolkit_registry, function_name) return toolkit_function(*args, **kwargs) elif isinstance(toolkit_registry, NAGLToolkitRegistry): return toolkit_registry.call(function_name, *args, **kwargs) else: raise InvalidToolkitRegistryError( "toolkit_registry must be instance of OpenFF NAGL " "ToolkitWrapperBase or ToolkitRegistry. " f"Given: {toolkit_registry}" )
[docs]def toolkit_registry_function(function): @functools.wraps(function) def wrapper(*args, toolkit_registry=NAGL_TOOLKIT_REGISTRY, **kwargs): return call_toolkit_function( function.__name__, toolkit_registry, *args, **kwargs ) return wrapper
[docs]@contextlib.contextmanager @requires_package("openeye.oechem") def capture_oechem_warnings(): # pragma: no cover from openeye import oechem output_stream = oechem.oeosstream() oechem.OEThrow.SetOutputStream(output_stream) oechem.OEThrow.Clear() yield oechem.OEThrow.SetOutputStream(oechem.oeerr)
[docs]@contextlib.contextmanager def capture_toolkit_warnings(run: bool = True): # pragma: no cover """A convenience method to capture and discard any warning produced by external cheminformatics toolkits excluding the OpenFF toolkit. This should be used with extreme caution and is only really intended for use when processing tens of thousands of molecules at once.""" import logging import warnings if not run: yield return toolkit_logger = logging.getLogger("openff.toolkit") openff_logger_level = toolkit_logger.getEffectiveLevel() toolkit_logger.setLevel(logging.ERROR) try: with capture_oechem_warnings(): yield except MissingOptionalDependencyError: yield toolkit_logger.setLevel(openff_logger_level)
[docs]@contextlib.contextmanager @toolkit_registry_function def stream_molecules_to_file( file: str, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ): """ Stream molecules to an SDF file using a context manager. Parameters ---------- file: str The path to the SDF file to stream molecules to. toolkit_registry: The toolkit registry to use to write the molecules. Examples -------- >>> from openff.toolkit.topology import Molecule >>> from openff.toolkit.utils.openff import stream_molecules_to_file >>> molecule1 = Molecule.from_smiles("CCO") >>> molecule2 = Molecule.from_smiles("CCC") >>> with stream_molecules_to_file("molecules.sdf") as writer: ... writer(molecule1) ... writer(molecule2) """ pass
[docs]@toolkit_registry_function def get_molecule_hybridizations( molecule: "Molecule", toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> List["HybridizationType"]: """ Get the hybridization of each atom in a molecule. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to get the hybridizations of. toolkit_registry: The toolkit registry to use Returns ------- hybridizations: List[HybridizationType] The hybridization of each atom in the molecule. """ pass
[docs]@toolkit_registry_function def get_atoms_are_in_ring_size( molecule: "Molecule", ring_size: int, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> List[bool]: """ Determine whether each atom in a molecule is in a ring of a given size. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute ring perception for ring_size: int The size of the ring to check for. toolkit_registry: The toolkit registry to use Returns ------- in_ring_size: List[bool] """ pass
[docs]@toolkit_registry_function def get_bonds_are_in_ring_size( molecule: "Molecule", ring_size: int, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> List[bool]: """ Determine whether each bond in a molecule is in a ring of a given size. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute ring perception for ring_size: int The size of the ring to check for. toolkit_registry: The toolkit registry to use Returns ------- in_ring_size: List[bool] Bonds are in the same order as the molecule's ``bonds`` attribute. """ pass
[docs]@toolkit_registry_function def get_best_rmsd( molecule: "Molecule", reference_conformer: Union[np.ndarray, unit.Quantity], target_conformer: Union[np.ndarray, unit.Quantity], toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> unit.Quantity: """ Compute the lowest all-atom RMSD between a reference and target conformer, allowing for symmetry-equivalent atoms to be permuted. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute the RMSD for reference_conformer: np.ndarray or openff.units.unit.Quantity The reference conformer to compare to the target conformer. If a numpy array, it is assumed to be in units of angstrom. target_conformer: np.ndarray or openff.units.unit.Quantity The target conformer to compare to the reference conformer. If a numpy array, it is assumed to be in units of angstrom. toolkit_registry: The toolkit registry to use Returns ------- rmsd: unit.Quantity Examples -------- >>> from openff.units import unit >>> from openff.toolkit.topology import Molecule >>> from openff.toolkit.utils.openff import get_best_rmsd >>> molecule = Molecule.from_smiles("CCCCO") >>> molecule.generate_conformers(n_conformers=2) >>> rmsd = get_best_rmsd(molecule, molecule.conformers[0], molecule.conformers[1]) >>> print(f"RMSD in angstrom: {rmsd.m_as(unit.angstrom)}") """
[docs]@toolkit_registry_function def calculate_circular_fingerprint_similarity( molecule: "Molecule", reference_molecule: "Molecule", radius: int = 3, nbits: int = 2048, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> float: """ Compute the similarity between two molecules using a fingerprinting method. Uses a Morgan fingerprint with RDKit and a Circular fingerprint with OpenEye. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute the fingerprint for. reference_molecule: openff.toolkit.topology.Molecule The molecule to compute the fingerprint for. radius: int, default 3 The radius of the fingerprint to use. nbits: int, default 2048 The length of the fingerprint to use. Not used in RDKit. toolkit_registry: The toolkit registry to use Returns ------- similarity: float The Dice similarity between the two molecules. """
[docs]def is_conformer_identical( molecule: "Molecule", reference_conformer: Union[np.ndarray, unit.Quantity], target_conformer: Union[np.ndarray, unit.Quantity], atol: float = 1.0e-3, ) -> bool: """ Determine if two conformers are identical with an RMSD tolerance, allowing for symmetry-equivalent atoms to be permuted. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to compute the RMSD for reference_conformer: np.ndarray or openff.units.unit.Quantity The reference conformer to compare to the target conformer. If a numpy array, it is assumed to be in units of angstrom. target_conformer: np.ndarray or openff.units.unit.Quantity The target conformer to compare to the reference conformer. If a numpy array, it is assumed to be in units of angstrom. atol: float, default=1.0e-3 The absolute tolerance to use when comparing the RMSD. This is given in angstrom. Returns ------- is_identical: bool """ rmsd = get_best_rmsd(molecule, reference_conformer, target_conformer) return rmsd.m_as(unit.angstrom) < atol
[docs]def normalize_molecule( molecule: "Molecule", max_iter: int = 200, inplace: bool = False, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> "Molecule": """ Normalize the bond orders and charges of a molecule by applying a series of transformations to it. Parameters ---------- molecule: openff.toolkit.topology.Molecule The molecule to normalize. max_iter: int, default=200 The maximum number of iterations to perform for each transformation. This parameter is only used with the RDKit ToolkitWrapper, as the OpenEyeToolkitWrapper applies each normalization reaction exhaustively. inplace: bool, default=False If the molecule should be normalized in place or a new molecule should be returned. If a new molecule is returned, atoms remain ordered like the original molecule. toolkit_registry: openff.toolkit.utils.toolkits.ToolkitRegistry or lopenff.toolkit.utils.toolkits.ToolkitWrapper, default=GLOBAL_TOOLKIT_REGISTRY :class:`ToolkitRegistry` or :class:`ToolkitWrapper` to use to perform normalization reactions. Returns ------- normalized_molecule: openff.toolkit.topology.Molecule """ # normalizations from RDKit's Code/GraphMol/MolStandardize/TransformCatalog/normalizations.in normalizations = ( "[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1](-[*-1:2])=[*:3]", # Nitro to N+(O-)=O "[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])", # Sulfone to S(=O)(=O) "[nH0+0:1]=[OH0+0:2]>>[n+:1]-[O-:2]", # Pyridine oxide to n+O- "[*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]", # Azide to N=N+=N- "[*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]", # Diazo/azo to =N+=N- "[!O:1][S+0;X3:2](=[O:3])[!O:4]>>[*:1][S+1:2]([O-:3])[*:4]", # Sulfoxide to -S+(O-)- "[O,S,Se,Te;-1:1][P+;D4:2][O,S,Se,Te;-1:3]>>[*+0:1]=[P+0;D5:2][*-1:3]", # Phosphate to P(O-)=O "[C,S&!$([S+]-[O-]);X3+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]", # C/S+N to C/S=N+ "[P;X4+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]", # P+N to P=N+ # Normalize hydrazine-diazonium "[CX4:1][NX3H:2]-[NX3H:3][CX4:4][NX2+:5]#[NX1:6]>>[CX4:1][NH0:2]=[NH+:3][C:4][N+0:5]=[NH:6]", # Recombine 1,3-separated charges "[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[N,P,As,Sb,O,S,Se,Te;+1:3]>>[*-0:1]=[*:2]-[*+0:3]", "[n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3]", # Recombine 1,3-separated charges "[N,O,P,S;-1:1]-[a:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3]", # Recombine 1,3-separated charges # Recombine 1,5-separated charges "[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,P,As,Sb,O,S,Se,Te;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5]", # noqa: E501 # Recombine 1,5-separated charges "[n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5]", # Recombine 1,5-separated charges "[N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5]", "[N,O;+0!H0:1]-[A:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]", # Normalize 1,3 conjugated cation "[n;+0!H0:1]:[c:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3]", # Normalize 1,3 conjugated cation # Normalize 1,5 conjugated cation "[N,O;+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*[O-]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]", # Normalize 1,5 conjugated cation "[n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*[O-]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5]", "[F,Cl,Br,I,At;-1:1]=[O:2]>>[*-0:1]-[O-:2]", # Charge normalization "[N,P,As,Sb;-1:1]=[C+;v3:2]>>[*+0:1]#[C+0:2]", # Charge recombination ) molecule_ = type(molecule)(molecule) molecule_._conformers = None normalized = call_toolkit_function( "_run_normalization_reactions", molecule=molecule_, normalization_reactions=normalizations, max_iter=max_iter, toolkit_registry=toolkit_registry, ) # type: ignore if not inplace: molecule = copy.deepcopy(molecule) for self_atom, norm_atom in zip(molecule.atoms, normalized.atoms): self_atom.formal_charge = norm_atom.formal_charge for norm_bond in normalized.bonds: self_bond = molecule.get_bond_between( norm_bond.atom1_index, norm_bond.atom2_index ) self_bond._bond_order = norm_bond.bond_order return molecule
[docs]def enumerate_stereoisomers( molecule: "Molecule", undefined_only: bool = True, max_isomers: int = 20, rationalize: bool = True, include_self: bool = False, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ) -> List["Molecule"]: """Enumerate stereoisomers for a molecule. Parameters ---------- molecule The molecule to enumerate stereoisomers for. undefined_only: bool, optional, default=True If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry max_isomers: int optional, default=20 The maximum amount of molecules that should be returned rationalize If we should try to build and rationalise the molecule to ensure it can exist Returns ------- A list of stereoisomers. """ from openff.toolkit.topology import Molecule stereoisomers = molecule.enumerate_stereoisomers( undefined_only=undefined_only, max_isomers=max_isomers, rationalise=rationalize, toolkit_registry=toolkit_registry, ) if include_self: if not any(molecule == isomol for isomol in stereoisomers): stereoisomers.append(copy.deepcopy(molecule)) return stereoisomers
[docs]def guess_file_format(file: str) -> str: """ Guess the file format of a file from the extension Parameters ---------- file: str The file to guess the format of Returns ------- str The guessed file format """ file = str(file) if file.endswith("sdf"): file_format = "sdf" elif file.endswith("sdf.gz"): file_format = "sdf.gz" elif file.endswith("smi") or file.endswith("smiles"): file_format = "smiles" else: raise ValueError(f"Unknown file format for file: {file}") return file_format
[docs]@toolkit_registry_function def stream_molecules_from_sdf_file( file: str, explicit_hydrogens: bool = True, as_smiles: bool = False, mapped_smiles: bool = False, include_sdf_data: bool = True, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ): """Stream molecules from an SDF file. Parameters ---------- file The path to the SDF file. explicit_hydrogens Leave explicit hydrogens in the molecules. as_smiles If True, return the molecules as SMILES strings. mapped_smiles If True, return mapped smiles with atom indices include_sdf_data If SDF tag data (e.g. charges) should be included in the molecule properties toolkit_registry The toolkit registry to use. Returns ------- A generator of openff.toolkit.topology.Molecule objects or SMILES strings """ pass
[docs]def validate_smiles(smiles: str, toolkit_registry=NAGL_TOOLKIT_REGISTRY): from openff.toolkit.topology import Molecule offmol = Molecule.from_smiles(smiles, toolkit_registry=toolkit_registry) return offmol.to_smiles( mapped=False, isomeric=True, toolkit_registry=toolkit_registry )
[docs]def stream_molecules_from_smiles_file( file: str, as_smiles: bool = False, mapped_smiles: bool = False, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ): """Stream molecules from a SMILES file. Parameters ---------- file The path to the SMILES file. as_smiles If True, return the molecules as SMILES strings. mapped_smiles If True, return mapped smiles with atom indices validate_smiles If True, validate SMILES by converting to and from OpenFF Molecule. If False, SMILES are assumed to be valid, and `mapped_smiles` is ignored. toolkit_registry The toolkit registry to use. Returns ------- A generator of openff.toolkit.topology.Molecule objects or SMILES strings """ from openff.toolkit.topology.molecule import SmilesParsingError from openff.toolkit.topology import Molecule with open(file, "r") as f: smiles = [x.strip() for x in f.readlines()] for line in smiles: for field in line.split(): try: offmol = Molecule.from_mapped_smiles( field, toolkit_registry=toolkit_registry ) except (ValueError, SmilesParsingError): offmol = Molecule.from_smiles(field, allow_undefined_stereo=True) if as_smiles: offmol = offmol.to_smiles( mapped=mapped_smiles, toolkit_registry=toolkit_registry ) yield offmol
[docs]def stream_molecules_from_file( file: str, file_format: str = None, explicit_hydrogens: bool = True, as_smiles: bool = False, mapped_smiles: bool = False, include_sdf_data: bool = True, toolkit_registry=NAGL_TOOLKIT_REGISTRY, ): """Stream molecules from a file. Parameters ---------- file: str The path to the file. file_format: str, optional The file format. If not provided, the format will be guessed from the file extension. explicit_hydrogens: bool, optional Keep explicit hydrogens if molecule are output as SMILES. as_smiles: bool, optional If True, return the molecules as SMILES strings. mapped_smiles: bool, optional If True, return mapped smiles with atom indices include_sdf_data: bool, optional If SDF tag data (e.g. charges) should be included in the molecule properties toolkit_registry The toolkit registry to use. Returns ------- molecules: Generator[openff.toolkit.topology.Molecule or str] A generator of openff.toolkit.topology.Molecule objects or SMILES strings """ if file_format is None: file_format = guess_file_format(file) if file_format in ("sdf", "sdf.gz"): func = functools.partial( stream_molecules_from_sdf_file, explicit_hydrogens=explicit_hydrogens, as_smiles=as_smiles, mapped_smiles=mapped_smiles, include_sdf_data=include_sdf_data, toolkit_registry=toolkit_registry, ) elif file_format == "smiles": func = functools.partial( stream_molecules_from_smiles_file, as_smiles=as_smiles, mapped_smiles=mapped_smiles, toolkit_registry=toolkit_registry, ) for mol in func(file): yield mol
[docs]def smiles_to_inchi_key(smiles: str) -> str: """Convert a SMILES string to an InChI key. Parameters ---------- smiles The SMILES string to convert. Returns ------- inchi_key The InChI key corresponding to the SMILES string. """ from openff.toolkit.topology import Molecule offmol = Molecule.from_smiles(smiles, allow_undefined_stereo=True) return offmol.to_inchikey(fixed_hydrogens=True)
[docs]def get_openff_molecule_bond_indices(molecule: "Molecule") -> List[Tuple[int, int]]: """Get the atom indices of each bond in an OpenFF molecule. Parameters ---------- molecule The molecule to get the bond indices for. Returns ------- bond_indices The atom indices of each bond in the molecule. The indices are sorted in ascending order for each bond. The bonds are ordered by the molecule bond order. """ return [ tuple(sorted((bond.atom1_index, bond.atom2_index))) for bond in molecule.bonds ]
[docs]def map_indexed_smiles(reference_smiles: str, target_smiles: str) -> Dict[int, int]: """ Map the indices of the target SMILES to the indices of the reference SMILES. Parameters ---------- reference_smiles The reference SMILES string, mapped with atom indices. target_smiles The target SMILES string, mapped with atom indices. Returns ------- atom_map A dictionary in the form of {reference_atom_index: target_atom_index} """ from openff.toolkit.topology import Molecule reference_molecule = Molecule.from_mapped_smiles( reference_smiles, allow_undefined_stereo=True ) target_molecule = Molecule.from_mapped_smiles( target_smiles, allow_undefined_stereo=True ) _, atom_map = Molecule.are_isomorphic( reference_molecule, target_molecule, return_atom_map=True, ) return atom_map
[docs]def molecule_from_networkx(graph): from openff.toolkit.topology import Molecule molecule = Molecule() for _, info in graph.nodes(data=True): molecule.add_atom( atomic_number=info["atomic_number"], formal_charge=info["formal_charge"], is_aromatic=info["is_aromatic"], stereochemistry=info.get("stereochemistry", None), ) for u, v, info in graph.edges(data=True): molecule.add_bond( u, v, bond_order=info["bond_order"], is_aromatic=info["is_aromatic"], stereochemistry=info.get("stereochemistry", None), ) return molecule
def _molecule_to_dict(molecule: "Molecule") -> dict[str, dict]: """ Convert an OpenFF molecule to a graph representation. Parameters ---------- molecule The molecule to convert. Returns ------- graph: dict[str, dict] This is a dictionary with the keys "atoms" and "bonds". The "atoms" key maps to a dictionary of atom indices to atom information. Each atom information dictionary contains the following keys: atomic_number, formal_charge, is_aromatic, stereochemistry. The "bonds" key maps to a dictionary of bond indices as a tuple of integers. The bond indices are sorted so the lowest value is first. Each bond indices tuple is mapped to bond information. Each bond information dictionary contains the following keys: bond_order, is_aromatic, stereochemistry. """ atoms = {} for i, atom in enumerate(molecule.atoms): atoms[i] = { "atomic_number": atom.atomic_number, "formal_charge": atom.formal_charge, "is_aromatic": atom.is_aromatic, "stereochemistry": atom.stereochemistry, } bonds = {} for bond in molecule.bonds: indices = tuple(sorted((bond.atom1_index, bond.atom2_index))) bonds[indices] = { "bond_order": bond.bond_order, "is_aromatic": bond.is_aromatic, "stereochemistry": bond.stereochemistry, } return {"atoms": atoms, "bonds": bonds} def _molecule_from_dict(graph: dict[str, dict]) -> "Molecule": """ Convert a graph representation to an OpenFF molecule. Parameters ---------- graph The graph representation to convert. This is a dictionary with the keys "atoms" and "bonds". The "atoms" key maps to a dictionary of atom indices to atom information. Each atom information dictionary contains the following keys: atomic_number, formal_charge, is_aromatic, stereochemistry. The "bonds" key maps to a dictionary of bond indices as a tuple of integers. The bond indices are sorted so the lowest value is first. Each bond indices tuple is mapped to bond information. Each bond information dictionary contains the following keys: bond_order, is_aromatic, stereochemistry. Returns ------- molecule The OpenFF molecule representation. """ from openff.toolkit.topology import Molecule molecule = Molecule() for atom_index in sorted(graph["atoms"]): atom = graph["atoms"][atom_index] molecule.add_atom( atomic_number=atom["atomic_number"], formal_charge=atom["formal_charge"], is_aromatic=atom["is_aromatic"], stereochemistry=atom.get("stereochemistry", None), ) for (u, v), info in graph["bonds"].items(): molecule.add_bond( u, v, bond_order=info["bond_order"], is_aromatic=info["is_aromatic"], stereochemistry=info.get("stereochemistry", None), ) return molecule