Download Notebook View in GitHub Open in Google Colab
Using QCArchive with the OpenFF Toolkit
Here we show how to create OpenFF molecules safely from data in the QCArchive using the cmiles entries, specifically we want to use the canonical_isomeric_explicit_hydrogen_mapped_smiles data which is metadata stored at the entry-level of a collection.
First load up the client you wish to connect to, in this case, we use the public instance.
import qcportal as ptl
from openff.toolkit import Molecule
client = ptl.FractalClient()
# list the collections available
client.list_collections()
tagline | ||
---|---|---|
collection | name | |
Dataset | A Benchmark Data Set for Hydrogen Combustion | A Benchmark Data Set for Hydrogen Combustion |
ANI-1 | 22 million off-equilibrium conformations and e... | |
ANI-1ccx | Coupled cluster properties for molecules | |
ANI-1x | Density functional theory properties for molec... | |
COMP6 ANI-MD | Benchmark containing MD trajectories from the ... | |
... | ... | ... |
TorsionDriveDataset | OpenFF-benchmark-ligand-fragments-v2.0 | Ligand fragments from the JACS benchmark systems. |
Pfizer Discrepancy Torsion Dataset 1 | None | |
SMIRNOFF Coverage Torsion Set 1 | None | |
SiliconTX Torsion Benchmark Set 1 | None | |
TorsionDrive Paper | None |
207 rows × 1 columns
Now let us grab a molecule from an optimization dataset
ds = client.get_collection(
"OptimizationDataset", "Kinase Inhibitors: WBO Distributions"
)
Take the first entry from the collection.
entry = ds.get_entry(ds.df.index[0])
We can view the entry in detail by looking at the dictionary representation.
entry.dict()
{'name': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C-0',
'initial_molecule': '9589274',
'additional_keywords': {},
'attributes': {'canonical_explicit_hydrogen_smiles': '[H]c1c(c(c(nc1[H])[H])c2c(c(nc(n2)N([H])c3c(c(c(c(c3C([H])([H])[H])[H])[H])N([H])C(=O)c4c(c(c(c(c4[H])[H])C([H])([H])N5C(C(N(C(C5([H])[H])([H])[H])C([H])([H])[H])([H])[H])([H])[H])[H])[H])[H])[H])[H])[H]',
'canonical_isomeric_explicit_hydrogen_mapped_smiles': '[H:38][c:1]1[c:2]([c:14]([c:13]([n:30][c:11]1[H:48])[H:50])[c:20]2[c:9]([c:12]([n:31][c:21]([n:32]2)[N:35]([H:67])[c:19]3[c:10]([c:18]([c:8]([c:7]([c:17]3[C:27]([H:59])([H:60])[H:61])[H:44])[H:45])[N:36]([H:68])[C:22](=[O:37])[c:15]4[c:3]([c:5]([c:16]([c:6]([c:4]4[H:41])[H:43])[C:29]([H:65])([H:66])[N:34]5[C:25]([C:23]([N:33]([C:24]([C:26]5([H:57])[H:58])([H:53])[H:54])[C:28]([H:62])([H:63])[H:64])([H:51])[H:52])([H:55])[H:56])[H:42])[H:40])[H:47])[H:49])[H:46])[H:39]',
'canonical_isomeric_explicit_hydrogen_smiles': '[H]c1c(c(c(nc1[H])[H])c2c(c(nc(n2)N([H])c3c(c(c(c(c3C([H])([H])[H])[H])[H])N([H])C(=O)c4c(c(c(c(c4[H])[H])C([H])([H])N5C(C(N(C(C5([H])[H])([H])[H])C([H])([H])[H])([H])[H])([H])[H])[H])[H])[H])[H])[H])[H]',
'canonical_isomeric_smiles': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C',
'canonical_smiles': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C',
'inchi_key': 'KTUFNOKKBVMGRW-UHFFFAOYSA-N',
'molecular_formula': 'C29H31N7O',
'provenance': 'cmiles_v0.1.5+1.gdbd63e8_openeye_2019.Apr.2',
'standard_inchi': 'InChI=1S/C29H31N7O/c1-21-5-10-25(18-27(21)34-29-31-13-11-26(33-29)24-4-3-12-30-19-24)32-28(37)23-8-6-22(7-9-23)20-36-16-14-35(2)15-17-36/h3-13,18-19H,14-17,20H2,1-2H3,(H,32,37)(H,31,33,34)',
'unique_protomer_representation': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C',
'unique_tautomer_representation': 'Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1N=c1nc(-c2cccnc2)cc[nH]1'},
'object_map': {'HF3c': '9425725', 'default': '9527898'}}
Now we can make a molecule using a few different input options.
# first make a molecule using this record object
mol_record = Molecule.from_qcschema(entry)
# we could have also used the dictionary representation of the object
mol_dict = Molecule.from_qcschema(entry.dict(encoding="json"))
# we check that the molecule has been ordered to match the ordering used in the data base
# by printing out the atomic numbers of both objects in order
# first lets get the initial molecule from the database
initial_mol = client.query_molecules(id=entry.initial_molecule)[0]
for atoms in zip(mol_record.atoms, initial_mol.atomic_numbers):
print(atoms[0].atomic_number, atoms[1])
# we can also check that the molecules are the same regardless of how they are made
assert mol_dict == mol_record
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
7 7
7 7
7 7
7 7
7 7
7 7
7 7
8 8
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
# we can also compare the graph representations of the molecules to make sure they are in the same order
import networkx as nx
# make a graph of the initial molecule using newtorkx and the data in the record
initial_network = nx.Graph()
for i, atom_num in enumerate(initial_mol.atomic_numbers):
initial_network.add_node(i, atomic_number=atom_num)
for bond in initial_mol.connectivity:
initial_network.add_edge(*bond[:2])
# now we can use the new isomorphic check to get the atom mapping
isomorphic, atom_map = Molecule.are_isomorphic(
mol_record,
initial_network,
return_atom_map=True,
aromatic_matching=False,
formal_charge_matching=False,
bond_order_matching=False,
bond_stereochemistry_matching=False,
atom_stereochemistry_matching=False,
)
# we can print if the graph was found to be isomorphic and then the atom mapping
# the atoms are in the same order here as the idexes are the same in the mapping
print(isomorphic)
print(atom_map)
True
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67}
Now that we have seen how to make the molecule, lets look at also getting the geometry as currently we have none.
# check there is no geometry for the molecule
assert mol_record.n_conformers == 0
# if we also want the input geometry for the molecule, we just need to pass the relavent client instance
mol_dict = Molecule.from_qcschema(entry.dict(encoding="json"), client=client)
# check that there is a conformer
mol_dict.n_conformers
1
# Thanks to the qcschema method we also get visualisation for free, along with being able to compute
# properties like energy, gradient and hessian with qcengine using QM, rdkit, openmm, or ANI1
mol_dict.to_qcschema()
Here we will try and compute the energy using RDKit (only run this cell if qcengine is installed.)
# for example this molecules energy can be computed using qcengine and RDKit
import qcengine
# set up the RDKit task
rdkit_task = {
"schema_name": "qcschema_input",
"schema_version": 2,
"molecule": mol_dict.to_qcschema(),
"driver": "energy",
"model": {"method": "uff", "basis": None},
"keywords": {"scf_type": "df"},
}
# now lets compute the energy using qcengine and RDKit and print the result
result = qcengine.compute(rdkit_task, "rdkit")
# note the result is in QC units of hartrees
print(result.return_result)
0.053473233262621864