Using QCArchive with the OpenFF Toolkit

Here we show how to create OpenFF molecules safely from data in the QCArchive using the cmiles entries, specifically we want to use the canonical_isomeric_explicit_hydrogen_mapped_smiles data which is metadata stored at the entry-level of a collection.

First load up the client you wish to connect to, in this case, we use the public instance.

import qcportal as ptl

from openff.toolkit import Molecule

client = ptl.FractalClient()
# list the collections available
client.list_collections()
tagline
collection name
Dataset A Benchmark Data Set for Hydrogen Combustion A Benchmark Data Set for Hydrogen Combustion
ANI-1 22 million off-equilibrium conformations and e...
ANI-1ccx Coupled cluster properties for molecules
ANI-1x Density functional theory properties for molec...
COMP6 ANI-MD Benchmark containing MD trajectories from the ...
... ... ...
TorsionDriveDataset OpenFF-benchmark-ligand-fragments-v2.0 Ligand fragments from the JACS benchmark systems.
Pfizer Discrepancy Torsion Dataset 1 None
SMIRNOFF Coverage Torsion Set 1 None
SiliconTX Torsion Benchmark Set 1 None
TorsionDrive Paper None

207 rows × 1 columns

Now let us grab a molecule from an optimization dataset

ds = client.get_collection(
    "OptimizationDataset", "Kinase Inhibitors: WBO Distributions"
)

Take the first entry from the collection.

entry = ds.get_entry(ds.df.index[0])

We can view the entry in detail by looking at the dictionary representation.

entry.dict()
{'name': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C-0',
 'initial_molecule': '9589274',
 'additional_keywords': {},
 'attributes': {'canonical_explicit_hydrogen_smiles': '[H]c1c(c(c(nc1[H])[H])c2c(c(nc(n2)N([H])c3c(c(c(c(c3C([H])([H])[H])[H])[H])N([H])C(=O)c4c(c(c(c(c4[H])[H])C([H])([H])N5C(C(N(C(C5([H])[H])([H])[H])C([H])([H])[H])([H])[H])([H])[H])[H])[H])[H])[H])[H])[H]',
  'canonical_isomeric_explicit_hydrogen_mapped_smiles': '[H:38][c:1]1[c:2]([c:14]([c:13]([n:30][c:11]1[H:48])[H:50])[c:20]2[c:9]([c:12]([n:31][c:21]([n:32]2)[N:35]([H:67])[c:19]3[c:10]([c:18]([c:8]([c:7]([c:17]3[C:27]([H:59])([H:60])[H:61])[H:44])[H:45])[N:36]([H:68])[C:22](=[O:37])[c:15]4[c:3]([c:5]([c:16]([c:6]([c:4]4[H:41])[H:43])[C:29]([H:65])([H:66])[N:34]5[C:25]([C:23]([N:33]([C:24]([C:26]5([H:57])[H:58])([H:53])[H:54])[C:28]([H:62])([H:63])[H:64])([H:51])[H:52])([H:55])[H:56])[H:42])[H:40])[H:47])[H:49])[H:46])[H:39]',
  'canonical_isomeric_explicit_hydrogen_smiles': '[H]c1c(c(c(nc1[H])[H])c2c(c(nc(n2)N([H])c3c(c(c(c(c3C([H])([H])[H])[H])[H])N([H])C(=O)c4c(c(c(c(c4[H])[H])C([H])([H])N5C(C(N(C(C5([H])[H])([H])[H])C([H])([H])[H])([H])[H])([H])[H])[H])[H])[H])[H])[H])[H]',
  'canonical_isomeric_smiles': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C',
  'canonical_smiles': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C',
  'inchi_key': 'KTUFNOKKBVMGRW-UHFFFAOYSA-N',
  'molecular_formula': 'C29H31N7O',
  'provenance': 'cmiles_v0.1.5+1.gdbd63e8_openeye_2019.Apr.2',
  'standard_inchi': 'InChI=1S/C29H31N7O/c1-21-5-10-25(18-27(21)34-29-31-13-11-26(33-29)24-4-3-12-30-19-24)32-28(37)23-8-6-22(7-9-23)20-36-16-14-35(2)15-17-36/h3-13,18-19H,14-17,20H2,1-2H3,(H,32,37)(H,31,33,34)',
  'unique_protomer_representation': 'Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C',
  'unique_tautomer_representation': 'Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1N=c1nc(-c2cccnc2)cc[nH]1'},
 'object_map': {'HF3c': '9425725', 'default': '9527898'}}

Now we can make a molecule using a few different input options.

# first make a molecule using this record object
mol_record = Molecule.from_qcschema(entry)

# we could have also used the dictionary representation of the object
mol_dict = Molecule.from_qcschema(entry.dict(encoding="json"))
# we check that the molecule has been ordered to match the ordering used in the data base
# by printing out the atomic numbers of both objects in order

# first lets get the initial molecule from the database
initial_mol = client.query_molecules(id=entry.initial_molecule)[0]

for atoms in zip(mol_record.atoms, initial_mol.atomic_numbers):
    print(atoms[0].atomic_number, atoms[1])

# we can also check that the molecules are the same regardless of how they are made
assert mol_dict == mol_record
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
6 6
7 7
7 7
7 7
7 7
7 7
7 7
7 7
8 8
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
# we can also compare the graph representations of the molecules to make sure they are in the same order
import networkx as nx

# make a graph of the initial molecule using newtorkx and the data in the record
initial_network = nx.Graph()
for i, atom_num in enumerate(initial_mol.atomic_numbers):
    initial_network.add_node(i, atomic_number=atom_num)

for bond in initial_mol.connectivity:
    initial_network.add_edge(*bond[:2])
# now we can use the new isomorphic check to get the atom mapping
isomorphic, atom_map = Molecule.are_isomorphic(
    mol_record,
    initial_network,
    return_atom_map=True,
    aromatic_matching=False,
    formal_charge_matching=False,
    bond_order_matching=False,
    bond_stereochemistry_matching=False,
    atom_stereochemistry_matching=False,
)

# we can print if the graph was found to be isomorphic and then the atom mapping
# the atoms are in the same order here as the idexes are the same in the mapping
print(isomorphic)
print(atom_map)
True
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67}

Now that we have seen how to make the molecule, lets look at also getting the geometry as currently we have none.

# check there is no geometry for the molecule
assert mol_record.n_conformers == 0

# if we also want the input geometry for the molecule, we just need to pass the relavent client instance
mol_dict = Molecule.from_qcschema(entry.dict(encoding="json"), client=client)

# check that there is a conformer
mol_dict.n_conformers
1
# Thanks to the qcschema method we also get visualisation for free, along with being able to compute
# properties like energy, gradient and hessian with qcengine using QM, rdkit, openmm, or ANI1
mol_dict.to_qcschema()

Here we will try and compute the energy using RDKit (only run this cell if qcengine is installed.)

# for example this molecules energy can be computed using qcengine and RDKit
import qcengine

# set up the RDKit task
rdkit_task = {
    "schema_name": "qcschema_input",
    "schema_version": 2,
    "molecule": mol_dict.to_qcschema(),
    "driver": "energy",
    "model": {"method": "uff", "basis": None},
    "keywords": {"scf_type": "df"},
}

# now lets compute the energy using qcengine and RDKit and print the result
result = qcengine.compute(rdkit_task, "rdkit")
# note the result is in QC units of hartrees
print(result.return_result)
0.053473233262621864