Source code for openff.recharge.charges.resp._resp

import functools
import warnings
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import numpy
from openff.toolkit.utils.exceptions import AtomMappingWarning
from openff.units import unit

from openff.recharge.charges.library import (
    LibraryChargeCollection,
    LibraryChargeParameter,
)
from openff.recharge.charges.resp.solvers import IterativeSolver, RESPNonLinearSolver
from openff.recharge.esp.storage import MoleculeESPRecord
from openff.recharge.optimize import ESPObjective, ESPObjectiveTerm
from openff.recharge.utilities.toolkits import (
    get_atom_symmetries,
    molecule_to_tagged_smiles,
)

if TYPE_CHECKING:
    from openff.toolkit import Molecule


def _generate_dummy_values(smiles: str) -> List[float]:
    """A convenience method for generating a list of dummy values for a
    ``LibraryChargeParameter`` that sums to the correct total charge.
    """

    from openff.toolkit import Molecule

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", AtomMappingWarning)
        molecule: Molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

    total_charge = molecule.total_charge.m_as(unit.elementary_charge)
    per_atom_charge = total_charge / molecule.n_atoms

    n_values = len(set(molecule.properties["atom_map"].values()))
    return [per_atom_charge] * n_values


def molecule_to_resp_library_charge(
    molecule: "Molecule",
    equivalize_within_methyl_carbons: bool,
    equivalize_within_methyl_hydrogens: bool,
    equivalize_within_other_heavy_atoms: bool,
    equivalize_within_other_hydrogen_atoms: bool,
) -> LibraryChargeParameter:
    """Creates a library charge parameter from a molecule where each atom as been
    assigned a map index that represents which equivalence group the atom is in.

    Notes
    -----
    * The ``value`` of the returned parameter will contain a set of dummy values that
      sum to the correct total charge.
    * The ``provenance`` dictionary contains the indices into the ``value`` array of
      any charges applied to methyl(ene) carbons / hydrogens and other heavy atoms
      / hydrogens
    * Topologically symmetry is detected using the ``get_atom_symmetries`` utility.

    Parameters
    ----------
    molecule
        The molecule to create the SMILES pattern from.
    equivalize_within_methyl_carbons
        Whether all topologically symmetric methyl(ene) carbons (i.e. those matched by
        '[#6X4H3,#6H4,#6X4H2:1]') in **the same** conformer should be assigned an
        equivalent charge.
    equivalize_within_methyl_hydrogens
        Whether all topologically symmetric methyl(ene) hydrogens (i.e. those attached to
        a methyl(ene) carbon) in **the same** conformer should be assigned an
        equivalent charge.
    equivalize_within_other_heavy_atoms
        Whether all topologically symmetric heavy atoms that are not methyl(ene) carbons
        in **the same** conformer should be assigned an equivalent charge.
    equivalize_within_other_hydrogen_atoms
        Whether all topologically symmetric hydrogens that are not methyl(ene) hydrogens
        in **the same** conformer should be assigned an equivalent charge.

    Returns
    -------
        The library charge with atoms assigned their correct symmetry groups.
    """

    atom_symmetries = get_atom_symmetries(molecule)
    max_symmetry_group = max(atom_symmetries) + 1

    methyl_carbons = [
        index
        for index, in molecule.chemical_environment_matches("[#6X4H3,#6H4,#6X4H2:1]")
    ]
    methyl_hydrogens = [
        atom.molecule_atom_index
        for index in methyl_carbons
        for atom in molecule.atoms[index].bonded_atoms
        if atom.atomic_number == 1
    ]
    other_heavy_atoms = [
        i
        for i, atom in enumerate(molecule.atoms)
        if atom.atomic_number != 1 and i not in methyl_carbons
    ]
    other_hydrogen_atoms = [
        i
        for i, atom in enumerate(molecule.atoms)
        if atom.atomic_number == 1 and i not in methyl_hydrogens
    ]

    atoms_not_to_equivalize = (
        ([] if equivalize_within_methyl_carbons else methyl_carbons)
        + ([] if equivalize_within_methyl_hydrogens else methyl_hydrogens)
        + ([] if equivalize_within_other_heavy_atoms else other_heavy_atoms)
        + ([] if equivalize_within_other_hydrogen_atoms else other_hydrogen_atoms)
    )

    for index in atoms_not_to_equivalize:
        atom_symmetries[index] = max_symmetry_group
        max_symmetry_group += 1

    symmetry_groups = sorted(set(atom_symmetries))

    atom_indices = [symmetry_groups.index(group) + 1 for group in atom_symmetries]
    tagged_smiles = molecule_to_tagged_smiles(molecule, atom_indices)

    return LibraryChargeParameter(
        smiles=tagged_smiles,
        value=_generate_dummy_values(tagged_smiles),
        provenance={
            "methyl-carbon-indices": sorted(
                {atom_indices[i] - 1 for i in methyl_carbons}
            ),
            "methyl-hydrogen-indices": sorted(
                {atom_indices[i] - 1 for i in methyl_hydrogens}
            ),
            "other-heavy-indices": sorted(
                {atom_indices[i] - 1 for i in other_heavy_atoms}
            ),
            "other-hydrogen-indices": sorted(
                {atom_indices[i] - 1 for i in other_hydrogen_atoms}
            ),
        },
    )


def _deduplicate_constraints(
    constraint_matrix: numpy.ndarray, constraint_values: numpy.ndarray
) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """Removes duplicate rows from a constraint matrix and corresponding values are.

    Parameters
    ----------
    constraint_matrix
        The constraint matrix to de-duplicate with shape=(n_constraints, n_values).
    constraint_values
        The expected values of the constraints with shape=(n_constraints, 1)

    Returns
    -------
        A de-duplicated representation of the constraint matrix and values.
    """

    found_constraints = set()

    deduplicated_rows = []
    deduplicated_values = []

    for constraint_row, constraint_value in zip(constraint_matrix, constraint_values):
        constraint_row = tuple(int(i) for i in constraint_row)

        if constraint_row in found_constraints:
            continue

        deduplicated_rows.append(constraint_row)
        deduplicated_values.append([float(constraint_value)])

        found_constraints.add(constraint_row)

    return numpy.array(deduplicated_rows), numpy.array(deduplicated_values)


def generate_resp_systems_of_equations(
    charge_parameter: LibraryChargeParameter,
    qc_data_records: List[MoleculeESPRecord],
    equivalize_between_methyl_carbons: bool,
    equivalize_between_methyl_hydrogens: bool,
    equivalize_between_other_heavy_atoms: bool,
    equivalize_between_other_hydrogen_atoms: bool,
    fix_methyl_carbons: bool,
    fix_methyl_hydrogens: bool,
    fix_other_heavy_atoms: bool,
    fix_other_hydrogen_atoms: bool,
) -> Tuple[
    numpy.ndarray,
    numpy.ndarray,
    numpy.ndarray,
    numpy.ndarray,
    List[int],
    Dict[int, int],
]:
    """Generates the matrices that encode the systems of equations that form the RESP
    loss function.

    Parameters
    ----------
    charge_parameter
        The parameter that will be fit to ESP data.
    qc_data_records
        The records containing the reference QC ESP data.
    equivalize_between_methyl_carbons
        Whether all topologically symmetric methyl(ene) carbons (i.e. those matched by
        '[#6X4H3,#6H4,#6X4H2:1]') in **different** conformers should be assigned an
        equivalent charge.
    equivalize_between_methyl_hydrogens
        Whether all topologically symmetric methyl(ene) hydrogens (i.e. those attached to
        a methyl(ene) carbon) in **different** conformers should be assigned an
        equivalent charge.
    equivalize_between_other_heavy_atoms
        Whether all topologically symmetric heavy atoms that are not methyl(ene) carbons
        in **different** conformers should be assigned an equivalent charge.
    equivalize_between_other_hydrogen_atoms
        Whether all topologically symmetric hydrogens that are not methyl(ene) hydrogens
        should be assigned an equivalent charge.
    fix_methyl_carbons
        Whether to fix the charges on methyl(ene) carbons.
    fix_methyl_hydrogens
        Whether to fix the charges on methyl(ene) hydrogens.
    fix_other_heavy_atoms
        Whether to fix the charges on heavy atoms that are not methyl(ene) carbons.
    fix_other_hydrogen_atoms
        Whether to fix the charges on heavy atoms that are not methyl(ene) hydrogens.

    Returns
    -------
        A tuple of:

        * a design matrix with shape=(n_grid_points, n_not_fixed_charges)
        * a vector of reference ESP values with shape=(n_grid_points, 1)
        * a constraint matrix with shape=(n_constraints, n_not_fixed_charges)
        * a vector of values the constraints should equal with
          shape=(n_constraints, 1)
        * the indices of the `n_trainable_charges` that should be restrained
        * a dictionary that maps the indices of the not-fixed charges back to their
          original indices in the ``charge_parameter.value`` list.
    """

    methyl_carbons = charge_parameter.provenance["methyl-carbon-indices"]
    methyl_hydrogens = charge_parameter.provenance["methyl-hydrogen-indices"]
    other_heavy_atoms = charge_parameter.provenance["other-heavy-indices"]
    other_hydrogen_atoms = charge_parameter.provenance["other-hydrogen-indices"]

    charges_to_train = sorted(
        ([] if fix_methyl_carbons else methyl_carbons)
        + ([] if fix_methyl_hydrogens else methyl_hydrogens)
        + ([] if fix_other_heavy_atoms else other_heavy_atoms)
        + ([] if fix_other_hydrogen_atoms else other_hydrogen_atoms)
    )

    objective_terms = list(
        ESPObjective.compute_objective_terms(
            qc_data_records,
            charge_collection=LibraryChargeCollection(parameters=[charge_parameter]),
            charge_parameter_keys=[(charge_parameter.smiles, tuple(charges_to_train))],
        )
    )

    (
        constraint_matrix,
        constraint_vector,
    ) = charge_parameter.generate_constraint_matrix(charges_to_train)

    # We need to handle the special case of not equivalizing certain charges between
    # conformers. This is done by augmenting the design matrix to accommodate a set of
    # 'dummy' charges per conformer which are not equivalized between conformers.
    charges_not_to_equivalize = (
        ([] if equivalize_between_methyl_carbons else methyl_carbons)
        + ([] if equivalize_between_methyl_hydrogens else methyl_hydrogens)
        + ([] if equivalize_between_other_heavy_atoms else other_heavy_atoms)
        + ([] if equivalize_between_other_hydrogen_atoms else other_hydrogen_atoms)
    )
    charges_not_to_equivalize = [
        charges_to_train.index(i)
        for i in charges_not_to_equivalize
        if i in charges_to_train
    ]

    n_dummy_charges = (len(objective_terms) - 1) * len(charges_not_to_equivalize)

    conformer_constraint_matrices = []
    conformer_constraint_vectors = [constraint_vector] * len(objective_terms)

    for conformer_index, objective_term in enumerate(objective_terms):
        design_matrix = objective_term.atom_charge_design_matrix.copy()

        padded_design_matrix = numpy.pad(
            objective_term.atom_charge_design_matrix,
            pad_width=[(0, 0), (0, n_dummy_charges)],
            mode="constant",
        )
        padded_constraint = numpy.pad(
            constraint_matrix,
            pad_width=[(0, 0), (0, n_dummy_charges)],
            mode="constant",
        )

        if conformer_index > 0:
            old_order = list(range(padded_constraint.shape[1]))
            new_order = [*old_order]

            for i, charge_index in enumerate(charges_not_to_equivalize):
                new_order[
                    design_matrix.shape[1]
                    + (conformer_index - 1) * len(charges_not_to_equivalize)
                    + i
                ] = charge_index

            padded_design_matrix[:, old_order] = padded_design_matrix[:, new_order]
            padded_design_matrix[:, charges_not_to_equivalize] = 0.0
            padded_constraint[:, old_order] = padded_constraint[:, new_order]
            padded_constraint[:, charges_not_to_equivalize] = 0.0

        conformer_constraint_matrices.append(padded_constraint)
        objective_term.atom_charge_design_matrix = padded_design_matrix

    combined_term = ESPObjectiveTerm.combine(*objective_terms)

    restraint_indices = [
        charges_to_train.index(i)
        for i in (methyl_carbons + other_heavy_atoms)
        if i in charges_to_train
    ]
    charge_array_to_value = {i: index for i, index in enumerate(charges_to_train)}

    combined_constraint_matrix, combined_constraint_values = _deduplicate_constraints(
        numpy.vstack(conformer_constraint_matrices),
        numpy.vstack(conformer_constraint_vectors),
    )

    return (
        combined_term.atom_charge_design_matrix,
        combined_term.reference_values,
        combined_constraint_matrix,
        combined_constraint_values,
        restraint_indices,
        charge_array_to_value,
    )


@functools.lru_cache(10000)
def _canonicalize_smiles(smiles: str) -> str:
    """Attempts to canonicalize a SMILES pattern, stripping any map indices in the
    process
    """

    from openff.toolkit import Molecule

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", AtomMappingWarning)
        return Molecule.from_smiles(smiles, allow_undefined_stereo=True).to_smiles(
            mapped=False
        )


[docs]def generate_resp_charge_parameter( qc_data_records: List[MoleculeESPRecord], solver: Optional[RESPNonLinearSolver] ) -> LibraryChargeParameter: """Generates a set of RESP charges for a molecule in multiple conformers. Notes ----- * Methyl(ene) carbons are detected as any carbon matched by '[#6X4H3,#6H4,#6X4H2:1]', methyl(ene) hydrogens are any hydrogen attached to a methyl(ene) carbon, otherwise the atom is treated as any other heavy atom / hydrogen. * All heavy atom charge and non-methyl(ene) hydrogen charge will be equivalized in stage 1 of the fit both within and between multiple conformers, while methyl(ene) hydrogen charge will not be either within or between multiple conformers. * All methyl(ene) charges (both carbon and hydrogen) will be equivalized both within and between multiple conformers in stage 2 of the fit. * All atom charge will be free to vary during stage 1 of the fit, while only methyl(ene) charges (both carbon and hydrogen) will be free to vary during stage 2. * A value of b=0.1 is used for the hyperbolic restraints applied to all heavy atom charges in both stages, while a value of a=0.0005 and 0.001 will be used for stages 1 and 2 respectively. Parameters ---------- qc_data_records The computed ESP for a molecule in different conformers. solver The solver to use when finding the charges that minimize the RESP loss function. By default, the iterative solver described in the RESP publications is used. Returns ------- The RESP charges generated for the molecule. """ from openff.toolkit import Molecule solver = IterativeSolver() if solver is None else solver unique_smiles = { _canonicalize_smiles(record.tagged_smiles) for record in qc_data_records } assert ( len(unique_smiles) == 1 ), "all QC records must be generated for the same molecule" molecule = Molecule.from_smiles( next(iter(unique_smiles)), allow_undefined_stereo=True ) b = 0.1 ################################################################################### # STAGE 1 # # # # * METHYL(ENE) C : constrain equiv within=[✓] between=[✓] conformers fixed=[x] # # * METHYL(ENE) H : constrain equiv within=[x] between=[x] conformers fixed=[x] # # * HEAVY ATOMS : constrain equiv within=[✓] between=[✓] conformers fixed=[x] # # * HYDROGEN : constrain equiv within=[✓] between=[✓] conformers fixed=[x] # # # ################################################################################### a = 0.0005 resp_parameter_1 = molecule_to_resp_library_charge( molecule, equivalize_within_methyl_hydrogens=False, equivalize_within_methyl_carbons=True, equivalize_within_other_hydrogen_atoms=True, equivalize_within_other_heavy_atoms=True, ) ( design_matrix_1, reference_esp_1, constraint_matrix_1, constraint_vector_1, restraint_indices_1, _, ) = generate_resp_systems_of_equations( resp_parameter_1, qc_data_records, equivalize_between_methyl_hydrogens=False, equivalize_between_methyl_carbons=True, equivalize_between_other_hydrogen_atoms=True, equivalize_between_other_heavy_atoms=True, fix_methyl_hydrogens=False, fix_methyl_carbons=False, fix_other_hydrogen_atoms=False, fix_other_heavy_atoms=False, ) resp_charges_1 = solver.solve( design_matrix_1, reference_esp_1, constraint_matrix_1, constraint_vector_1, a, b, restraint_indices_1, len(qc_data_records), ) resp_parameter_1.value = ( resp_charges_1[: len(resp_parameter_1.value)].flatten().tolist() ) ################################################################################### # STAGE 2 # # # # * METHYL(ENE) C : constrain equiv within=[✓] between=[✓] conformers fixed=[x] # # * METHYL(ENE) H : constrain equiv within=[✓] between=[✓] conformers fixed=[x] # # * HEAVY ATOMS : constrain equiv within=[-] between=[-] conformers fixed=[✓] # # * HYDROGEN : constrain equiv within=[-] between=[-] conformers fixed=[✓] # # # ################################################################################### a = 0.001 resp_parameter_2 = molecule_to_resp_library_charge( molecule, equivalize_within_methyl_hydrogens=True, equivalize_within_methyl_carbons=True, equivalize_within_other_hydrogen_atoms=True, equivalize_within_other_heavy_atoms=True, ) resp_parameter_2.copy_value_from_other(resp_parameter_1) ( design_matrix_2, reference_esp_2, constraint_matrix_2, constraint_vector_2, restraint_indices_2, charge_map, ) = generate_resp_systems_of_equations( resp_parameter_2, qc_data_records, equivalize_between_methyl_hydrogens=True, equivalize_between_methyl_carbons=True, equivalize_between_other_hydrogen_atoms=True, equivalize_between_other_heavy_atoms=True, fix_methyl_hydrogens=False, fix_methyl_carbons=False, fix_other_hydrogen_atoms=True, fix_other_heavy_atoms=True, ) resp_charges_2 = solver.solve( design_matrix_2, reference_esp_2, constraint_matrix_2, constraint_vector_2, a, b, restraint_indices_2, len(qc_data_records), ) for array_index, value_index in charge_map.items(): resp_parameter_2.value[value_index] = float(resp_charges_2[array_index]) return resp_parameter_2