Source code for openforcefield.utils.utils

#!/usr/bin/env python
"""
Utility subroutines

"""

__all__ = [
    "MessageException",
    "IncompatibleUnitError",
    "inherit_docstrings",
    "all_subclasses",
    "temporary_cd",
    "get_data_file_path",
    "unit_to_string",
    "quantity_to_string",
    "string_to_unit",
    "string_to_quantity",
    "object_to_quantity",
    "check_units_are_compatible",
    "extract_serialized_units_from_dict",
    "attach_units",
    "detach_units",
    "serialize_numpy",
    "deserialize_numpy",
    "convert_all_quantities_to_string",
    "convert_all_strings_to_quantity",
    "convert_0_1_smirnoff_to_0_2",
    "convert_0_2_smirnoff_to_0_3",
    "get_molecule_parameterIDs",
]

# =============================================================================================
# GLOBAL IMPORTS
# =============================================================================================

import contextlib
import functools
import logging

from simtk import unit

# =============================================================================================
# CONFIGURE LOGGER
# =============================================================================================

logger = logging.getLogger(__name__)


# =============================================================================================
# COMMON EXCEPTION TYPES
# =============================================================================================


class MessageException(Exception):
    """A base class for exceptions that print out a string given in their constructor"""

    def __init__(self, msg):
        super().__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg


class IncompatibleUnitError(MessageException):
    """
    Exception for when a parameter is in the wrong units for a ParameterHandler's unit system
    """

    pass


# =============================================================================================
# UTILITY SUBROUTINES
# =============================================================================================


[docs]def inherit_docstrings(cls):
    """Inherit docstrings from parent class"""
    from inspect import getmembers, isfunction

    for name, func in getmembers(cls, isfunction):
        if func.__doc__:
            continue
        for parent in cls.__mro__[1:]:
            if hasattr(parent, name):
                func.__doc__ = getattr(parent, name).__doc__
    return cls


[docs]def all_subclasses(cls):
    """Recursively retrieve all subclasses of the specified class"""
    return cls.__subclasses__() + [
        g for s in cls.__subclasses__() for g in all_subclasses(s)
    ]


[docs]@contextlib.contextmanager
def temporary_cd(dir_path):
    """Context to temporary change the working directory.
    Parameters
    ----------
    dir_path : str
        The directory path to enter within the context
    Examples
    --------
    >>> dir_path = '/tmp'
    >>> with temporary_cd(dir_path):
    ...     pass  # do something in dir_path
    """
    import os

    prev_dir = os.getcwd()
    os.chdir(os.path.abspath(dir_path))
    try:
        yield
    finally:
        os.chdir(prev_dir)


[docs]def get_data_file_path(relative_path):
    """Get the full path to one of the reference files in testsystems.
    In the source distribution, these files are in ``openforcefield/data/``,
    but on installation, they're moved to somewhere in the user's python
    site-packages directory.
    Parameters
    ----------
    name : str
        Name of the file to load (with respect to the repex folder).
    """

    import os

    from pkg_resources import resource_filename

    fn = resource_filename("openforcefield", os.path.join("data", relative_path))

    if not os.path.exists(fn):
        raise ValueError(
            f"Sorry! {fn} does not exist. If you just added it, you'll have to re-install"
        )

    return fn


def unit_to_string(input_unit):
    """
    Serialize a simtk.unit.Unit and return it as a string.

    Parameters
    ----------
    input_unit : A simtk.unit
        The unit to serialize

    Returns
    -------
    unit_string : str
        The serialized unit.
    """

    if input_unit == unit.dimensionless:
        return "dimensionless"

    # Decompose output_unit into a tuples of (base_dimension_unit, exponent)
    unit_string = None

    for unit_component in input_unit.iter_base_or_scaled_units():
        unit_component_name = unit_component[0].name
        # Convert, for example "elementary charge" --> "elementary_charge"
        unit_component_name = unit_component_name.replace(" ", "_")
        if unit_component[1] == 1:
            contribution = "{}".format(unit_component_name)
        else:
            contribution = "{}**{}".format(unit_component_name, int(unit_component[1]))
        if unit_string is None:
            unit_string = contribution
        else:
            unit_string += " * {}".format(contribution)

    return unit_string


def quantity_to_string(input_quantity):
    """
    Serialize a simtk.unit.Quantity to a string.

    Parameters
    ----------
    input_quantity : simtk.unit.Quantity
        The quantity to serialize

    Returns
    -------
    output_string : str
        The serialized quantity

    """
    import numpy as np

    if input_quantity is None:
        return None
    unitless_value = input_quantity.value_in_unit(input_quantity.unit)
    # The string representation of a numpy array doesn't have commas and breaks the
    # parser, thus we convert any arrays to list here
    if isinstance(unitless_value, np.ndarray):
        unitless_value = list(unitless_value)
    unit_string = unit_to_string(input_quantity.unit)
    output_string = "{} * {}".format(unitless_value, unit_string)
    return output_string


def _ast_eval(node):
    """
    Performs an algebraic syntax tree evaluation of a unit.

    Parameters
    ----------
    node : An ast parsing tree node
    """
    import ast
    import operator as op

    operators = {
        ast.Add: op.add,
        ast.Sub: op.sub,
        ast.Mult: op.mul,
        ast.Div: op.truediv,
        ast.Pow: op.pow,
        ast.BitXor: op.xor,
        ast.USub: op.neg,
    }

    if isinstance(node, ast.Num):  # <number>
        return node.n
    elif isinstance(node, ast.BinOp):  # <left> <operator> <right>
        return operators[type(node.op)](_ast_eval(node.left), _ast_eval(node.right))
    elif isinstance(node, ast.UnaryOp):  # <operator> <operand> e.g., -1
        return operators[type(node.op)](_ast_eval(node.operand))
    elif isinstance(node, ast.Name):
        # see if this is a simtk unit
        b = getattr(unit, node.id)
        return b
    # TODO: This was a quick hack that surprisingly worked. We should validate this further.
    elif isinstance(node, ast.List):
        return ast.literal_eval(node)
    else:
        raise TypeError(node)


def string_to_unit(unit_string):
    """
    Deserializes a simtk.unit.Quantity from a string representation, for
    example: "kilocalories_per_mole / angstrom ** 2"


    Parameters
    ----------
    unit_string : dict
        Serialized representation of a simtk.unit.Quantity.

    Returns
    -------
    output_unit: simtk.unit.Quantity
        The deserialized unit from the string
    """
    import ast

    output_unit = _ast_eval(ast.parse(unit_string, mode="eval").body)
    return output_unit

    # if (serialized['unitless_value'] is None) and (serialized['unit'] is None):
    #    return None
    # quantity_unit = None
    # for unit_name, power in serialized['unit']:
    #    unit_name = unit_name.replace(
    #        ' ', '_')  # Convert eg. 'elementary charge' to 'elementary_charge'
    #    if quantity_unit is None:
    #        quantity_unit = (getattr(unit, unit_name)**power)
    #    else:
    #        quantity_unit *= (getattr(unit, unit_name)**power)
    # quantity = unit.Quantity(serialized['unitless_value'], quantity_unit)
    # return quantity


def string_to_quantity(quantity_string):
    """
    Takes a string representation of a quantity and returns a simtk.unit.Quantity

    Parameters
    ----------
    quantity_string : str
        The quantity to deserialize

    Returns
    -------
    output_quantity : simtk.unit.Quantity
        The deserialized quantity
    """
    if quantity_string is None:
        return None
    # This can be the exact same as string_to_unit
    import ast

    output_quantity = _ast_eval(ast.parse(quantity_string, mode="eval").body)
    return output_quantity


def convert_all_strings_to_quantity(smirnoff_data):
    """
    Traverses a SMIRNOFF data structure, attempting to convert all
    quantity-defining strings into simtk.unit.Quantity objects.

    Integers and floats are ignored and not converted into a dimensionless
    ``simtk.unit.Quantity`` object.

    Parameters
    ----------
    smirnoff_data : dict
        A hierarchical dict structured in compliance with the SMIRNOFF spec

    Returns
    -------
    converted_smirnoff_data : dict
        A hierarchical dict structured in compliance with the SMIRNOFF spec,
        with quantity-defining strings converted to simtk.unit.Quantity objects
    """
    if isinstance(smirnoff_data, dict):
        for key, value in smirnoff_data.items():
            smirnoff_data[key] = convert_all_strings_to_quantity(value)
        obj_to_return = smirnoff_data

    elif isinstance(smirnoff_data, list):
        for index, item in enumerate(smirnoff_data):
            smirnoff_data[index] = convert_all_strings_to_quantity(item)
        obj_to_return = smirnoff_data

    elif isinstance(smirnoff_data, int) or isinstance(smirnoff_data, float):
        obj_to_return = smirnoff_data

    else:
        try:
            obj_to_return = object_to_quantity(smirnoff_data)
        except (AttributeError, TypeError, SyntaxError):
            obj_to_return = smirnoff_data

    return obj_to_return


def convert_all_quantities_to_string(smirnoff_data):
    """
    Traverses a SMIRNOFF data structure, attempting to convert all
    quantities into strings.

    Parameters
    ----------
    smirnoff_data : dict
        A hierarchical dict structured in compliance with the SMIRNOFF spec

    Returns
    -------
    converted_smirnoff_data : dict
        A hierarchical dict structured in compliance with the SMIRNOFF spec,
        with simtk.unit.Quantitys converted to string
    """

    if isinstance(smirnoff_data, dict):
        for key, value in smirnoff_data.items():
            smirnoff_data[key] = convert_all_quantities_to_string(value)
        obj_to_return = smirnoff_data
    elif isinstance(smirnoff_data, list):
        for index, item in enumerate(smirnoff_data):
            smirnoff_data[index] = convert_all_quantities_to_string(item)
        obj_to_return = smirnoff_data
    elif isinstance(smirnoff_data, unit.Quantity):
        obj_to_return = quantity_to_string(smirnoff_data)
    else:
        obj_to_return = smirnoff_data

    return obj_to_return


@functools.singledispatch
def object_to_quantity(object):
    """
    Attempts to turn the provided object into simtk.unit.Quantity(s).

    Can handle float, int, strings, quantities, or iterators over
    the same. Raises an exception if unable to convert all inputs.

    Parameters
    ----------
    object : int, float, string, quantity, or iterator of strings of quantities
        The object to convert to a ``simtk.unit.Quantity`` object.

    Returns
    -------
    converted_object : simtk.unit.Quantity or List[simtk.unit.Quantity]

    """
    # If we can't find a custom type, we treat this as a generic iterator.
    return [object_to_quantity(sub_obj) for sub_obj in object]


@object_to_quantity.register(unit.Quantity)
def _(obj):
    return obj


@object_to_quantity.register(str)
def _(obj):
    return string_to_quantity(obj)


@object_to_quantity.register(int)
@object_to_quantity.register(float)
def _(obj):
    return unit.Quantity(obj)


def check_units_are_compatible(object_name, object, unit_to_check, context=None):
    """
    Checks whether a simtk.unit.Quantity or list of simtk.unit.Quantitys is compatible with given unit.

    Parameters
    ----------
    object_name : string
        Name of object, used in printing exception.
    object : A simtk.unit.Quantity or list of simtk.unit.Quantitys
    unit_to_check : A simtk.unit.Unit
    context : string, optional. Default=None
        Additional information to provide at the beginning of the exception message if raised

    Raises
    ------
    IncompatibleUnitError
    """
    from simtk import unit

    # If context is not provided, explicitly make it a blank string
    if context is None:
        context = ""
    # Otherwise add a space after the end of it to correct message printing
    else:
        context += " "

    if isinstance(object, list):
        for sub_object in object:
            check_units_are_compatible(
                object_name, sub_object, unit_to_check, context=context
            )
    elif isinstance(object, unit.Quantity):
        if not object.unit.is_compatible(unit_to_check):
            msg = (
                f"{context}{object_name} with "
                f"value {object} is incompatible with expected unit {unit_to_check}"
            )
            raise IncompatibleUnitError(msg)
    else:
        msg = (
            f"{context}{object_name} with "
            f"value {object} is incompatible with expected unit {unit_to_check}"
        )
        raise IncompatibleUnitError(msg)


def extract_serialized_units_from_dict(input_dict):
    """
    Create a mapping of (potentially unit-bearing) quantities from a dictionary, where some keys exist in pairs like
    {'length': 8, 'length_unit':'angstrom'}.

    Parameters
    ----------
    input_dict : dict
       Dictionary where some keys are paired like {'X': 1.0, 'X_unit': angstrom}.

    Returns
    -------
    unitless_dict : dict
       input_dict, but with keys ending in ``_unit`` removed.
    attached_units : dict str : simtk.unit.Unit
       ``attached_units[parameter_name]`` is the simtk.unit.Unit combination that should be attached to corresponding
       parameter ``parameter_name``. For example ``attached_units['X'] = simtk.unit.angstrom.

    """

    # TODO: Should this scheme also convert "1" to int(1) and "8.0" to float(8.0)?
    from collections import OrderedDict

    attached_units = OrderedDict()
    unitless_dict = input_dict.copy()
    keys_to_delete = []
    for key in input_dict.keys():
        if key.endswith("_unit"):
            parameter_name = key[:-5]
            parameter_units_string = input_dict[key]
            try:
                parameter_units = string_to_unit(parameter_units_string)
            except Exception as e:
                e.msg = (
                    "Could not parse units {}\n".format(parameter_units_string) + e.msg
                )
                raise e
            attached_units[parameter_name] = parameter_units
            # Remember this key and delete it later (we break the dict if we delete a key in the loop)
            keys_to_delete.append(key)
    # Clean out the '*_unit' keys that we processed
    for key in keys_to_delete:
        del unitless_dict[key]

    return unitless_dict, attached_units


def attach_units(unitless_dict, attached_units):
    """
    Attach units to dict entries for which units are specified.

    Parameters
    ----------
    unitless_dict : dict
       Dictionary, where some items are to have units applied.
    attached_units : dict [str : simtk.unit.Unit]
       ``attached_units[parameter_name]`` is the simtk.unit.Unit combination that should be attached to corresponding
       parameter ``parameter_name``

    Returns
    -------
    unit_bearing_dict : dict
       Updated dict with simtk.unit.Unit units attached to values for which units were specified for their keys

    """
    temp_dict = unitless_dict.copy()
    for parameter_name, units_to_attach in attached_units.items():
        if parameter_name in temp_dict.keys():
            parameter_attrib_string = temp_dict[parameter_name]
            try:
                temp_dict[parameter_name] = (
                    float(parameter_attrib_string) * units_to_attach
                )
            except ValueError as e:
                e.msg = (
                    "Expected numeric value for parameter '{}',"
                    "instead found '{}' when trying to attach units '{}'\n"
                ).format(parameter_name, parameter_attrib_string, units_to_attach)
                raise e

        # Now check for matches like "phase1", "phase2"
        c = 1
        while (parameter_name + str(c)) in temp_dict.keys():
            indexed_parameter_name = parameter_name + str(c)
            parameter_attrib_string = temp_dict[indexed_parameter_name]
            try:
                temp_dict[indexed_parameter_name] = (
                    float(parameter_attrib_string) * units_to_attach
                )
            except ValueError as e:
                e.msg = "Expected numeric value for parameter '{}', instead found '{}' when trying to attach units '{}'\n".format(
                    indexed_parameter_name, parameter_attrib_string, units_to_attach
                )
                raise e
            c += 1
    return temp_dict


def detach_units(unit_bearing_dict, output_units=None):
    """
    Given a dict which may contain some simtk.unit.Quantity objects, return the same dict with the Quantities
    replaced with unitless values, and a new dict containing entries with the suffix "_unit" added, containing
    the units.

    Parameters
    ----------
    unit_bearing_dict : dict
        A dictionary potentially containing simtk.unit.Quantity objects as values.
    output_units : dict[str : simtk.unit.Unit], optional. Default = None
        A mapping from parameter fields to the output unit its value should be converted to.
        For example, {'length_unit': unit.angstrom}. If no output_unit is defined for a key:value pair in which
        the value is a simtk.unit.Quantity, the output unit will be the Quantity's unit, and this information
        will be included in the unit_dict return value.

    Returns
    -------
    unitless_dict : dict
        The input smirnoff_dict object, with all simtk.unit.Quantity values converted to unitless values.
    unit_dict : dict
        A dictionary in which keys are keys of simtk.unit.Quantity values in unit_bearing_dict,
        but suffixed with "_unit". Values are simtk.unit.Unit .
    """
    from simtk import unit

    if output_units is None:
        output_units = {}

    # initialize dictionaries for outputs
    unit_dict = {}
    unitless_dict = unit_bearing_dict.copy()

    for key, value in unit_bearing_dict.items():
        # If no conversion is needed, skip this item
        if not isinstance(value, unit.Quantity):
            continue

        # If conversion is needed, see if the user has requested an output unit
        unit_key = key + "_unit"

        if unit_key in output_units:
            output_unit = output_units[unit_key]
        else:
            output_unit = value.unit
        if not (output_unit.is_compatible(value.unit)):
            raise ValueError(
                "Requested output unit {} is not compatible with "
                "quantity unit {} .".format(output_unit, value.unit)
            )
        unitless_dict[key] = value.value_in_unit(output_unit)
        unit_dict[unit_key] = output_unit

    return unitless_dict, unit_dict


def serialize_numpy(np_array):
    """
    Serializes a numpy array into a JSON-compatible string. Leverages the numpy.save function,
    thereby preserving the shape of the input array

    from https://stackoverflow.com/questions/30698004/how-can-i-serialize-a-numpy-array-while-preserving-matrix-dimensions#30699208

    Parameters
    ----------
    np_array : A numpy array
        Input numpy array

    Returns
    -------
    serialized : str
        A serialized representation of the numpy array.
    shape : tuple of ints
        The shape of the serialized array
    """

    bigendian_array = np_array.newbyteorder(">")
    serialized = bigendian_array.tobytes()
    shape = np_array.shape
    return serialized, shape


def deserialize_numpy(serialized_np, shape):
    """
    Deserializes a numpy array from a JSON-compatible string.

    from https://stackoverflow.com/questions/30698004/how-can-i-serialize-a-numpy-array-while-preserving-matrix-dimensions#30699208

    Parameters
    ----------
    serialized_np : str
        A serialized numpy array
    shape : tuple of ints
        The shape of the serialized array
    Returns
    -------
    np_array : numpy.ndarray
        The deserialized numpy array
    """

    import numpy as np

    dt = np.dtype("float")
    dt.newbyteorder(">")  # set to big-endian
    np_array = np.frombuffer(serialized_np, dtype=dt)
    np_array = np_array.reshape(shape)
    return np_array


[docs]def convert_0_2_smirnoff_to_0_3(smirnoff_data_0_2):
    """
    Convert an 0.2-compliant SMIRNOFF dict to an 0.3-compliant one.
    This involves removing units from header tags and adding them
    to attributes of child elements.
    It also requires converting ProperTorsions and ImproperTorsions
    potentials from "charmm" to "fourier".

    Parameters
    ----------
    smirnoff_data_0_2 : dict
        Hierarchical dict representing a SMIRNOFF data structure according the the 0.2 spec

    Returns
    -------
    smirnoff_data_0_3
        Hierarchical dict representing a SMIRNOFF data structure according the the 0.3 spec
    """
    # Legacy forcefields sometimes specify the NonbondedForce's sigma_unit value, but then provide
    # atom size as rmin_half. Here we correct for this behavior by explicitly defining both as
    # the same unit if either one is defined.
    if "vdW" in smirnoff_data_0_2["SMIRNOFF"].keys():
        rmh_unit = smirnoff_data_0_2["SMIRNOFF"]["vdW"].get("rmin_half_unit", None)
        sig_unit = smirnoff_data_0_2["SMIRNOFF"]["vdW"].get("sigma_unit", None)
        if (rmh_unit is not None) and (sig_unit is None):
            smirnoff_data_0_2["SMIRNOFF"]["vdW"]["sigma_unit"] = rmh_unit
        elif (sig_unit is not None) and (rmh_unit is None):
            smirnoff_data_0_2["SMIRNOFF"]["vdW"]["rmin_half_unit"] = sig_unit
        # If both are None, or both are defined, don't overwrite anything
        else:
            pass

    # Recursively attach unit strings
    smirnoff_data = recursive_attach_unit_strings(smirnoff_data_0_2, {})

    # Change TorsionHandler potential from "charmm" to "k*(1+cos(periodicity*theta-phase))". Note that, scientifically,
    # we should have used "k*(1+cos(periodicity*theta-phase))" all along, since "charmm" technically
    # implies that we would support a harmonic potential for torsion terms with periodicity 0
    # More at: https://github.com/openforcefield/openforcefield/issues/303#issuecomment-490156779
    if "ProperTorsions" in smirnoff_data["SMIRNOFF"]:
        if "potential" in smirnoff_data["SMIRNOFF"]["ProperTorsions"]:
            if smirnoff_data["SMIRNOFF"]["ProperTorsions"]["potential"] == "charmm":
                smirnoff_data["SMIRNOFF"]["ProperTorsions"][
                    "potential"
                ] = "k*(1+cos(periodicity*theta-phase))"
    if "ImproperTorsions" in smirnoff_data["SMIRNOFF"]:
        if "potential" in smirnoff_data["SMIRNOFF"]["ImproperTorsions"]:
            if smirnoff_data["SMIRNOFF"]["ImproperTorsions"]["potential"] == "charmm":
                smirnoff_data["SMIRNOFF"]["ImproperTorsions"][
                    "potential"
                ] = "k*(1+cos(periodicity*theta-phase))"

    # Add per-section tag
    sections_not_to_version_0_3 = ["Author", "Date", "version", "aromaticity_model"]
    for l1_tag in smirnoff_data["SMIRNOFF"].keys():
        if l1_tag not in sections_not_to_version_0_3:

            if smirnoff_data["SMIRNOFF"][l1_tag] is None:
                # Handle empty entries, such as the ToolkitAM1BCC handler.
                smirnoff_data["SMIRNOFF"][l1_tag] = {}

            smirnoff_data["SMIRNOFF"][l1_tag]["version"] = 0.3

    # Update top-level tag
    smirnoff_data["SMIRNOFF"]["version"] = 0.3

    return smirnoff_data


[docs]def convert_0_1_smirnoff_to_0_2(smirnoff_data_0_1):
    """
    Convert an 0.1-compliant SMIRNOFF dict to an 0.2-compliant one.
    This involves renaming several tags, adding Electrostatics and ToolkitAM1BCC tags, and
    separating improper torsions into their own section.

    Parameters
    ----------
    smirnoff_data_0_1 : dict
        Hierarchical dict representing a SMIRNOFF data structure according the the 0.1 spec

    Returns
    -------
    smirnoff_data_0_2
        Hierarchical dict representing a SMIRNOFF data structure according the the 0.2 spec
    """
    smirnoff_data = smirnoff_data_0_1.copy()

    l0_replacement_dict = {"SMIRFF": "SMIRNOFF"}
    l1_replacement_dict = {
        "HarmonicBondForce": "Bonds",
        "HarmonicAngleForce": "Angles",
        "PeriodicTorsionForce": "ProperTorsions",
        "NonbondedForce": "vdW",
    }
    for old_l0_tag, new_l0_tag in l0_replacement_dict.items():
        # Convert first-level smirnoff_data tags.
        # Right now this just changes the SMIRFF tag to SMIRNOFF
        if old_l0_tag in smirnoff_data.keys():
            smirnoff_data[new_l0_tag] = smirnoff_data[old_l0_tag]
            del smirnoff_data[old_l0_tag]

    # SMIRFF tag will have been converted to SMIRNOFF here
    # Convert second-level tags here
    for old_l1_tag, new_l1_tag in l1_replacement_dict.items():
        if old_l1_tag in smirnoff_data["SMIRNOFF"].keys():
            smirnoff_data["SMIRNOFF"][new_l1_tag] = smirnoff_data["SMIRNOFF"][
                old_l1_tag
            ]
            del smirnoff_data["SMIRNOFF"][old_l1_tag]

    # Add 'potential' field to each l1 tag
    default_potential = {
        "Bonds": "harmonic",
        "Angles": "harmonic",
        "ProperTorsions": "charmm",
        # Note that "charmm" isn't actually correct, and was later changed
        # in the 0.3 spec. More info at
        # https://github.com/openforcefield/openforcefield/pull/311#commitcomment-33494506
        "vdW": "Lennard-Jones-12-6",
    }
    for l1_tag in smirnoff_data["SMIRNOFF"].keys():
        if l1_tag in default_potential.keys():
            # Ensure that it isn't there already (shouldn't happen, but better to be safe)
            if "potential" in smirnoff_data["SMIRNOFF"][l1_tag].keys():
                assert smirnoff_data[l1_tag].keys == default_potential[l1_tag]
                continue
            # Issue an informative warning about assumptions made during conversion.
            logger.warning(
                f"0.1 SMIRNOFF spec file does not contain 'potential' attribute for '{l1_tag}' tag. "
                f"The SMIRNOFF spec converter is assuming it has a value of '{default_potential[l1_tag]}'"
            )
            smirnoff_data["SMIRNOFF"][l1_tag]["potential"] = default_potential[l1_tag]

    # Separate improper torsions from propers
    if "ProperTorsions" in smirnoff_data["SMIRNOFF"]:
        if "Improper" in smirnoff_data["SMIRNOFF"]["ProperTorsions"]:
            # First generate an ImproperTorsions header, taking the relevant values from the ProperTorsions header
            improper_section = {
                "k_unit": smirnoff_data["SMIRNOFF"]["ProperTorsions"]["k_unit"],
                "phase_unit": smirnoff_data["SMIRNOFF"]["ProperTorsions"]["phase_unit"],
                "potential": smirnoff_data["SMIRNOFF"]["ProperTorsions"]["potential"],
                "Improper": smirnoff_data["SMIRNOFF"]["ProperTorsions"]["Improper"],
            }

            # Then, attach the newly-made ImproperTorsions section
            smirnoff_data["SMIRNOFF"]["ImproperTorsions"] = improper_section
            del smirnoff_data["SMIRNOFF"]["ProperTorsions"]["Improper"]

    # Add Electrostatics tag, setting several values to their defaults and
    # warning about assumptions that are being made
    electrostatics_section = {
        "method": "PME",
        "scale12": 0.0,
        "scale13": 0.0,
        "scale15": 1.0,
        "cutoff": 9.0,
        "cutoff_unit": "angstrom",
    }
    logger.warning(
        "0.1 SMIRNOFF spec did not allow the 'Electrostatics' tag. Adding it in 0.2 spec conversion, and "
        "assuming the following values:"
    )
    for key, val in electrostatics_section.items():
        logger.warning(f"\t{key}: {val}")

    # Take electrostatics 1-4 scaling term from 0.1 spec's NonBondedForce tag
    electrostatics_section["scale14"] = smirnoff_data["SMIRNOFF"]["vdW"][
        "coulomb14scale"
    ]
    del smirnoff_data["SMIRNOFF"]["vdW"]["coulomb14scale"]
    smirnoff_data["SMIRNOFF"]["Electrostatics"] = electrostatics_section

    # Change vdW's lj14scale to 14scale, add other scaling terms
    vdw_section_additions = {
        "method": "cutoff",
        "combining_rules": "Lorentz-Berthelot",
        "scale12": "0.0",
        "scale13": "0.0",
        "scale15": "1",
        "switch_width": "1.0",
        "switch_width_unit": "angstrom",
        "cutoff": "9.0",
        "cutoff_unit": "angstrom",
    }
    for key, val in vdw_section_additions.items():
        if not key in smirnoff_data["SMIRNOFF"]["vdW"].keys():
            logger.warning(
                f"0.1 SMIRNOFF spec file does not contain '{key}' attribute for 'NonBondedMethod/vdW'' tag. "
                f"The SMIRNOFF spec converter is assuming it has a value of '{val}'"
            )
            smirnoff_data["SMIRNOFF"]["vdW"][key] = val

    # Rename L-J 1-4 scaling term from 0.1 spec's NonBondedForce tag to vdW's scale14
    smirnoff_data["SMIRNOFF"]["vdW"]["scale14"] = smirnoff_data["SMIRNOFF"]["vdW"][
        "lj14scale"
    ]
    del smirnoff_data["SMIRNOFF"]["vdW"]["lj14scale"]

    # Add <ToolkitAM1BCC/> tag
    smirnoff_data["SMIRNOFF"]["ToolkitAM1BCC"] = {}

    # Update top-level tag
    smirnoff_data["SMIRNOFF"]["version"] = 0.2

    return smirnoff_data


def recursive_attach_unit_strings(smirnoff_data, units_to_attach):
    """
    Recursively traverse a SMIRNOFF data structure, appending "* {unit}" to values in key:value pairs
    where "key_unit":"unit_string" is present at a higher level in the hierarchy.
    This function expects all items in smirnoff_data to be formatted as strings.

    Parameters
    ----------
    smirnoff_data : dict
        Any level of hierarchy that is part of a SMIRNOFF dict, with all data members
        formatted as string.
    units_to_attach : dict
        Dict of the form {key:unit_string}

    Returns
    -------
    unit_appended_smirnoff_data: dict
    """
    import re

    # Make a copy of units_to_attach so we don't modify the original (otherwise things like k_unit could
    # leak between sections)
    units_to_attach = units_to_attach.copy()
    # smirnoff_data = smirnoff_data.copy()

    # If we're working with a dict, see if there are any new unit entries and store them,
    # then operate recursively on the values in the dict.
    if isinstance(smirnoff_data, dict):

        # Go over all key:value pairs once to see if there are new units to attach.
        # Note that units to be attached can be defined in the same dict as the
        # key:value pair they will be attached to, so we need to complete this check
        # before we are able to check other items in the dict.
        for key, value in list(smirnoff_data.items()):
            if key[-5:] == "_unit":
                units_to_attach[key[:-5]] = value
                del smirnoff_data[key]

        # Go through once more to attach units as appropriate
        for key in smirnoff_data.keys():

            # We use regular expressions to catch possible indexed attributes
            attach_unit = None
            for unit_key, unit_string in units_to_attach.items():
                if re.match(f"{unit_key}[0-9]*", key):
                    attach_unit = unit_string

            if attach_unit is not None:
                smirnoff_data[key] = str(smirnoff_data[key]) + " * " + attach_unit

            # And recursively act on value, in case it's a deeper level of hierarchy
            smirnoff_data[key] = recursive_attach_unit_strings(
                smirnoff_data[key], units_to_attach
            )

    # If it's a list, operate on each member of the list
    elif isinstance(smirnoff_data, list):
        for index, value in enumerate(smirnoff_data):
            smirnoff_data[index] = recursive_attach_unit_strings(value, units_to_attach)

    # Otherwise, just return smirnoff_data unchanged
    else:
        pass

    return smirnoff_data


[docs]def get_molecule_parameterIDs(molecules, forcefield):
    """Process a list of molecules with a specified SMIRNOFF ffxml file and determine which parameters are used by
    which molecules, returning collated results.

    Parameters
    ----------
    molecules : list of openforcefield.topology.Molecule
        List of molecules (with explicit hydrogens) to parse
    forcefield : openforcefield.typing.engines.smirnoff.ForceField
        The ForceField to apply

    Returns
    -------
    parameters_by_molecule : dict
        Parameter IDs used in each molecule, keyed by isomeric SMILES
        generated from provided OEMols. Each entry in the dict is a list
        which does not necessarily have unique entries; i.e. parameter IDs
        which are used more than once will occur multiple times.

    parameters_by_ID : dict
        Molecules in which each parameter ID occur, keyed by parameter ID.
        Each entry in the dict is a set of isomeric SMILES for molecules
        in which that parameter occurs. No frequency information is stored.

    """

    from openforcefield.topology import Topology

    # Create storage
    parameters_by_molecule = dict()
    parameters_by_ID = dict()

    # Generate isomeric SMILES for each molecule, ensuring all molecules are unique
    isosmiles = [molecule.to_smiles() for molecule in molecules]
    already_seen = set()
    duplicates = set(
        smiles
        for smiles in isosmiles
        if smiles in already_seen or already_seen.add(smiles)
    )
    if len(duplicates) > 0:
        raise ValueError(
            "Error: get_molecule_parameterIDs has been provided a list of oemols which contains some duplicates: {}".format(
                duplicates
            )
        )

    # Assemble molecules into a Topology
    topology = Topology()
    for molecule in molecules:
        topology.add_molecule(molecule)

    # Label molecules
    labels = forcefield.label_molecules(topology)

    # Organize labels into output dictionary by looping over all molecules/smiles
    for idx in range(len(isosmiles)):
        # Pull smiles, initialize storage
        smi = isosmiles[idx]
        parameters_by_molecule[smi] = []

        # Organize data for this molecule
        data = labels[idx]
        for force_type in data.keys():
            for atom_indices, parameter_type in data[force_type].items():

                pid = parameter_type.id
                # Store pid to molecule
                parameters_by_molecule[smi].append(pid)

                # Store which molecule this pid occurred in
                if pid not in parameters_by_ID:
                    parameters_by_ID[pid] = set()
                    parameters_by_ID[pid].add(smi)
                else:
                    parameters_by_ID[pid].add(smi)

    return parameters_by_molecule, parameters_by_ID


def sort_smirnoff_dict(data):
    """
    Recursively sort the keys in a dict of SMIRNOFF data.

    Adapted from https://stackoverflow.com/a/47882384/4248961

    TODO: Should this live elsewhere?
    """
    sorted_dict = dict()
    for key, val in sorted(data.items()):
        if isinstance(val, dict):
            # This should hit each ParameterHandler and dicts within them
            sorted_dict[key] = sort_smirnoff_dict(val)
        elif isinstance(val, list):
            # Handle case of ParameterLists, which show up in
            # the smirnoff dicts as lists of OrderedDicts
            new_parameter_list = list()
            for param in val:
                new_parameter_list.append(sort_smirnoff_dict(param))
            sorted_dict[key] = new_parameter_list
        else:
            # Handle metadata or the bottom of a recursive dict
            sorted_dict[key] = val
    return sorted_dict