Source code for openff.evaluator.protocols.analysis

"""
A collection of protocols for running analysing the results of molecular simulations.
"""
import abc
import itertools
import typing
from os import path

import numpy as np
from openff.units import unit
from openff.units.openmm import from_openmm

from openff.evaluator.attributes import UNDEFINED
from openff.evaluator.forcefield import ParameterGradient, SmirnoffForceFieldSource
from openff.evaluator.forcefield.system import ParameterizedSystem
from openff.evaluator.thermodynamics import ThermodynamicState
from openff.evaluator.utils import timeseries
from openff.evaluator.utils.observables import (
    Observable,
    ObservableArray,
    ObservableFrame,
    bootstrap,
)
from openff.evaluator.utils.openmm import system_subset
from openff.evaluator.utils.timeseries import (
    TimeSeriesStatistics,
    analyze_time_series,
    get_uncorrelated_indices,
)
from openff.evaluator.workflow import Protocol, workflow_protocol
from openff.evaluator.workflow.attributes import (
    InequalityMergeBehavior,
    InputAttribute,
    OutputAttribute,
)

if typing.TYPE_CHECKING:

    import openmm


E0 = 8.854187817e-12 * unit.farad / unit.meter  # Taken from QCElemental


def compute_dielectric_constant(
    dipole_moments: ObservableArray,
    volumes: ObservableArray,
    temperature: unit.Quantity,
    average_function,
) -> Observable:
    """A function to compute the average dielectric constant from an array of
    dipole moments and an array of volumes, whereby the average values of the
    observables are computed using a custom function.

    Parameters
    ----------
    dipole_moments
        The dipole moments array.
    volumes
        The volume array.
    temperature
        The temperature at which the dipole_moments and volumes were sampled.
    average_function
        The function to use when evaluating the average of an observable.

    Returns
    -------
        The average value of the dielectric constant.
    """

    dipole_moments_sqr = dipole_moments * dipole_moments
    dipole_moments_sqr = ObservableArray(
        value=dipole_moments_sqr.value.sum(axis=1),
        gradients=[
            ParameterGradient(gradient.key, gradient.value.sum(axis=1))
            for gradient in dipole_moments_sqr.gradients
        ],
    )

    avg_sqr_dipole_moments = average_function(observable=dipole_moments_sqr)
    avg_sqr_dipole_moments = ObservableArray(
        avg_sqr_dipole_moments.value, avg_sqr_dipole_moments.gradients
    )

    avg_dipole_moment = average_function(observable=dipole_moments)

    avg_dipole_moment_sqr = avg_dipole_moment * avg_dipole_moment
    avg_dipole_moment_sqr = ObservableArray(
        value=avg_dipole_moment_sqr.value.sum(axis=1),
        gradients=[
            ParameterGradient(gradient.key, gradient.value.sum(axis=1))
            for gradient in avg_dipole_moment_sqr.gradients
        ],
    )

    avg_volume = average_function(observable=volumes)
    avg_volume = ObservableArray(avg_volume.value, avg_volume.gradients)

    dipole_variance = avg_sqr_dipole_moments - avg_dipole_moment_sqr

    prefactor = 1.0 / (3.0 * E0 * unit.boltzmann_constant * temperature)

    dielectric_constant = 1.0 * unit.dimensionless + prefactor * (
        dipole_variance / avg_volume
    )

    return Observable(
        value=dielectric_constant.value.item().to(unit.dimensionless),
        gradients=[
            ParameterGradient(
                gradient.key,
                gradient.value.item().to(
                    unit.dimensionless
                    * gradient.value.units
                    / dielectric_constant.value.units
                ),
            )
            for gradient in dielectric_constant.gradients
        ],
    )


[docs]class BaseAverageObservable(Protocol, abc.ABC): """An abstract base class for protocols which will calculate the average value of an observable and its uncertainty via bootstrapping. """ bootstrap_iterations = InputAttribute( docstring="The number of bootstrap iterations to perform.", type_hint=int, default_value=250, merge_behavior=InequalityMergeBehavior.LargestValue, ) bootstrap_sample_size = InputAttribute( docstring="The relative sample size to use for bootstrapping.", type_hint=float, default_value=1.0, merge_behavior=InequalityMergeBehavior.LargestValue, ) thermodynamic_state = InputAttribute( docstring="The state at which the observables were computed. This is required " "to compute ensemble averages of the gradients of the observable with respect " "to force field parameters.", type_hint=ThermodynamicState, default_value=UNDEFINED, optional=True, ) potential_energies = InputAttribute( docstring="The potential energies which were evaluated at the same " "configurations and using the same force field parameters as the observable to " "average. This is required to compute ensemble averages of the gradients of " "the observable with respect to force field parameters.", type_hint=ObservableArray, default_value=UNDEFINED, optional=True, ) value = OutputAttribute( docstring="The average value of the observable.", type_hint=Observable ) time_series_statistics = OutputAttribute( docstring="Statistics about the observables from which the average was computed." "These include the statistical inefficiency and the index after which the " "observables have become stationary (i.e. equilibrated).", type_hint=TimeSeriesStatistics, ) @abc.abstractmethod def _observables(self) -> typing.Dict[str, ObservableArray]: """A function which should return the observables to pass to the ``_bootstrap_function`` function.""" raise NotImplementedError() def _bootstrap_function(self, **kwargs: ObservableArray) -> Observable: """The function to perform on the data set being sampled by bootstrapping. Parameters ---------- observables The bootstrap sample values. Returns ------- The result of evaluating the data. """ # The simple base function only supports a single observable. assert len(kwargs) == 1 # Compute the mean observable. sample_observable = next(iter(kwargs.values())) mean_observable = np.mean(sample_observable.value, axis=0) if sample_observable.value.shape[1] > 1: mean_observable = mean_observable.reshape(1, -1) else: mean_observable = mean_observable.item() # Retrieve the potential gradients for easy access potential_gradients = { gradient.key: gradient.value for gradient in ( [] if self.potential_energies == UNDEFINED else self.potential_energies.gradients ) } observable_gradients = { gradient.key: gradient for gradient in ( [] if self.potential_energies == UNDEFINED else sample_observable.gradients ) } # Compute the mean gradients. gradients = [] for gradient_key in observable_gradients: gradient = observable_gradients[gradient_key] value = np.mean(gradient.value, axis=0) - self.thermodynamic_state.beta * ( np.mean( sample_observable.value * potential_gradients[gradient.key], axis=0, ) - ( np.mean(sample_observable.value, axis=0) * np.mean(potential_gradients[gradient.key], axis=0) ) ) if sample_observable.value.shape[1] > 1: value = value.reshape(1, -1) else: value = value.item() gradients.append(ParameterGradient(key=gradient.key, value=value)) return_type = ( Observable if sample_observable.value.shape[1] == 1 else ObservableArray ) return return_type(value=mean_observable, gradients=gradients) def _execute(self, directory, available_resources): # Retrieve the list of observables to compute the average using. observables = self._observables() if len(observables) == 0: raise ValueError("There are no observables to average.") expected_length = len(next(iter(observables.values()))) if not any(len(observable) != expected_length for observable in observables): raise ValueError("The observables to average must have the same length.") if ( self.potential_energies != UNDEFINED and self.thermodynamic_state == UNDEFINED ): raise ValueError( "The `thermodynamic_state` must be provided when the " "`potential_energies` input is specified (i.e. when gradients should " "be computed." ) if self.potential_energies != UNDEFINED: potential_gradients = { gradient.key for gradient in self.potential_energies.gradients } if any( {gradient.key for gradient in observable.gradients} != potential_gradients for observable in observables.values() ): raise ValueError( "The potential energies must have been differentiated with respect " "to the same force field parameters as the observables of interest." ) # Find the largest equilibration time and statistical inefficiency for each of # the observables. equilibration_index = -1 statistical_inefficiency = 0.0 for observable in observables.values(): observable_statistics = analyze_time_series(observable.value.magnitude) equilibration_index = max( equilibration_index, observable_statistics.equilibration_index ) statistical_inefficiency = max( statistical_inefficiency, observable_statistics.statistical_inefficiency ) equilibration_index = int(equilibration_index) statistical_inefficiency = float(statistical_inefficiency) uncorrelated_indices = get_uncorrelated_indices( expected_length - equilibration_index, statistical_inefficiency ) self.time_series_statistics = TimeSeriesStatistics( n_total_points=expected_length, n_uncorrelated_points=len(uncorrelated_indices), statistical_inefficiency=statistical_inefficiency, equilibration_index=equilibration_index, ) # Decorrelate the observables. uncorrelated_observables = { key: observable.subset( [index + equilibration_index for index in uncorrelated_indices] ) for key, observable in observables.items() } if self.potential_energies != UNDEFINED: self.potential_energies = self.potential_energies.subset( [index + equilibration_index for index in uncorrelated_indices] ) self.value = bootstrap( self._bootstrap_function, self.bootstrap_iterations, self.bootstrap_sample_size, **uncorrelated_observables, )
[docs]@workflow_protocol() class AverageObservable(BaseAverageObservable): """Computes the average value of an observable as well as bootstrapped uncertainties for the average. """ observable = InputAttribute( docstring="The file path to the observable which should be averaged.", type_hint=ObservableArray, default_value=UNDEFINED, ) divisor = InputAttribute( docstring="A value to divide the statistic by. This is useful if a statistic " "(such as enthalpy) needs to be normalised by the number of molecules.", type_hint=typing.Union[int, float, unit.Quantity], default_value=1.0, ) def _observables(self) -> typing.Dict[str, ObservableArray]: return {"observable": self.observable / self.divisor}
[docs]@workflow_protocol() class AverageDielectricConstant(BaseAverageObservable): """Computes the average value of the dielectric constant from a set of dipole moments (M) and volumes (V) sampled over the course of a molecular simulation such that ``eps = 1 + (<M^2> - <M>^2) / (3.0 * eps_0 * <V> * kb * T)`` [1]_. References ---------- [1] A. Glattli, X. Daura and W. F. van Gunsteren. Derivation of an improved simple point charge model for liquid water: SPC/A and SPC/L. J. Chem. Phys. 116(22): 9811-9828, 2002 """ dipole_moments = InputAttribute( docstring="The dipole moments of each sampled configuration.", type_hint=ObservableArray, default_value=UNDEFINED, ) volumes = InputAttribute( docstring="The volume of each sampled configuration.", type_hint=ObservableArray, default_value=UNDEFINED, ) def _observables(self): return {"dipole_moments": self.dipole_moments, "volumes": self.volumes} def _bootstrap_function(self, **kwargs: ObservableArray): return compute_dielectric_constant( kwargs.pop("dipole_moments"), kwargs.pop("volumes"), self.thermodynamic_state.temperature, super(AverageDielectricConstant, self)._bootstrap_function, )
[docs]@workflow_protocol() class AverageFreeEnergies(Protocol): """A protocol which computes the Boltzmann weighted average (ΔG° = -RT × Log[ Σ_{n} exp(-βΔG°_{n}) ]) of a set of free energies which were measured at the same thermodynamic state. Confidence intervals are computed by bootstrapping with replacement. """ values: typing.List[Observable] = InputAttribute( docstring="The values to add together.", type_hint=list, default_value=UNDEFINED ) thermodynamic_state = InputAttribute( docstring="The thermodynamic state at which the free energies were measured.", type_hint=ThermodynamicState, default_value=UNDEFINED, ) bootstrap_cycles = InputAttribute( docstring="The number of bootstrap cycles to perform when estimating " "the uncertainty in the combined free energies.", type_hint=int, default_value=2000, ) result = OutputAttribute(docstring="The sum of the values.", type_hint=Observable) confidence_intervals = OutputAttribute( docstring="The 95% confidence intervals on the average free energy.", type_hint=unit.Quantity, ) def _execute(self, directory, available_resources): from scipy.special import logsumexp default_unit = unit.kilocalorie / unit.mole boltzmann_factor = ( self.thermodynamic_state.temperature * unit.molar_gas_constant ) boltzmann_factor.ito(default_unit) beta = 1.0 / boltzmann_factor values = [ (-beta * value.value.to(default_unit)).to(unit.dimensionless).magnitude for value in self.values ] # Compute the mean. mean = logsumexp(values) # Compute the gradients of the mean. value_gradients = [ {gradient.key: -beta * gradient.value for gradient in value.gradients} for value in self.values ] value_gradients_by_key = { gradient_key: [ gradients_by_key[gradient_key] for gradients_by_key in value_gradients ] for gradient_key in value_gradients[0] } mean_gradients = [] for gradient_key, gradient_values in value_gradients_by_key.items(): expected_unit = value_gradients[0][gradient_key].units d_log_mean_numerator, d_mean_numerator_sign = logsumexp( values, b=[x.to(expected_unit).magnitude for x in gradient_values], return_sign=True, ) d_mean_numerator = d_mean_numerator_sign * np.exp(d_log_mean_numerator) d_mean_d_theta = d_mean_numerator / np.exp(mean) mean_gradients.append( ParameterGradient( key=gradient_key, value=-boltzmann_factor * d_mean_d_theta * expected_unit, ) ) # Compute the standard error and 95% CI cycle_result = np.empty(self.bootstrap_cycles) for cycle_index, cycle in enumerate(range(self.bootstrap_cycles)): cycle_values = np.empty(len(self.values)) for value_index, value in enumerate(self.values): cycle_mean = value.value.to(default_unit).magnitude cycle_sem = value.error.to(default_unit).magnitude sampled_value = np.random.normal(cycle_mean, cycle_sem) * default_unit cycle_values[value_index] = ( (-beta * sampled_value).to(unit.dimensionless).magnitude ) # ΔG° = -RT × Log[ Σ_{n} exp(-βΔG°_{n}) ] cycle_result[cycle_index] = logsumexp(cycle_values) mean = -boltzmann_factor * mean sem = np.std(-boltzmann_factor * cycle_result) confidence_intervals = np.empty(2) sorted_statistics = np.sort(cycle_result) confidence_intervals[0] = sorted_statistics[int(0.025 * self.bootstrap_cycles)] confidence_intervals[1] = sorted_statistics[int(0.975 * self.bootstrap_cycles)] confidence_intervals = -boltzmann_factor * confidence_intervals self.result = Observable(value=mean.plus_minus(sem), gradients=mean_gradients) self.confidence_intervals = confidence_intervals
[docs] def validate(self, attribute_type=None): super(AverageFreeEnergies, self).validate(attribute_type) assert all(isinstance(x, Observable) for x in self.values) if len(self.values) == 0: return expected_gradients = {gradient.key for gradient in self.values[0].gradients} if not all( expected_gradients == {gradient.key for gradient in value.gradients} for value in self.values ): raise ValueError( "The values must contain gradient information for the same set of " "force field parameters." )
[docs]@workflow_protocol() class ComputeDipoleMoments(Protocol): """A protocol which will compute the dipole moment for each configuration in a trajectory and for a given parameterized system.""" parameterized_system = InputAttribute( docstring="The parameterized system which encodes the charge on each atom " "in the system.", type_hint=ParameterizedSystem, default_value=UNDEFINED, ) trajectory_path = InputAttribute( docstring="The file path to the trajectory of configurations.", type_hint=str, default_value=UNDEFINED, ) gradient_parameters = InputAttribute( docstring="An optional list of parameters to differentiate the dipole moments " "with respect to.", type_hint=list, default_value=lambda: list(), ) dipole_moments = OutputAttribute( docstring="The computed dipole moments.", type_hint=ObservableArray ) @classmethod def _extract_charges( cls, system: "openmm.System" ) -> typing.Optional[unit.Quantity]: """Retrieve the charge on each atom from an OpenMM system object. Parameters ---------- The system object containing the charges. Returns ------- The charge on each atom in the system if any are present, otherwise none. """ import openmm from openmm import unit as openmm_unit forces = [ system.getForce(force_index) for force_index in range(system.getNumForces()) if isinstance(system.getForce(force_index), openmm.NonbondedForce) ] if len(forces) > 1: raise ValueError( f"The system must contain no more than one non-bonded force, however " f"{len(forces)} were found." ) if len(forces) == 0: return None charges = np.array( [ forces[0] .getParticleParameters(atom_index)[0] .value_in_unit(openmm_unit.elementary_charge) for atom_index in range(forces[0].getNumParticles()) ] ) return charges * unit.elementary_charge def _compute_charge_derivatives(self, n_atoms: int): d_charge_d_theta = {key: np.zeros(n_atoms) for key in self.gradient_parameters} if len(self.gradient_parameters) > 0 and not isinstance( self.parameterized_system.force_field, SmirnoffForceFieldSource ): raise ValueError( "Derivates can only be computed for systems parameterized with " "SMIRNOFF force fields." ) force_field = self.parameterized_system.force_field.to_force_field() topology = self.parameterized_system.topology for key in self.gradient_parameters: reverse_system, reverse_value = system_subset(key, force_field, topology) forward_system, forward_value = system_subset( key, force_field, topology, 0.1 ) reverse_value = from_openmm(reverse_value) forward_value = from_openmm(forward_value) reverse_charges = self._extract_charges(reverse_system) forward_charges = self._extract_charges(forward_system) if reverse_charges is None and forward_charges is None: d_charge_d_theta[key] /= forward_value.units else: d_charge_d_theta[key] = (forward_charges - reverse_charges) / ( forward_value - reverse_value ) return d_charge_d_theta def _execute(self, directory, available_resources): import mdtraj charges = self._extract_charges(self.parameterized_system.system) charge_derivatives = self._compute_charge_derivatives(len(charges)) dipole_moments = [] dipole_gradients = {key: [] for key in self.gradient_parameters} for chunk in mdtraj.iterload( self.trajectory_path, top=self.parameterized_system.topology_path, chunk=50 ): xyz = chunk.xyz.transpose(0, 2, 1) * unit.nanometers dipole_moments.extend(xyz.dot(charges)) for key in self.gradient_parameters: dipole_gradients[key].extend(xyz.dot(charge_derivatives[key])) self.dipole_moments = ObservableArray( value=np.vstack(dipole_moments), gradients=[ ParameterGradient(key=key, value=np.vstack(dipole_gradients[key])) for key in self.gradient_parameters ], )
[docs]class BaseDecorrelateProtocol(Protocol, abc.ABC): """An abstract base class for protocols which will subsample a set of data, yielding only equilibrated, uncorrelated data. """ time_series_statistics: typing.Union[ TimeSeriesStatistics, typing.List[TimeSeriesStatistics] ] = InputAttribute( docstring="Statistics about the data to decorrelate. This should include the " "statistical inefficiency and the index after which the observables have " "become stationary (i.e. equilibrated). If a list of such statistics are " "provided it will be assumed that multiple time series which have been " "joined together are being decorrelated and hence will each be decorrelated " "separately.", type_hint=typing.Union[list, TimeSeriesStatistics], default_value=UNDEFINED, ) def _n_expected(self) -> int: """Returns the expected number of samples to decorrelate.""" time_series_statistics = self.time_series_statistics if isinstance(time_series_statistics, TimeSeriesStatistics): time_series_statistics = [time_series_statistics] return sum(statistics.n_total_points for statistics in time_series_statistics) def _uncorrelated_indices(self) -> typing.List[int]: """Returns the indices of the time series being decorrelated to retain.""" time_series_statistics = self.time_series_statistics if isinstance(time_series_statistics, TimeSeriesStatistics): time_series_statistics = [time_series_statistics] n_cumulative = [ 0, *itertools.accumulate( [statistics.n_total_points for statistics in time_series_statistics] ), ] uncorrelated_indices = [ n_cumulative[statistics_index] + index + statistics.equilibration_index for statistics_index, statistics in enumerate(time_series_statistics) for index in timeseries.get_uncorrelated_indices( statistics.n_total_points - statistics.equilibration_index, statistics.statistical_inefficiency, ) ] assert len(uncorrelated_indices) == sum( statistics.n_uncorrelated_points for statistics in time_series_statistics ) return uncorrelated_indices
[docs]@workflow_protocol() class DecorrelateTrajectory(BaseDecorrelateProtocol): """A protocol which will subsample frames from a trajectory, yielding only uncorrelated frames as determined from a provided statistical inefficiency and equilibration time. """ input_coordinate_file = InputAttribute( docstring="The file path to the starting coordinates of a trajectory.", type_hint=str, default_value=UNDEFINED, ) input_trajectory_path = InputAttribute( docstring="The file path to the trajectory to subsample.", type_hint=str, default_value=UNDEFINED, ) output_trajectory_path = OutputAttribute( docstring="The file path to the subsampled trajectory.", type_hint=str ) @staticmethod def _yield_frame(file, topology, stride): """A generator which yields frames of a DCD trajectory. Parameters ---------- file: mdtraj.DCDTrajectoryFile The file object being used to read the trajectory. topology: mdtraj.Topology The object which describes the topology of the trajectory. stride Only read every stride-th frame. Returns ------- mdtraj.Trajectory A trajectory containing only a single frame. """ while True: frame = file.read_as_traj(topology, n_frames=1, stride=stride) if len(frame) == 0: return yield frame def _execute(self, directory, available_resources): import mdtraj from mdtraj.formats.dcd import DCDTrajectoryFile from mdtraj.utils import in_units_of # Set the output path. self.output_trajectory_path = path.join( directory, "uncorrelated_trajectory.dcd" ) # Load in the trajectories topology. topology = mdtraj.load_frame(self.input_coordinate_file, 0).topology # Parse the internal mdtraj distance unit. While private access is undesirable, # this is never publicly defined and I believe this route to be preferable # over hard coding this unit. # noinspection PyProtectedMember base_distance_unit = mdtraj.Trajectory._distance_unit # Determine the frames to retrain uncorrelated_indices = {*self._uncorrelated_indices()} frame_count = 0 with DCDTrajectoryFile(self.input_trajectory_path, "r") as input_file: with DCDTrajectoryFile(self.output_trajectory_path, "w") as output_file: for frame in self._yield_frame(input_file, topology, 1): if frame_count in uncorrelated_indices: output_file.write( xyz=in_units_of( frame.xyz, base_distance_unit, output_file.distance_unit ), cell_lengths=in_units_of( frame.unitcell_lengths, base_distance_unit, output_file.distance_unit, ), cell_angles=frame.unitcell_angles[0], ) frame_count += 1 assert frame_count == self._n_expected()
[docs]@workflow_protocol() class DecorrelateObservables(BaseDecorrelateProtocol): """A protocol which will subsample a trajectory of observables, yielding only uncorrelated entries as determined from a provided statistical inefficiency and equilibration time. """ input_observables = InputAttribute( docstring="The observables to decorrelate.", type_hint=typing.Union[ObservableArray, ObservableFrame], default_value=UNDEFINED, ) output_observables = OutputAttribute( docstring="The decorrelated observables.", type_hint=typing.Union[ObservableArray, ObservableFrame], ) def _execute(self, directory, available_resources): assert len(self.input_observables) == self._n_expected() uncorrelated_indices = self._uncorrelated_indices() uncorrelated_observable = self.input_observables.subset(uncorrelated_indices) self.output_observables = uncorrelated_observable