Source code for openff.evaluator.workflow.protocols

"""
A collection of specialized workflow building blocks, which when chained together,
form a larger property estimation workflow.
"""
import abc
import copy
import json
import logging
import os
import time
from collections import defaultdict

from openff.evaluator.attributes import Attribute, AttributeClass, PlaceholderValue
from openff.evaluator.backends import ComputeResources
from openff.evaluator.utils import graph
from openff.evaluator.utils.exceptions import EvaluatorException
from openff.evaluator.utils.serialization import TypedJSONDecoder, TypedJSONEncoder
from openff.evaluator.utils.string import extract_variable_index_and_name
from openff.evaluator.utils.utils import get_nested_attribute, set_nested_attribute
from openff.evaluator.workflow import (
    ProtocolGroupSchema,
    ProtocolSchema,
    WorkflowException,
    registered_workflow_protocols,
    workflow_protocol,
)
from openff.evaluator.workflow.attributes import (
    InequalityMergeBehaviour,
    InputAttribute,
    MergeBehaviour,
    OutputAttribute,
)
from openff.evaluator.workflow.utils import ProtocolPath

logger = logging.getLogger(__name__)


class ProtocolMeta(abc.ABCMeta):
    def __init__(cls, name, bases, dct):

        super().__init__(name, bases, dct)

        cls._input_attributes = [
            ProtocolPath(x) for x in cls.get_attributes(InputAttribute)
        ]
        cls._output_attributes = [
            ProtocolPath(x) for x in cls.get_attributes(OutputAttribute)
        ]


[docs]class Protocol(AttributeClass, abc.ABC, metaclass=ProtocolMeta):
    """The base class for a protocol which would form one
    step of a larger property calculation workflow.

    A protocol may for example:

        * create the coordinates of a mixed simulation box
        * set up a bound ligand-protein system
        * build the simulation topology
        * perform an energy minimisation

    An individual protocol may require a set of inputs, which may either be
    set as constants

    >>> from openff.evaluator.protocols.openmm import OpenMMSimulation
    >>>
    >>> npt_equilibration = OpenMMSimulation('npt_equilibration')
    >>> npt_equilibration.ensemble = OpenMMSimulation.Ensemble.NPT

    or from the output of another protocol, pointed to by a ProtocolPath

    >>> npt_production = OpenMMSimulation('npt_production')
    >>> # Use the coordinate file output by the npt_equilibration protocol
    >>> # as the input to the npt_production protocol
    >>> npt_production.input_coordinate_file = ProtocolPath('output_coordinate_file',
    >>>                                                     npt_equilibration.id)

    In this way protocols may be chained together, thus defining a larger property
    calculation workflow from simple, reusable building blocks.
    """

    id = Attribute(docstring="The unique id of this protocol.", type_hint=str)
    allow_merging = InputAttribute(
        docstring="Defines whether this protocols is allowed "
        "to merge with other protocols.",
        type_hint=bool,
        default_value=True,
    )

    @property
    def schema(self):
        """ProtocolSchema: A serializable schema for this object."""
        return self._get_schema()

    @schema.setter
    def schema(self, schema_value):
        self._set_schema(schema_value)

    @property
    def required_inputs(self):
        """list of ProtocolPath: The inputs which must be set on this protocol."""
        return [x.copy() for x in self._input_attributes]

    @property
    def outputs(self):
        """dict of ProtocolPath and Any: A dictionary of the outputs of this property."""

        outputs = {}

        for output_attribute in self._output_attributes:
            outputs[output_attribute] = getattr(self, output_attribute.property_name)

        return outputs

    @property
    def dependencies(self):
        """list of ProtocolPath: A list of pointers to the protocols which this
        protocol takes input from.
        """
        return_dependencies = []

        for input_path in self.required_inputs:

            value_references = self.get_value_references(input_path)

            if len(value_references) == 0:
                continue

            for value_reference in value_references.values():

                if value_reference in return_dependencies:
                    continue

                if (
                    value_reference.start_protocol is None
                    or value_reference.start_protocol == self.id
                ):

                    continue

                return_dependencies.append(value_reference)

        return return_dependencies

[docs]    def __init__(self, protocol_id):
        self.id = protocol_id

    def _get_schema(self, schema_type=ProtocolSchema, *args):
        """Returns the schema representation of this protocol.

        Parameters
        ----------
        schema_type: type of ProtocolSchema
            The type of schema to create.

        Returns
        -------
        schema_type
            The schema representation.
        """

        inputs = {}

        for input_path in self.required_inputs:

            if (
                len(input_path.protocol_path) > 0
                and input_path.protocol_path != self.id
            ):
                continue

            # Always make sure to only pass a copy of the input.
            # Changing the schema should NOT change the protocol.
            inputs[input_path.full_path] = copy.deepcopy(self.get_value(input_path))

        schema = schema_type(self.id, self.__class__.__name__, inputs, *args)
        return schema

    def _set_schema(self, schema):
        """Sets this protocols properties from a `ProtocolSchema`

        Parameters
        ----------
        schema: ProtocolSchema
            The schema to set.
        """

        # Make sure this protocol matches the schema type.
        if self.__class__.__name__ != schema.type:

            raise ValueError(
                f"The schema type {schema.type} does not match this protocol."
            )

        self.id = schema.id

        for input_full_path in schema.inputs:

            value = copy.deepcopy(schema.inputs[input_full_path])

            input_path = ProtocolPath.from_string(input_full_path)
            self.set_value(input_path, value)

[docs]    @classmethod
    def from_schema(cls, schema):
        """Initializes a protocol from it's schema definition.

        Parameters
        ----------
        schema: ProtocolSchema
            The schema to initialize the protocol using.

        Returns
        -------
        cls
            The initialized protocol.
        """
        protocol = registered_workflow_protocols[schema.type](schema.id)
        protocol.schema = schema
        return protocol

[docs]    def set_uuid(self, value):
        """Prepend a unique identifier to this protocols id. If the id
        already has a prepended uuid, it will be overwritten by this value.

        Parameters
        ----------
        value : str
            The uuid to prepend.
        """

        if len(value) == 0:
            return

        id_with_uuid = graph.append_uuid(self.id, value)

        if self.id == id_with_uuid:
            return

        self.id = graph.append_uuid(self.id, value)

        for input_path in self.required_inputs:

            input_path.append_uuid(value)

            value_references = self.get_value_references(input_path)

            for key, value_reference in value_references.items():
                value_reference.append_uuid(value)
                self.set_value(key, value_reference)

[docs]    def replace_protocol(self, old_id, new_id):
        """Finds each input which came from a given protocol
         and redirects it to instead take input from a new one.

        Notes
        -----
        This method is mainly intended to be used only when merging
        multiple protocols into one.

        Parameters
        ----------
        old_id : str
            The id of the old input protocol.
        new_id : str
            The id of the new input protocol.
        """

        for input_path in self.required_inputs:

            input_path.replace_protocol(old_id, new_id)

            if input_path.start_protocol is not None or (
                input_path.start_protocol != input_path.last_protocol
                and input_path.start_protocol != self.id
            ):
                continue

            value_references = self.get_value_references(input_path)

            for key, value_reference in value_references.items():
                value_reference.replace_protocol(old_id, new_id)
                self.set_value(key, value_reference)

        if self.id == old_id:
            self.id = new_id

    def _find_inputs_to_merge(self):
        """Returns a list of those inputs which should
        be considered when attempting to merge two different
        protocols of the same type.

        Returns
        -------
        set of ProtocolPath
            References to those inputs which should be
            considered.
        """
        inputs_to_consider = set()

        for input_path in self.required_inputs:

            # Do not consider paths that point to child (e.g grouped) protocols.
            # These should be handled by the container classes themselves.
            if (
                input_path.start_protocol is not None
                and input_path.start_protocol != self.id
            ):
                continue

            if not (
                input_path.start_protocol is None
                or (
                    input_path.start_protocol == input_path.last_protocol
                    and input_path.start_protocol == self.id
                )
            ):

                continue

            # If no merge behaviour flag is present (for example in the case of
            # ConditionalGroup conditions), simply assume this is handled explicitly
            # elsewhere.
            if not hasattr(
                getattr(type(self), input_path.property_name), "merge_behavior"
            ):
                continue

            inputs_to_consider.add(input_path)

        return inputs_to_consider

[docs]    def can_merge(self, other, path_replacements=None):
        """Determines whether this protocol can be merged with another.

        Parameters
        ----------
        other : :obj:`Protocol`
            The protocol to compare against.
        path_replacements: list of tuple of str, optional
            Replacements to make in any value reference protocol paths
            before comparing for equality.

        Returns
        ----------
        bool
            True if the two protocols are safe to merge.
        """

        if not self.allow_merging or not isinstance(self, type(other)):
            return False

        if path_replacements is None:
            path_replacements = []

        inputs_to_consider = self._find_inputs_to_merge()

        for input_path in inputs_to_consider:

            # Do a quick sanity check that the other protocol
            # does in fact also require this input.
            if input_path not in other.required_inputs:
                return False

            merge_behavior = getattr(
                type(self), input_path.property_name
            ).merge_behavior

            self_value = self.get_value(input_path)
            other_value = other.get_value(input_path)

            if (
                isinstance(self_value, PlaceholderValue)
                and not isinstance(other_value, PlaceholderValue)
            ) or (
                isinstance(other_value, PlaceholderValue)
                and not isinstance(self_value, PlaceholderValue)
            ):

                # We cannot safely merge inputs when only one of the values
                # is currently known.
                return False

            if isinstance(self_value, ProtocolPath) and isinstance(
                other_value, ProtocolPath
            ):

                other_value_post_merge = ProtocolPath.from_string(other_value.full_path)

                for original_id, new_id in path_replacements:
                    other_value_post_merge.replace_protocol(original_id, new_id)

                # We cannot safely choose which value to take when the
                # values are not know ahead of time unless the two values
                # come from the exact same source.
                if self_value.protocol_path != other_value_post_merge.protocol_path:
                    return False

            elif isinstance(self_value, PlaceholderValue) and isinstance(
                other_value, PlaceholderValue
            ):
                return False

            elif (
                merge_behavior == MergeBehaviour.ExactlyEqual
                and self_value != other_value
            ):
                return False

        return True

[docs]    def merge(self, other):
        """Merges another Protocol with this one. The id
        of this protocol will remain unchanged.

        Parameters
        ----------
        other: Protocol
            The protocol to merge into this one.

        Returns
        -------
        Dict[str, str]
            A map between any original protocol ids and their new merged values.
        """

        if not self.can_merge(other):
            raise ValueError("These protocols cannot be safely merged.")

        inputs_to_consider = self._find_inputs_to_merge()

        for input_path in inputs_to_consider:

            merge_behavior = getattr(
                type(self), input_path.property_name
            ).merge_behavior

            if (
                merge_behavior == MergeBehaviour.ExactlyEqual
                or merge_behavior == MergeBehaviour.Custom
            ):
                continue

            if isinstance(self.get_value(input_path), ProtocolPath) or isinstance(
                other.get_value(input_path), ProtocolPath
            ):

                continue

            if merge_behavior == InequalityMergeBehaviour.SmallestValue:
                value = min(self.get_value(input_path), other.get_value(input_path))
            elif merge_behavior == InequalityMergeBehaviour.LargestValue:
                value = max(self.get_value(input_path), other.get_value(input_path))
            else:
                raise NotImplementedError()

            self.set_value(input_path, value)

        return {}

[docs]    def get_value_references(self, input_path):
        """Returns a dictionary of references to the protocols which one of this
        protocols inputs (specified by `input_path`) takes its value from.

        Notes
        -----
        Currently this method only functions correctly for an input value which
        is either currently a :obj:`ProtocolPath`, or a `list` / `dict` which contains
        at least one :obj:`ProtocolPath`.

        Parameters
        ----------
        input_path: ProtocolPath
            The input value to check.

        Returns
        -------
        dict of ProtocolPath and ProtocolPath
            A dictionary of the protocol paths that the input targeted by `input_path` depends upon.
        """
        input_value = self.get_value(input_path)

        if isinstance(input_value, ProtocolPath):
            return {input_path: input_value}

        if (
            not isinstance(input_value, list)
            and not isinstance(input_value, tuple)
            and not isinstance(input_value, dict)
        ):

            return {}

        return_paths = {}

        if isinstance(input_value, list) or isinstance(input_value, tuple):

            for index, list_value in enumerate(input_value):

                if not isinstance(list_value, ProtocolPath):
                    continue

                path_index = ProtocolPath(
                    input_path.property_name + f"[{index}]", *input_path.protocol_ids
                )
                return_paths[path_index] = list_value

        else:

            for dict_key in input_value:

                if not isinstance(input_value[dict_key], ProtocolPath):
                    continue

                path_index = ProtocolPath(
                    input_path.property_name + f"[{dict_key}]", *input_path.protocol_ids
                )
                return_paths[path_index] = input_value[dict_key]

        return return_paths

[docs]    def get_class_attribute(self, reference_path):
        """Returns one of this protocols, or any of its children's,
        attributes directly (rather than its value).

        Parameters
        ----------
        reference_path: ProtocolPath
            The path pointing to the attribute to return.

        Returns
        ----------
        object:
            The class attribute.
        """

        if (
            reference_path.start_protocol is not None
            and reference_path.start_protocol != self.id
        ):
            raise ValueError(
                "The reference path {} does not point to this protocol".format(
                    reference_path
                )
            )

        if (
            reference_path.property_name.count(ProtocolPath.property_separator) >= 1
            or reference_path.property_name.find("[") > 0
        ):

            raise ValueError(
                "The expected attribute cannot be found for "
                "nested property names: {}".format(reference_path.property_name)
            )

        return getattr(type(self), reference_path.property_name)

[docs]    def get_value(self, reference_path):
        """Returns the value of one of this protocols inputs / outputs.

        Parameters
        ----------
        reference_path: ProtocolPath
            The path pointing to the value to return.

        Returns
        ----------
        Any:
            The value of the input / output
        """

        if (
            reference_path.start_protocol is not None
            and reference_path.start_protocol != self.id
        ):

            raise ValueError("The reference path does not target this protocol.")

        if reference_path.property_name is None or reference_path.property_name == "":
            raise ValueError("The reference path does specify a property to return.")

        return get_nested_attribute(self, reference_path.property_name)

[docs]    def set_value(self, reference_path, value):
        """Sets the value of one of this protocols inputs.

        Parameters
        ----------
        reference_path: ProtocolPath
            The path pointing to the value to return.
        value: Any
            The value to set.
        """

        if (
            reference_path.start_protocol is not None
            and reference_path.start_protocol != self.id
        ):

            raise ValueError("The reference path does not target this protocol.")

        if reference_path.property_name is None or reference_path.property_name == "":
            raise ValueError("The reference path does specify a property to set.")

        set_nested_attribute(self, reference_path.property_name, value)

[docs]    def apply_replicator(
        self,
        replicator,
        template_values,
        template_index=-1,
        template_value=None,
        update_input_references=False,
    ):
        """Applies a `ProtocolReplicator` to this protocol. This method
        should clone any protocols whose id contains the id of the
        replicator (in the format `$(replicator.id)`).

        Parameters
        ----------
        replicator: ProtocolReplicator
            The replicator to apply.
        template_values: list of Any
            A list of the values which will be inserted
            into the newly replicated protocols.

            This parameter is mutually exclusive with
            `template_index` and `template_value`
        template_index: int, optional
            A specific value which should be used for any
            protocols flagged as to be replicated by the
            replicator. This option is mainly used when
            replicating children of an already replicated
            protocol.

            This parameter is mutually exclusive with
            `template_values` and must be set along with
            a `template_value`.
        template_value: Any, optional
            A specific index which should be used for any
            protocols flagged as to be replicated by the
            replicator. This option is mainly used when
            replicating children of an already replicated
            protocol.

            This parameter is mutually exclusive with
            `template_values` and must be set along with
            a `template_index`.
        update_input_references: bool
            If true, any protocols which take their input from a protocol
            which was flagged for replication will be updated to take input
            from the actually replicated protocol. This should only be set
            to true if this protocol is not nested within a workflow or a
            protocol group.

            This option cannot be used when a specific `template_index` or
            `template_value` is providied.

        Returns
        -------
        dict of ProtocolPath and list of tuple of ProtocolPath and int
            A dictionary of references to all of the protocols which have
            been replicated, with keys of original protocol ids. Each value
            is comprised of a list of the replicated protocol ids, and their
            index into the `template_values` array.
        """
        return {}

    @abc.abstractmethod
    def _execute(self, directory, available_resources):
        """The implementation of the public facing `execute`
        method.

        This method will be called by `execute` after all inputs
        have been validated.

        Parameters
        ----------
        directory: str
            The directory to store output data in.
        available_resources: ComputeResources
            The resources available to execute on.
        """

[docs]    def execute(self, directory="", available_resources=None):
        """Execute the protocol.

        Parameters
        ----------
        directory: str
            The directory to store output data in.
        available_resources: ComputeResources
            The resources available to execute on. If `None`, the protocol
            will be executed on a single CPU.
        """
        if len(directory) > 0 and not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)

        if available_resources is None:
            available_resources = ComputeResources(number_of_threads=1)

        self.validate(InputAttribute)
        self._execute(directory, available_resources)


[docs]class ProtocolGraph:
    """A graph of connected protocols which may be
    executed together.
    """

    @property
    def protocols(self):
        """dict of str and Protocol: The protocols in this graph."""
        return self._protocols_by_id

    @property
    def root_protocols(self):
        """list of str: The ids of the protocols in the group which do not
        take input from the other grouped protocols."""
        return self._root_protocols

[docs]    def __init__(self):

        self._protocols_by_id = {}
        self._root_protocols = []

    def _build_dependants_graph(
        self, protocols, allow_external_dependencies, apply_reduction=False
    ):
        """Builds a dictionary of key value pairs where each key
        is the id of a protocol in the graph and each value is a
        list ids of protocols which depend on this protocol.

        Parameters
        ----------
        dict of str and Protocol
            The protocols in the graph.
        allow_external_dependencies: bool
            If `False`, an exception will be raised if a protocol
            has a dependency outside of this graph.
        apply_reduction: bool
            Whether or not to apply transitive reduction to the
            graph.
        """
        internal_protocol_ids = {*protocols, *self._protocols_by_id}

        dependants_graph = defaultdict(set)

        for protocol_id in protocols:
            dependants_graph[protocol_id] = set()

        for protocol in protocols.values():

            for dependency in protocol.dependencies:

                # Check for external dependencies.
                if dependency.start_protocol not in internal_protocol_ids:

                    if allow_external_dependencies:
                        continue
                    else:

                        raise ValueError(
                            f"The {dependency.start_protocol} dependency "
                            f"is outside of this graph."
                        )

                # Skip global or self dependencies.
                if dependency.is_global or dependency.start_protocol == protocol.id:
                    continue

                # Add the dependency
                dependants_graph[dependency.start_protocol].add(protocol.id)

        dependants_graph = {key: list(value) for key, value in dependants_graph.items()}

        if not graph.is_acyclic(dependants_graph):
            raise ValueError("The protocols in this graph have cyclical dependencies.")

        if apply_reduction:
            # Remove any redundant connections from the graph.
            graph.apply_transitive_reduction(dependants_graph)

        return dependants_graph

    def _add_protocol(
        self,
        protocol_id,
        protocols_to_add,
        dependant_ids,
        parent_protocol_ids,
        full_dependants_graph,
    ):
        """Adds a protocol into the graph.

        Parameters
        ----------
        protocol_id : str
            The id of the protocol to insert.
        protocols_to_add: dict of str and Protocol
            A dictionary of all of the protocols currently being
            added to the graph.
        dependant_ids: list of str
            The ids of the protocols which depend on the output of the
            protocol to be inserted.
        parent_protocol_ids : `list` of str
            The ids of the parents of the node to be inserted. If None,
            the protocol will be added as a new root node.
        full_dependants_graph: dict of str and list of str
            The current dependants graph of the entire workflow
            graph. This will be used to find the child protocols
            of existing protocols in the graph.

        Returns
        -------
        str
            The id of the protocol which was inserted. This may not be
            the same as `protocol_id` if the protocol to insert was merged
            with an existing one.
        dict of str and str
            A mapping between all current protocol ids, and the new ids of
            protocols after the protocol has been inserted due to protocol
            merging.
        """

        # Build a list of protocols which have the same ancestors
        # as the protocols to insert. This will be used to check
        # if we are trying to add a redundant protocol to the graph.
        existing_protocols = (
            self._root_protocols if len(parent_protocol_ids) == 0 else []
        )

        for parent_protocol_id in parent_protocol_ids:

            existing_protocols.extend(
                x
                for x in full_dependants_graph[parent_protocol_id]
                if x not in existing_protocols
            )

        # Don't merge protocols from the same workflow / batch
        existing_protocols = [
            x
            for x in existing_protocols
            if x not in protocols_to_add
            or graph.retrieve_uuid(x) != graph.retrieve_uuid(protocol_id)
        ]

        protocol_to_insert = protocols_to_add[protocol_id]
        existing_protocol = None

        # Start by checking to see if the starting protocol of the workflow graph is
        # already present in the full graph.
        for existing_id in existing_protocols:

            protocol = self._protocols_by_id[existing_id]

            if not protocol.can_merge(protocol_to_insert):
                continue

            existing_protocol = protocol
            break

        # Store a mapping between original and merged protocols.
        merged_ids = {}

        if existing_protocol is not None:

            # Make a note that the existing protocol should be used in place
            # of this workflows version.
            protocols_to_add[protocol_id] = existing_protocol

            merged_ids = existing_protocol.merge(protocol_to_insert)
            merged_ids[protocol_to_insert.id] = existing_protocol.id

            for old_id, new_id in merged_ids.items():

                for dependant_id in dependant_ids:
                    protocols_to_add[dependant_id].replace_protocol(old_id, new_id)

            for parent_id in parent_protocol_ids:

                if parent_id not in protocols_to_add:
                    continue

                if existing_protocol.id not in full_dependants_graph[parent_id]:
                    full_dependants_graph[parent_id].append(existing_protocol.id)

        else:

            # Add the protocol as a new protocol in the graph.
            self._protocols_by_id[protocol_id] = protocol_to_insert
            existing_protocol = self._protocols_by_id[protocol_id]

            full_dependants_graph[protocol_id] = []

            if len(parent_protocol_ids) == 0:
                self._root_protocols.append(protocol_id)

            for parent_id in parent_protocol_ids:

                if protocol_id not in full_dependants_graph[parent_id]:
                    full_dependants_graph[parent_id].append(protocol_id)

        return existing_protocol.id, merged_ids

[docs]    def add_protocols(self, *protocols, allow_external_dependencies=False):
        """Adds a set of protocols to the graph.

        Parameters
        ----------
        protocols : tuple of Protocol
            The protocols to add.
        allow_external_dependencies: bool
            If `False`, an exception will be raised if a protocol
            has a dependency outside of this graph.

        Returns
        -------
        dict of str and str
            A mapping between the original protocols and protocols which
            were merged over the course of adding the new protocols.
        """

        # noinspection PyUnresolvedReferences
        conflicting_ids = [x.id for x in protocols if x.id in self._protocols_by_id]

        # Make sure we aren't trying to add protocols with conflicting ids.
        if len(conflicting_ids) > 0:

            raise ValueError(
                f"The graph already contains protocols with ids {conflicting_ids}"
            )

        # Add the protocols to the graph
        # noinspection PyUnresolvedReferences
        protocols_by_id = {x.id: x for x in protocols}

        # Build a the dependants graph of the protocols to add,
        # and determine the order that they would be executed in.
        # This is the order we should try to insert them in.
        dependants_graph = self._build_dependants_graph(
            protocols_by_id, allow_external_dependencies, apply_reduction=False
        )

        # Determine if any of the protocols to add depend on protocols which
        # are already in the graph.
        existing_parents = {
            x: set(y) for x, y in dependants_graph.items() if x in self._protocols_by_id
        }
        child_to_existing_parents = graph.dependants_to_dependencies(existing_parents)
        child_to_existing_parents = {
            x: y for x, y in child_to_existing_parents.items() if x in protocols_by_id
        }

        # Compute the reduced new graph to add.
        dependants_graph = {
            x: y for x, y in dependants_graph.items() if x in protocols_by_id
        }

        reduced_dependants_graph = {x: [*y] for x, y in dependants_graph.items()}
        graph.apply_transitive_reduction(reduced_dependants_graph)

        # Determine the order in which the new protocols would execute.
        # This will be the order we attempt to insert them into the graph.
        protocol_execution_order = graph.topological_sort(dependants_graph)

        # Construct the full dependants graph which will be used to find the
        # children of existing parent protocols
        full_dependants_graph = self._build_dependants_graph(
            self._protocols_by_id, allow_external_dependencies, apply_reduction=True
        )

        # Store a mapping between original and merged protocols.
        merged_ids = {}

        parent_protocol_ids = defaultdict(set)
        parent_protocol_ids.update(child_to_existing_parents)

        for protocol_id in protocol_execution_order:

            parent_ids = parent_protocol_ids.get(protocol_id) or []
            inserted_id, new_ids = self._add_protocol(
                protocol_id,
                protocols_by_id,
                dependants_graph[protocol_id],
                parent_ids,
                full_dependants_graph,
            )

            # Keep track of any merged protocols
            merged_ids.update(new_ids)

            # Update the parent graph
            for dependant in reduced_dependants_graph[protocol_id]:
                parent_protocol_ids[dependant].add(inserted_id)

        return merged_ids

[docs]    def execute(
        self,
        root_directory="",
        calculation_backend=None,
        compute_resources=None,
        enable_checkpointing=True,
        safe_exceptions=True,
    ):
        """Execute the protocol graph in the specified directory,
        and either using a `CalculationBackend`, or using a specified
        set of compute resources.

        Parameters
        ----------
        root_directory: str
            The directory to execute the graph in.
        calculation_backend: CalculationBackend, optional.
            The backend to execute the graph on. This parameter
            is mutually exclusive with `compute_resources`.
        compute_resources: CalculationBackend, optional.
            The compute resources to run using. This parameter
            is mutually exclusive with `calculation_backend`.
        enable_checkpointing: bool
            If enabled, protocols will not be executed more than once if the
            output from their previous execution is found.
        safe_exceptions: bool
            If true, exceptions will be serialized into the results file rather
            than directly raised, otherwise, the exception will be raised as
            normal.

        Returns
        -------
        dict of str and str or Future:
            The paths to the JSON serialized outputs of the executed protocols.
            If executed using a calculation backend, these will be `Future` objects
            which will return the output paths on calling `future.result()`.
        """

        if len(root_directory) > 0:
            os.makedirs(root_directory, exist_ok=True)

        assert (calculation_backend is None and compute_resources is not None) or (
            calculation_backend is not None and compute_resources is None
        )

        # Determine the order in which to submit the protocols, such
        # that all dependencies are satisfied.
        dependants_graph = self._build_dependants_graph(
            self._protocols_by_id, False, False
        )

        execution_order = graph.topological_sort(dependants_graph)

        # Build a dependency graph from the dependants graph so that
        # futures can easily be passed in the correct place.
        dependencies = graph.dependants_to_dependencies(dependants_graph)

        protocol_outputs = {}

        for protocol_id in execution_order:

            protocol = self._protocols_by_id[protocol_id]
            parent_outputs = []

            for dependency in dependencies[protocol_id]:
                parent_outputs.append(protocol_outputs[dependency])

            directory_name = protocol_id.replace("|", "_")
            directory_name = directory_name.replace(":", "_")
            directory_name = directory_name.replace(";", "_")

            directory = os.path.join(root_directory, directory_name)

            if calculation_backend is not None:

                protocol_outputs[protocol_id] = calculation_backend.submit_task(
                    ProtocolGraph._execute_protocol,
                    directory,
                    protocol.schema.json(),
                    enable_checkpointing,
                    *parent_outputs,
                    safe_exceptions=safe_exceptions,
                )

            else:

                protocol_outputs[protocol_id] = ProtocolGraph._execute_protocol(
                    directory,
                    protocol,
                    enable_checkpointing,
                    *parent_outputs,
                    available_resources=compute_resources,
                    safe_exceptions=safe_exceptions,
                )

        return protocol_outputs

    @staticmethod
    def _execute_protocol(
        directory,
        protocol,
        enable_checkpointing,
        *previous_output_paths,
        available_resources,
        safe_exceptions,
        **_,
    ):
        """Executes the protocol defined by the ``protocol_schema``.

        Parameters
        ----------
        directory: str
            The directory to execute the protocol in.
        protocol: Protocol or str
            Either the protocol to execute, or the JSON schema which defines
            the protocol to execute.
        enable_checkpointing: bool
            If enabled, the protocol will not be executed again if the
            output of its previous execution is found.
        parent_outputs: tuple of str
            Paths to the outputs of the protocols which the
            protocol to execute depends on.
        safe_exceptions: bool
            If true, exceptions will be serialized into the results file rather
            than directly raised, otherwise, the exception will be raised as
            normal.

        Returns
        -------
        str
            The id of the executed protocol.
        str
            The path to the JSON serialized output of the executed
            protocol.
        """

        if isinstance(protocol, str):
            protocol_schema = ProtocolSchema.parse_json(protocol)
            protocol = protocol_schema.to_protocol()

        # The path where the output of this protocol will be stored.
        os.makedirs(directory, exist_ok=True)

        output_path = os.path.join(directory, f"{protocol.id}_output.json")

        # We need to make sure ALL exceptions are handled within this method,
        # to avoid accidentally killing the compute backend / server.
        try:

            # Check if the output of this protocol already exists and
            # whether we allow returning the found output.
            if os.path.isfile(output_path) and enable_checkpointing:

                with open(output_path) as file:
                    outputs = json.load(file, cls=TypedJSONDecoder)

                if not isinstance(outputs, WorkflowException):

                    for protocol_path, output in outputs.items():

                        protocol_path = ProtocolPath.from_string(protocol_path)
                        protocol.set_value(protocol_path, output)

                return protocol.id, output_path

            # Store the results of the relevant previous protocols in a
            # convenient dictionary.
            previous_outputs = {}

            for parent_id, previous_output_path in previous_output_paths:

                with open(previous_output_path) as file:
                    parent_output = json.load(file, cls=TypedJSONDecoder)

                # If one of the results is a failure exit early and propagate
                # the exception up the graph.
                if isinstance(parent_output, EvaluatorException):
                    return protocol.id, previous_output_path

                for protocol_path, output_value in parent_output.items():

                    protocol_path = ProtocolPath.from_string(protocol_path)

                    if (
                        protocol_path.start_protocol is None
                        or protocol_path.start_protocol != parent_id
                    ):

                        protocol_path.prepend_protocol_id(parent_id)

                    previous_outputs[protocol_path] = output_value

            # Pass the outputs of previously executed protocols as input to the
            # protocol to execute.
            for input_path in protocol.required_inputs:

                value_references = protocol.get_value_references(input_path)

                for source_path, target_path in value_references.items():

                    if (
                        target_path.start_protocol == input_path.start_protocol
                        or target_path.start_protocol == protocol.id
                        or target_path.start_protocol is None
                    ):
                        # The protocol takes input from itself / a nested protocol.
                        # This is handled by the protocol directly so we can skip here.
                        continue

                    property_name = target_path.property_name
                    property_index = None

                    nested_property_name = None

                    if property_name.find(".") > 0:

                        nested_property_name = ".".join(property_name.split(".")[1:])
                        property_name = property_name.split(".")[0]

                    if property_name.find("[") >= 0 or property_name.find("]") >= 0:

                        property_name, property_index = extract_variable_index_and_name(
                            property_name
                        )

                    target_protocol_ids = target_path.protocol_ids

                    target_value = previous_outputs[
                        ProtocolPath(property_name, *target_protocol_ids)
                    ]

                    if property_index is not None:
                        target_value = target_value[property_index]

                    if nested_property_name is not None:
                        target_value = get_nested_attribute(
                            target_value, nested_property_name
                        )

                    protocol.set_value(source_path, target_value)

            logger.info(f"Executing {protocol.id}")

            start_time = time.perf_counter()
            protocol.execute(directory, available_resources)

            output = {key.full_path: value for key, value in protocol.outputs.items()}
            output = json.dumps(output, cls=TypedJSONEncoder)

            end_time = time.perf_counter()

            execution_time = (end_time - start_time) * 1000
            logger.info(f"{protocol.id} finished executing after {execution_time} ms")

        except Exception as e:

            logger.info(f"Protocol failed to execute: {protocol.id}")

            if not safe_exceptions:
                raise

            exception = WorkflowException.from_exception(e)
            exception.protocol_id = protocol.id

            output = exception.json()

        with open(output_path, "w") as file:
            file.write(output)

        return protocol.id, output_path


[docs]@workflow_protocol()
class ProtocolGroup(Protocol):
    """A group of workflow protocols to be executed in one batch.

    This may be used for example to cluster together multiple protocols
    that will execute in a linear chain so that multiple scheduler
    execution calls are reduced into a single one.

    Additionally, a group may provide enhanced behaviour, for example
    running all protocols within the group self consistently until
    a given condition is met (e.g run a simulation until a given observable
    has converged).
    """

    @property
    def required_inputs(self):
        """list of ProtocolPath: The inputs which must be set on this protocol."""
        required_inputs = super(ProtocolGroup, self).required_inputs

        # Pull each of an individual protocols inputs up so that they
        # become a required input of the group.
        for protocol in self._protocols:

            for input_path in protocol.required_inputs:

                input_path = input_path.copy()

                if input_path.start_protocol != protocol.id:
                    input_path.prepend_protocol_id(protocol.id)

                input_path.prepend_protocol_id(self.id)

                required_inputs.append(input_path)

        return required_inputs

    @property
    def dependencies(self):
        """list of ProtocolPath: A list of pointers to the protocols which this
        protocol takes input from.
        """
        dependencies = super(ProtocolGroup, self).dependencies
        # Remove child dependencies.
        dependencies = [
            x for x in dependencies if x.start_protocol not in self.protocols
        ]

        return dependencies

    @property
    def outputs(self):
        """dict of ProtocolPath and Any: A dictionary of the outputs of this property."""

        outputs = super(ProtocolGroup, self).outputs

        for protocol in self._protocols:

            for output_path in protocol.outputs:

                output_value = protocol.get_value(output_path)
                output_path = output_path.copy()

                if output_path.start_protocol != protocol.id:
                    output_path.prepend_protocol_id(protocol.id)

                output_path.prepend_protocol_id(self.id)

                outputs[output_path] = output_value

        return outputs

    @property
    def protocols(self):
        """dict of str and Protocol: A dictionary of the protocols in
        this groups, where the dictionary key is the protocol id, and
        the value is the protocol itself.

        Notes
        -----
        This property should *not* be altered. Use `add_protocols` to
        add new protocols to the group.
        """
        return {protocol.id: protocol for protocol in self._protocols}

[docs]    def __init__(self, protocol_id):
        """Constructs a new ProtocolGroup."""
        super().__init__(protocol_id)

        self._protocols = []

        self._inner_graph = ProtocolGraph()
        self._enable_checkpointing = True

    def _get_schema(self, schema_type=ProtocolGroupSchema, *args):

        protocol_schemas = {x.id: x.schema for x in self._protocols}

        schema = super(ProtocolGroup, self)._get_schema(
            schema_type, protocol_schemas, *args
        )

        return schema

    def _set_schema(self, schema_value):
        """
        Parameters
        ----------
        schema_value: ProtocolGroupSchema
            The schema from which this group should take its properties.
        """

        super(ProtocolGroup, self)._set_schema(schema_value)

        self._protocols = []
        self._inner_graph = ProtocolGraph()

        protocols_to_add = []

        for protocol_schema in schema_value.protocol_schemas.values():

            protocol = Protocol.from_schema(protocol_schema)
            protocols_to_add.append(protocol)

        self.add_protocols(*protocols_to_add)

[docs]    def add_protocols(self, *protocols):
        """Add protocols to this group.

        Parameters
        ----------
        protocols: Protocol
            The protocols to add.
        """
        for protocol in protocols:

            if protocol.id in self.protocols:

                raise ValueError(
                    f"The {self.id} group already contains a protocol "
                    f"with id {protocol.id}."
                )

            self._protocols.append(protocol)

        self._inner_graph.add_protocols(*protocols, allow_external_dependencies=True)

[docs]    def set_uuid(self, value):
        """Store the uuid of the calculation this protocol belongs to

        Parameters
        ----------
        value : str
            The uuid of the parent calculation.
        """
        for protocol in self._protocols:
            protocol.set_uuid(value)

        super(ProtocolGroup, self).set_uuid(value)

        # Rebuild the inner graph
        self._inner_graph = ProtocolGraph()
        self._inner_graph.add_protocols(
            *self._protocols, allow_external_dependencies=True
        )

[docs]    def replace_protocol(self, old_id, new_id):
        """Finds each input which came from a given protocol
         and redirects it to instead take input from a different one.

        Parameters
        ----------
        old_id : str
            The id of the old input protocol.
        new_id : str
            The id of the new input protocol.
        """
        for protocol in self._protocols:
            protocol.replace_protocol(old_id, new_id)

        super(ProtocolGroup, self).replace_protocol(old_id, new_id)

        # Rebuild the inner graph
        if old_id in self.protocols:

            self._inner_graph = ProtocolGraph()
            self._inner_graph.add_protocols(
                *self._protocols, allow_external_dependencies=True
            )

    def _execute(self, directory, available_resources):

        # Update the inputs of protocols which require values
        # from the protocol group itself (i.e. the current
        # iteration from a conditional group).
        for required_input in self.required_inputs:

            if required_input.start_protocol != self.id:
                continue

            value_references = self.get_value_references(required_input)

            if len(value_references) == 0:
                continue

            for input_path, value_reference in value_references.items():

                if (
                    value_reference.protocol_path != self.id
                    and value_reference.start_protocol is not None
                ):
                    continue

                value = self.get_value(value_reference)
                self.set_value(input_path, value)

        self._inner_graph.execute(
            directory,
            compute_resources=available_resources,
            enable_checkpointing=self._enable_checkpointing,
            safe_exceptions=False,
        )

[docs]    def can_merge(self, other, path_replacements=None):

        if path_replacements is None:
            path_replacements = []

        path_replacements.append((other.id, self.id))

        if not isinstance(other, ProtocolGroup):
            return False

        if not super(ProtocolGroup, self).can_merge(other, path_replacements):
            return False

        # Ensure that at least one root protocol from each group can be merged.
        for self_id in self._inner_graph.root_protocols:
            for other_id in other._inner_graph.root_protocols:

                if self.protocols[self_id].can_merge(
                    other.protocols[other_id], path_replacements
                ):
                    return True

        return False

[docs]    def merge(self, other):

        assert isinstance(other, ProtocolGroup)
        merged_ids = super(ProtocolGroup, self).merge(other)

        # Update the protocol ids of the other grouped protocols.
        for protocol in other.protocols.values():
            protocol.replace_protocol(other.id, self.id)

        # Merge the two groups using the inner protocol graph.
        new_merged_ids = self._inner_graph.add_protocols(
            *other.protocols.values(), allow_external_dependencies=True
        )

        # Replace the original protocol list with the new one.
        self._protocols = list(self._inner_graph.protocols.values())

        merged_ids.update(new_merged_ids)
        return merged_ids

[docs]    def get_value_references(self, input_path):

        values_references = super(ProtocolGroup, self).get_value_references(input_path)

        for key, value_reference in values_references.items():

            if value_reference.start_protocol not in self.protocols:
                continue

            value_reference = value_reference.copy()
            value_reference.prepend_protocol_id(self.id)

            values_references[key] = value_reference

        return values_references

    def _get_next_in_path(self, reference_path):
        """Returns the id of the next protocol in a protocol path,
        making sure that the targeted protocol is within this group.

        Parameters
        ----------
        reference_path: ProtocolPath
            The path being traversed.

        Returns
        -------
        str
            The id of the next protocol in the path.
        ProtocolPath
            The remainder of the path to be traversed.
        """

        # Make a copy of the path so we can alter it safely.
        reference_path_clone = copy.deepcopy(reference_path)

        if reference_path.start_protocol == self.id:
            reference_path_clone.pop_next_in_path()

        target_protocol_id = reference_path_clone.pop_next_in_path()

        if target_protocol_id not in self.protocols:

            raise ValueError(
                "The reference path does not target this protocol "
                "or any of its children."
            )

        return target_protocol_id, reference_path_clone

[docs]    def get_class_attribute(self, reference_path):

        if (
            len(reference_path.protocol_path) == 0
            or reference_path.protocol_path == self.id
        ):
            return super(ProtocolGroup, self).get_class_attribute(reference_path)

        target_protocol_id, truncated_path = self._get_next_in_path(reference_path)
        return self.protocols[target_protocol_id].get_class_attribute(truncated_path)

[docs]    def get_value(self, reference_path):

        if (
            len(reference_path.protocol_path) == 0
            or reference_path.protocol_path == self.id
        ):
            return super(ProtocolGroup, self).get_value(reference_path)

        target_protocol_id, truncated_path = self._get_next_in_path(reference_path)
        return self.protocols[target_protocol_id].get_value(truncated_path)

[docs]    def set_value(self, reference_path, value):

        if (
            len(reference_path.protocol_path) == 0
            or reference_path.protocol_path == self.id
        ):
            return super(ProtocolGroup, self).set_value(reference_path, value)

        if isinstance(value, ProtocolPath) and value.start_protocol == self.id:
            value.pop_next_in_path()

        target_protocol_id, truncated_path = self._get_next_in_path(reference_path)
        return self.protocols[target_protocol_id].set_value(truncated_path, value)

[docs]    def apply_replicator(
        self,
        replicator,
        template_values,
        template_index=-1,
        template_value=None,
        update_input_references=False,
    ):

        protocols, replication_map = replicator.apply(
            self.protocols, template_values, template_index, template_value
        )

        if (
            template_index >= 0 or template_value is not None
        ) and update_input_references is True:

            raise ValueError(
                "Specific template indices and values cannot be passed "
                "when `update_input_references` is True"
            )

        if update_input_references:
            replicator.update_references(protocols, replication_map, template_values)

        # Re-initialize the group using the replicated protocols.
        self._protocols = []
        self._inner_graph = ProtocolGraph()
        self.add_protocols(*protocols.values())

        return replication_map