Source code for openff.evaluator.datasets.curation.components.freesolv

import io
import logging
import re
from typing import List, Union

import pandas
import requests
from typing_extensions import Literal

from openff.evaluator.datasets import (
    MeasurementSource,
    PhysicalPropertyDataSet,
    PropertyPhase,
)
from openff.evaluator.datasets.curation.components import (
    CurationComponent,
    CurationComponentSchema,
)
from openff.evaluator.properties import SolvationFreeEnergy
from openff.evaluator.substances import Component, ExactAmount, MoleFraction, Substance
from openff.evaluator.thermodynamics import ThermodynamicState

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class ImportFreeSolvSchema(CurationComponentSchema):
    """Schema accepted by ``ImportFreeSolv._apply``.

    It declares no options of its own; the only field is the fixed ``type``
    tag, whose single permitted value is ``"ImportFreeSolv"``.
    """

    type: Literal["ImportFreeSolv"] = "ImportFreeSolv"
class ImportFreeSolv(CurationComponent):
    """A component which will import the latest version of the FreeSolv
    data set from the GitHub repository where it is stored.
    """

    @classmethod
    def _download_free_solv(cls) -> pandas.DataFrame:
        """Downloads the FreeSolv data set from GitHub.

        Returns
        -------
            The FreeSolv data stored in a pandas data frame.

        Raises
        ------
        requests.RequestException
            If the request times out, cannot be completed, or the server
            answers with an HTTP error status.
        """
        download_url = (
            "https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt"
        )
        # (connect, read) timeouts in seconds. Without a timeout `requests`
        # waits forever on a stalled connection.
        download_timeout = (10, 60)

        # Download the database from GitHub
        download_request = requests.get(download_url, timeout=download_timeout)
        download_request.raise_for_status()

        text_contents = download_request.text

        # Unify the delimiter
        text_contents = text_contents.replace("; ", ";")

        # Convert the set to a pandas object
        text_buffer = io.StringIO(text_contents)
        return pandas.read_csv(text_buffer, delimiter=";", skiprows=2)

    @classmethod
    def _validate_doi(cls, doi: str) -> str:
        """Attempts to validate a string which may contain a (or multiple)
        digital object identifier. If a valid DOI is not found, the FreeSolv
        DOI itself is returned.

        Parameters
        ----------
        doi
            The text to search. Several DOIs may be joined by ``" and "``.
            Non-string values (e.g. the NaN pandas yields for an empty cell)
            are treated as containing no DOI.

        Returns
        -------
            The matched DOIs joined by ``" + "``, or the FreeSolv DOI when
            nothing matched.
        """
        fall_back_doi = "10.5281/zenodo.596537"

        # From https://www.crossref.org/blog/dois-and-matching-regular-expressions/
        # NOTE: the patterns are kept exactly as published, so the unescaped
        # '.' matches any single character rather than only a literal dot.
        doi_patterns = [
            r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
            r"^10.1002/[^\s]+$",
            r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
            r"^10.1021/\w\w\d+$",
            r"^10.1207/[\w\d]+\&\d+_\d+$",
        ]

        # A missing reference is not a string, so there is nothing to split.
        if not isinstance(doi, str):
            return fall_back_doi

        matched_dois: List[str] = []

        # Split the string to try and catch concatenated DOIs
        for split_doi in doi.split(" and "):
            # The patterns are anchored with ^...$, so stray padding around a
            # candidate would otherwise make a valid DOI fail to match.
            candidate = split_doi.strip()

            for doi_pattern in doi_patterns:
                regex_match = re.match(doi_pattern, candidate, re.I)

                if regex_match:
                    # Only the first matching pattern is used per candidate.
                    matched_dois.append(regex_match.group())
                    break

        return " + ".join(matched_dois) if matched_dois else fall_back_doi

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ImportFreeSolvSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Downloads FreeSolv and appends its entries to ``data_frame``.

        Each downloaded row becomes a ``SolvationFreeEnergy`` of the row's
        solute in water ("O"), at 298.15 K and 101.325 kPa.

        Parameters
        ----------
        data_frame
            The data frame to append the FreeSolv entries to.
        schema
            The schema of this component (not read by this method).
        n_processes
            Not used by this method.

        Returns
        -------
            ``data_frame`` concatenated with the FreeSolv entries, with a
            fresh index.
        """
        from openff.units import unit

        raw_free_solv_frame = cls._download_free_solv()

        # The target unit is the same for every row, so look it up once.
        target_unit = SolvationFreeEnergy.default_unit()

        # Convert the data frame into data rows.
        data_entries = []

        for _, row in raw_free_solv_frame.iterrows():
            # Extract and standardize the SMILES pattern of the solute.
            solute_smiles = row["SMILES"].strip()
            solute_smiles = Component(solute_smiles).smiles

            # Build the substance.
            substance = Substance()
            substance.add_component(Component(smiles="O"), MoleFraction(1.0))
            substance.add_component(
                Component(smiles=solute_smiles, role=Component.Role.Solute),
                ExactAmount(1),
            )

            # Extract the value and uncertainty
            value = (
                float(row["experimental value (kcal/mol)"])
                * unit.kilocalorie
                / unit.mole
            )
            std_error = (
                float(row["experimental uncertainty (kcal/mol)"])
                * unit.kilocalorie
                / unit.mole
            )

            # Attempt to extract a DOI
            original_source = row[
                "experimental reference (original or paper this value was taken from)"
            ]
            doi = cls._validate_doi(original_source)

            data_entries.append(
                SolvationFreeEnergy(
                    thermodynamic_state=ThermodynamicState(
                        temperature=298.15 * unit.kelvin,
                        pressure=101.325 * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    substance=substance,
                    value=value.to(target_unit),
                    uncertainty=std_error.to(target_unit),
                    source=MeasurementSource(doi=doi),
                )
            )

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*data_entries)

        converted_frame = data_set.to_pandas()

        return pandas.concat(
            [data_frame, converted_frame], ignore_index=True, sort=False
        )
# Union of the component schemas defined in this module. With a single member,
# `Union[X]` evaluates to `X` itself, so this is currently just another name
# for `ImportFreeSolvSchema`.
FreeSolvComponentSchema = Union[ImportFreeSolvSchema]