# Source code for openff.evaluator.datasets.curation.components.freesolv
import io
import logging
import re
from typing import List, Union
import pandas
import requests
from typing_extensions import Literal
from openff.evaluator.datasets import (
MeasurementSource,
PhysicalPropertyDataSet,
PropertyPhase,
)
from openff.evaluator.datasets.curation.components import (
CurationComponent,
CurationComponentSchema,
)
from openff.evaluator.properties import SolvationFreeEnergy
from openff.evaluator.substances import Component, ExactAmount, MoleFraction, Substance
from openff.evaluator.thermodynamics import ThermodynamicState
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class ImportFreeSolvSchema(CurationComponentSchema):
    """Schema for the ``ImportFreeSolv`` curation component.

    The only field declared here is the ``type`` tag, which is fixed to the
    literal string ``"ImportFreeSolv"``.
    """

    type: Literal["ImportFreeSolv"] = "ImportFreeSolv"
class ImportFreeSolv(CurationComponent):
    """A component which will import the latest version of the FreeSolv
    data set from the GitHub repository where it is stored.
    """

    @classmethod
    def _download_free_solv(cls) -> pandas.DataFrame:
        """Downloads the FreeSolv data set from GitHub.

        Returns
        -------
            The Free Solv data stored in a pandas data frame.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        requests.Timeout
            If the server does not respond within the timeout.
        """

        # Download the database from GitHub. A timeout is always supplied as
        # ``requests`` would otherwise wait indefinitely on a stalled connection.
        download_request = requests.get(
            "https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt",
            timeout=60,
        )
        download_request.raise_for_status()

        # Unify the delimiter so that every field is separated by a bare ';'.
        text_contents = download_request.text.replace("; ", ";")

        # Convert the set to a pandas object, skipping the first two lines
        # of the file.
        text_buffer = io.StringIO(text_contents)
        return pandas.read_csv(text_buffer, delimiter=";", skiprows=2)

    @classmethod
    def _validate_doi(cls, doi: str) -> str:
        """Attempts to validate a string which may contain a (or multiple)
        digital object identifier.

        Parameters
        ----------
        doi
            The raw reference string. Multiple DOIs may be concatenated with
            ``" and "``. Non-string values (e.g. a NaN read from an empty
            field) are treated as containing no DOI.

        Returns
        -------
            The matched DOIs joined with ``" + "``, or the DOI of FreeSolv
            itself if no valid DOI is found.
        """

        fall_back_doi = "10.5281/zenodo.596537"

        # Guard against missing references, which would otherwise raise an
        # ``AttributeError`` when split below.
        if not isinstance(doi, str):
            return fall_back_doi

        # From https://www.crossref.org/blog/dois-and-matching-regular-expressions/
        doi_patterns = [
            r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
            r"^10.1002/[^\s]+$",
            r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
            r"^10.1021/\w\w\d+$",
            r"^10.1207/[\w\d]+\&\d+_\d+$",
        ]

        matched_dois: List[str] = []

        # Split the string to try and catch concatenated DOIs
        for split_doi in doi.split(" and "):
            # The patterns are anchored at both ends, so stray surrounding
            # whitespace would otherwise prevent a valid DOI from matching.
            candidate = split_doi.strip()

            for doi_pattern in doi_patterns:
                regex_match = re.match(doi_pattern, candidate, re.I)

                if regex_match:
                    # Keep only the first pattern which matches this candidate.
                    matched_dois.append(regex_match.group())
                    break

        return " + ".join(matched_dois) if matched_dois else fall_back_doi

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ImportFreeSolvSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Downloads FreeSolv, converts each row into a ``SolvationFreeEnergy``
        entry and appends the resulting entries to ``data_frame``.

        Parameters
        ----------
        data_frame
            The data frame to append the FreeSolv entries to.
        schema
            The schema of this component. It is not read by this method.
        n_processes
            Not used by this method.

        Returns
        -------
            A new data frame containing the rows of ``data_frame`` followed by
            the FreeSolv entries, with a freshly generated index.
        """

        from openff.units import unit

        # Convert the data frame into data rows.
        free_solv_data_frame = cls._download_free_solv()

        # Loop invariants: the unit of the tabulated values and the unit the
        # property values are converted to.
        kcal_per_mol = unit.kilocalorie / unit.mole
        default_unit = SolvationFreeEnergy.default_unit()

        data_entries = []

        for _, row in free_solv_data_frame.iterrows():
            # Extract and standardize the SMILES pattern of the solute by
            # round-tripping it through a ``Component``.
            solute_smiles = Component(row["SMILES"].strip()).smiles

            # Build the substance: water as the solvent plus a single solute
            # molecule.
            substance = Substance()
            substance.add_component(Component(smiles="O"), MoleFraction(1.0))
            substance.add_component(
                Component(smiles=solute_smiles, role=Component.Role.Solute),
                ExactAmount(1),
            )

            # Extract the value and uncertainty
            value = float(row["experimental value (kcal/mol)"]) * kcal_per_mol
            std_error = (
                float(row["experimental uncertainty (kcal/mol)"]) * kcal_per_mol
            )

            # Attempt to extract a DOI
            original_source = row[
                "experimental reference (original or paper this value was taken from)"
            ]
            doi = cls._validate_doi(original_source)

            data_entries.append(
                SolvationFreeEnergy(
                    thermodynamic_state=ThermodynamicState(
                        temperature=298.15 * unit.kelvin,
                        pressure=101.325 * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    substance=substance,
                    value=value.to(default_unit),
                    uncertainty=std_error.to(default_unit),
                    source=MeasurementSource(doi=doi),
                )
            )

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*data_entries)

        return pandas.concat(
            [data_frame, data_set.to_pandas()], ignore_index=True, sort=False
        )
# Alias covering the schema(s) defined in this module. With a single member,
# ``Union[X]`` evaluates to ``X`` itself.
FreeSolvComponentSchema = Union[ImportFreeSolvSchema]