Source code for swiftemulator.io.swift

"""
I/O functions for reading in SWIFT simulation data.

Includes functions to read parameters to
instances of :class:``ModelParameters``, and functions
to read model data to instances of :class:``ModelValues``.

Also includes functions to write out :class:``ModelParameters``
as files, based on a base parameter file.
"""

from swiftemulator.backend.model_parameters import ModelParameters
from swiftemulator.backend.model_specification import ModelSpecification
from swiftemulator.backend.model_values import ModelValues

from typing import List, Optional, Dict, Hashable, Tuple, Union, Callable
from functools import reduce
from pathlib import Path
from math import log10

import yaml
import numpy as np


[docs]def load_pipeline_outputs( filenames: Dict[Hashable, Path], scaling_relations: List[str], log_independent: Optional[List[str]] = None, log_dependent: Optional[List[str]] = None, histogram_for_error: Optional[str] = None, sanitize_output: bool = False, ) -> Tuple[Dict[str, ModelValues], Dict[str, Dict[str, Union[str, bool]]]]: """ Loads the pipeline outputs from the provided files, for the given specified scaling relations, into a :class:``ModelValues`` container. Parameters ---------- filenames, Dict[Hashable, Path] Paths to files to load data from, with the keys in the dictionary the unique identifiers for the models to use throughout. scaling_relations: List[str] Top-level name for the scaling relations (i.e. the top-level item in the yaml file, e.g. ``stellar_mass_function_100``). log_independent: List[str], optional Scaling relations (the same as in ``scaling_relations``) where the independent values (given by ``centers`` in the yaml files) should be log-scaled (uses ``log10``). log_dependent: List[str], optional Scaling relations (the same as in ``scaling_relations``) where the dependent values (given by ``values`` in the yaml files) should be log-scaled (uses ``log10``). histogram_for_error: str, optional If provided, a histogram to use for sampling-based errors on each bin. This can be beneficial versus the scatter-based default errors when there is significant intrinsic scatter. Errors are reported as ``dependent / sqrt(N)`` (or ``log(1 + 1 / sqrt(N)`` in the case of log-scaled). Note that you _must_ ensure consistent binning between the other input scaling relations and this histogram, or there will be an error. sanitize_output: bool, optional If set to True, removes all zero-valued, infinity, and NaN dependent and independent values from the data. This helps sanitize input for the emulator, but is slower. Returns ------- model_values, Dict[str, ModelValues] Dictionary of ``ModelValues`` containers for each scaling relation, read from the files. The keys are the names of the scaling relations. unit_dict, Dict[str, Dict[str, Union[str, bool]]] Dictionary of symbolic units for each scaling relation. Has the structure: ``{scaling_relation: {independent: "Msun", dependent: "kpc", log_independent: True, log_dependent: True}}``. """ model_values = {scaling_relation: {} for scaling_relation in scaling_relations} unit_dict = {scaling_relation: {} for scaling_relation in scaling_relations} if log_independent is None: log_independent = [] if log_dependent is None: log_dependent = [] # Need to search for possible keys within the `lines` dictionary. # Priority given by ordering of line_types line_types = [ "median", "mass_function", "mean", "adaptive_mass_function", "histogram", ] recursive_search = ( lambda d, k: d.get(k[0], recursive_search(d, k[1:])) if len(k) > 0 else None ) line_search = lambda d: recursive_search(d, line_types) for unique_identifier, filename in filenames.items(): with open(filename, "r") as handle: raw_data = yaml.safe_load(handle) if histogram_for_error is not None: line = raw_data[histogram_for_error]["lines"]["histogram"] number_of_items = np.array(line["values"]) inverse_square_root_error = 1.0 / np.sqrt(number_of_items) for scaling_relation in scaling_relations: line = line_search(raw_data[scaling_relation]["lines"]) if line is None: continue independent = np.array(line["centers"]) unit_dict[scaling_relation]["independent_units"] = line["centers_units"] dependent = np.array(line["values"]) unit_dict[scaling_relation]["dependent_units"] = line["values_units"] dependent_error = np.array(line.get("scatter", np.zeros_like(dependent))) if scaling_relation in log_independent: independent = np.log10(independent) unit_dict[scaling_relation]["log_independent"] = True else: unit_dict[scaling_relation]["log_independent"] = False if scaling_relation in log_dependent: if histogram_for_error: # Need to curtail at end because of bins that contain the unbinnable galaxies dependent_error = np.log10( 1.0 + inverse_square_root_error[: len(dependent)] ) else: # Handle case of dependent errors needing to be logged. if dependent_error.ndim > 1: lower = dependent - dependent_error[0, :] upper = dependent + dependent_error[1, :] else: lower = dependent - dependent_error upper = dependent + dependent_error dependent = np.log10(dependent) upper_diff = np.log10(upper) - dependent lower_diff = dependent - np.log10(lower) dependent_error = 0.5 * (upper_diff + lower_diff) unit_dict[scaling_relation]["log_dependent"] = True else: unit_dict[scaling_relation]["log_dependent"] = False if histogram_for_error: dependent_error = ( dependent * inverse_square_root_error[: len(dependent)] ) else: if dependent_error.ndim > 1: # Only need to average if multi-dimensional, otherwise we just inherit # from above. dependent_error = np.mean(dependent_error, axis=0) if sanitize_output: sanitization_mask = np.logical_and.reduce( [ np.isfinite(independent), np.isfinite(dependent), np.isfinite(dependent_error), ] ) else: sanitization_mask = np.s_[:] model_values[scaling_relation][unique_identifier] = { "independent": independent[sanitization_mask], "dependent": dependent[sanitization_mask], "dependent_error": dependent_error[sanitization_mask], } return {k: ModelValues(v) for k, v in model_values.items()}, unit_dict
[docs]def load_parameter_files( filenames: Dict[Hashable, Path], parameters: List[str], log_parameters: Optional[List[str]] = None, parameter_printable_names: Optional[List[str]] = None, parameter_limits: Optional[List[List[float]]] = None, ) -> Tuple[ModelSpecification, ModelParameters]: """ Loads information from the parameter files and returns the associated model specification and model parameters instances. Parameters ---------- filenames, Dict[Hashable, Path] Paths to the parameter files, keyed by their unique identifiers (i.e. those also used in :func:`load_pipeline_outputs`). parameters, List[str] Parameters to load from the yaml files. Should be specified in the same way as the ``--param`` option in SWIFT, i.e. in the format ``SectionName:ParameterName``. log_parameters, List[str], optional Which parameters in the list above should be scaled logarithmically. parameter_printable_names, List[str], optional Optional 'fancy' names for your parameters. These strings will be used on any figures generated through swift-emulator. Can include LaTeX formatting as in ``matplotlib``. parameter_limits, List[List[float]], optional The lower and upper limit of the input parameters. Should be the same length as ``parameters``, but each item is a list of length two, with a lower and upper bound. For example, in a two parameter model ``[[0.0, 1.0], [8.3, 9.3]]`` would mean that the first parameter would vary between 0.0 and 1.0, with the second parameter varying between 8.3 and 9.3. If not provided, these will be inferred from the data. Returns ------- model_specification, ModelSpecification Specification for the model based on the parameters that have been passed to this function. model_parameters, ModelParameters Model parameter container corresponding to the SWIFT parameter files. """ if log_parameters is None: log_parameters = [] else: for parameter in log_parameters: if not parameter in parameters: raise AttributeError( f"Parameter {parameter} requested for logarithmic transform " "not available in main list of parameters." ) read_parameters = {k: None for k in filenames.keys()} for unique_identifier, filename in filenames.items(): with open(filename, "r") as handle: full_parameter_file = yaml.load(handle, Loader=yaml.Loader) base_parameters = { parameter: float( reduce(lambda d, k: d.get(k), parameter.split(":"), full_parameter_file) ) for parameter in parameters } read_parameters[unique_identifier] = { parameter: log10(value) if parameter in log_parameters else value for parameter, value in base_parameters.items() } if parameter_limits is None: parameter_limits = [ [ min([model[parameter] for model in read_parameters.values()]), max([model[parameter] for model in read_parameters.values()]), ] for parameter in parameters ] model_specification = ModelSpecification( number_of_parameters=len(parameters), parameter_names=parameters, parameter_limits=parameter_limits, parameter_printable_names=parameter_printable_names, ) model_parameters = ModelParameters(model_parameters=read_parameters) return model_specification, model_parameters
[docs]def write_parameter_files( filenames: Dict[Hashable, Path], model_parameters: ModelParameters, parameter_transforms: Optional[Dict[str, Callable]] = None, base_parameter_file: Optional[Path] = None, ): """ Writes parameter files, containing the parameters from a :class:`ModelParameters` instance, based on a base parameter file. Parameters ---------- filenames: Dict[Hashable, Path] Dictionary stating where to write each parameter file, based upon the unique identifiers of each run in ``model_parameters``. model_parameters: ModelParameters Varied parameters to write in the output files. parameter_transforms: Dict[str, Callable], optional Parameter transformation functions for transforming parameter values before writing. Parameters may be generated (and emulated) in a space that is very different to their meaning in the code. Hence, this parameter allows for a transformation (for instance, a logrithmic transformation). For each parameter that should be transformed (keys) there should be a function taking the emulated value, transforming it into the code value. For instance, if a parameter is emulated in logarithmic space, this should be ``lambda x: 10**x``. base_parameter_file: Path, optional Base parameter file to read. The parameters specified in ``model_parameters`` will be overwritten when writing each individual file, but the rest will remain the same. Notes ----- Also changes the value of ``MetaData:run_name`` to the unique identifiers. """ if base_parameter_file is not None: with open(base_parameter_file, "r") as handle: base_parameters = yaml.load(handle, Loader=yaml.Loader) else: base_parameters = {} if parameter_transforms is None: parameter_transforms = {} for identifier, filename in filenames.items(): parameters = base_parameters.copy() for parameter, value in { "MetaData:run_name": str(identifier), **model_parameters.model_parameters[identifier], }.items(): section, key = parameter.split(":") transform = parameter_transforms.get(parameter, lambda x: x) try: parameters[section][key] = transform(value) except KeyError: parameters[section] = {} parameters[section][key] = transform(value) with open(filename, "w") as handle: yaml.dump(parameters, handle, default_flow_style=False) return