Coverage for wsimod/validation.py: 0% (81 statements)
coverage.py v7.6.1, created at 2024-10-24 11:16 +0100

import ast
import os
from pathlib import Path
from typing import Any, Literal, Optional, Union

import pandas as pd
import yaml

from wsimod.core import constants


def evaluate_input_file(settings: Path) -> Literal["saved", "custom"]:
    """Decides what type of input file we are dealing with.

    "saved" corresponds to fully constructed models which have been saved, alongside
    any necessary data files. "custom" are input files constructed manually.

    Raises:
        ValueError: If the settings file does not exist.

    Returns:
        Whether the input file is a saved model file or a custom input.
    """
    if settings.is_dir() or not settings.exists():
        raise ValueError(
            f"The settings file at {settings.absolute()} could not be found."
        )

    with settings.open("rb") as f:
        settings_ = yaml.safe_load(f)

    if {"data", "inputs", "outputs"}.isdisjoint(settings_.keys()):
        return "saved"

    return "custom"
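

# Usage sketch (illustrative, not part of the original module; the file name is
# hypothetical): a settings file defining any of the "data", "inputs" or
# "outputs" keys is classified as "custom"; one defining none of them is treated
# as a saved model.
def _example_evaluate_input_file() -> None:
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        settings = Path(tmp) / "settings.yml"
        settings.write_text("inputs: .\noutputs: results\n")
        assert evaluate_input_file(settings) == "custom"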


def validate_io_args(
    settings: Path, inputs: Optional[Path], outputs: Optional[Path]
) -> dict[str, Any]:
    """Validate the io arguments, including their definition in settings.

    This does not include validating the existence of data input files, which is done
    at a later stage.

    Args:
        settings (Path): The path to the file, in YAML format, containing all the
            configuration required for the simulation.
        inputs (Optional[Path]): Base directory for all input files. If present,
            overwrites value in the settings file.
        outputs (Optional[Path]): Base directory for all output files. If present,
            overwrites value in the settings file.

    Raises:
        ValueError: If the settings file does not exist.

    Returns:
        dict[str, Any]: The loaded settings file with validated inputs and outputs.
    """
    if settings.is_dir() or not settings.exists():
        raise ValueError(
            f"The settings file at {settings.absolute()} could not be found."
        )

    with settings.open("rb") as f:
        settings_ = yaml.safe_load(f)

    # Validate inputs folder
    settings_["inputs"] = _validate_input_dir(
        inputs if inputs else settings_.get("inputs", None), default=settings.parent
    )

    # Validate outputs folder
    settings_["outputs"] = _validate_output_dir(
        outputs if outputs else settings_.get("outputs", None), default=settings.parent
    )

    return settings_
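

# Usage sketch (hypothetical settings content): with no "inputs"/"outputs" keys
# in the file and no CLI overrides, both directories fall back to the folder
# containing the settings file.
def _example_validate_io_args() -> None:
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        settings = Path(tmp) / "settings.yml"
        settings.write_text("arcs: {}\n")
        loaded = validate_io_args(settings, inputs=None, outputs=None)
        assert loaded["inputs"] == Path(tmp)
        assert loaded["outputs"] == Path(tmp)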


def _validate_input_dir(input_dir: Optional[Path], default: Path) -> Path:
    """Validates the directory of input files.

    If not provided, the default directory is used.

    Args:
        input_dir (Optional[Path]): The potential directory with the inputs.
        default (Path): Default input path if none provided.

    Raises:
        ValueError: If the inputs base directory is not actually a directory.

    Returns:
        Path: The validated path containing the inputs.
    """
    if not input_dir:
        return default.absolute()

    input_dir = Path(input_dir).absolute()
    if not input_dir.is_dir():
        raise ValueError(
            f"The inputs base directory at {input_dir} is not a directory."
        )
    return input_dir


def _validate_output_dir(output_dir: Optional[Path], default: Path) -> Path:
    """Validates the directory for output files.

    If not provided, the default path is used. If it does not exist, it is created.

    Args:
        output_dir (Optional[Path]): The potential directory for the outputs.
        default (Path): Default output path if none provided.

    Raises:
        ValueError: If a file with the same name already exists.

    Returns:
        Path: The validated path where outputs will be saved.
    """
    if not output_dir:
        return default.absolute()

    output_dir = Path(output_dir).absolute()
    if output_dir.exists() and not output_dir.is_dir():
        raise ValueError(f"A file at {output_dir} exists and is not a directory.")

    os.makedirs(output_dir, exist_ok=True)
    return output_dir


def load_data_files(
    data_settings: dict[str, Any], input_dir: Path
) -> dict[str, Union[pd.DataFrame, pd.Series, dict]]:
    """Reads the data section of the settings and loads the required data from files.

    Args:
        data_settings (dict[str, Any]): The data section of the settings file.
        input_dir (Path): The directory where input files are located.

    Returns:
        dict[str, Union[pd.DataFrame, pd.Series, dict]]: The loaded dataframes,
            series or dictionaries, keyed as "data:{name}".
    """
    return {
        f"data:{key}": read_data(var, input_dir) for key, var in data_settings.items()
    }


def assign_data_to_settings(
    settings: dict[str, Any],
    data_settings: dict[str, Union[pd.DataFrame, pd.Series, dict]],
) -> dict[str, Any]:
    """Assigns the data files to the right variables in the settings dictionary.

    The search for data files to load is done recursively, walking through the whole
    settings dictionary tree.

    Args:
        settings (dict[str, Any]): The settings dictionary.
        data_settings (dict[str, Union[pd.DataFrame, pd.Series, dict]]): The data
            loaded by `load_data_files`, keyed as "data:{name}".

    Returns:
        dict[str, Any]: A new settings dictionary where data files have been loaded.
    """
    loaded_settings: dict[str, Any] = {}

    for k, v in settings.items():
        if isinstance(v, dict):
            loaded_settings[k] = assign_data_to_settings(v, data_settings)
        elif isinstance(v, list):
            loaded_settings[k] = [
                assign_data_to_settings(item, data_settings) for item in v
            ]
        elif isinstance(v, str) and v.startswith("data:"):
            try:
                loaded_settings[k] = data_settings[v]
            except KeyError:
                raise ValueError(
                    f"{v} could not be found. Did you configure loading that data in"
                    " the data section of the settings file?"
                )
        else:
            loaded_settings[k] = v

    return loaded_settings
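

# Illustrative sketch of the substitution step (all names are hypothetical):
# keys produced by `load_data_files` ("data:{name}") replace any settings value
# that is the matching "data:" string, at any depth of the tree.
def _example_assign_data_to_settings() -> None:
    data = {"data:rain": {"2000-01-01": 1.5}}
    settings = {"nodes": [{"name": "land", "precipitation": "data:rain"}]}
    loaded = assign_data_to_settings(settings, data)
    assert loaded["nodes"][0]["precipitation"] == {"2000-01-01": 1.5}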


def read_data(
    instructions: dict[str, Any], inputs: Path
) -> Union[pd.DataFrame, pd.Series, dict]:
    """Uses the instructions to load tabular data.

    The instructions are a dictionary of options that define what file to load, how
    to load it and some simple manipulations to apply to the loaded pandas DataFrame
    before returning it.

    The keys that control this process are:

        filename: Filename of the data to load.
        filters (optional): List of filters for the dataframe, each a dictionary in
            the form:
                where: column to filter
                is: value of that column
        scaling (optional): List of variable scalings, each a dictionary of the form:
            where: column to filter (optional)
            is: value of that column (optional)
            variable: name of the column to scale
            factor: unit conversion factor, as defined in `wsimod.core.constants`,
                e.g. MM_TO_M
        format (optional): How the output should be provided. If format is `dict`,
            the output is provided as a dictionary; otherwise a DataFrame or a Series
            (if there is only 1 column) is output.
        index (optional): Column(s) to use as index.
        output (optional): Column to provide as output.
        options (optional): Options to pass to the `pandas.read_csv` function.

    The order in which operations are done is:

        read -> filter -> scale -> set_index -> select_output -> convert_format

    Only the `read` step will always happen. The others depend on the inputs.

    Args:
        instructions (dict[str, Any]): A dictionary with instructions to load the
            data.
        inputs (Path): Base directory of inputs.

    Returns:
        Union[pd.DataFrame, pd.Series, dict]: Loaded dataframe, series or dictionary
            following the instructions.
    """
    filename = inputs / Path(instructions["filename"])
    options_: dict[str, Any] = process_options(instructions.get("options", ""))
    data = pd.read_csv(filename, **options_)

    for filter_ in instructions.get("filters", []):
        data = data.loc[data[filter_["where"]] == filter_["is"]]

    for scaler in instructions.get("scaling", []):
        idx = data[scaler["where"]] == scaler["is"] if "is" in scaler else slice(None)
        factor = (
            getattr(constants, scaler["factor"])
            if isinstance(scaler["factor"], str)
            else scaler["factor"]
        )
        data.loc[idx, scaler["variable"]] *= factor

    if index := instructions.get("index", None):
        data = data.set_index(index)

    if output := instructions.get("output", None):
        data = data[output]

    if isinstance(data, pd.DataFrame) and len(data.columns) == 1:
        data = data.squeeze()

    if instructions.get("format", "") == "dict":
        return data.to_dict()

    return data
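

# Worked example of the read -> filter -> scale -> set_index -> select_output
# pipeline (file and column names are hypothetical; MM_TO_M is the constant the
# docstring above names as an example).
def _example_read_data() -> None:
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        inputs = Path(tmp)
        (inputs / "rain.csv").write_text(
            "site,date,value\nA,2000-01-01,1.5\nB,2000-01-01,2.0\n"
        )
        instructions = {
            "filename": "rain.csv",
            "filters": [{"where": "site", "is": "A"}],
            "scaling": [{"variable": "value", "factor": "MM_TO_M"}],
            "index": "date",
            "output": "value",
        }
        rain = read_data(instructions, inputs)  # pd.Series indexed by "date"
        assert rain["2000-01-01"] == 1.5 * constants.MM_TO_M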


def process_options(options: str) -> dict[str, Any]:
    """Formats the options string as keyword arguments.

    >>> process_options("sep=' ',index_col='datetime'")
    {'sep': ' ', 'index_col': 'datetime'}

    Args:
        options (str): The string with the arguments to process.

    Returns:
        dict[str, Any]: The dictionary with the processed keyword arguments.
    """
    if not options:
        return {}

    # Parse the string as if it were the argument list of a function call, then
    # literal-evaluate each keyword value.
    args = "f({})".format(options)
    tree = ast.parse(args)
    funccall = tree.body[0].value

    kwargs = {arg.arg: ast.literal_eval(arg.value) for arg in funccall.keywords}
    return kwargs
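

# Further illustration (hypothetical options string): `ast.literal_eval` accepts
# any Python literal, so numeric and list-valued keyword arguments round-trip too.
def _example_process_options() -> None:
    opts = process_options("skiprows=1, parse_dates=['date']")
    assert opts == {"skiprows": 1, "parse_dates": ["date"]}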