Coverage for wsimod/validation.py: 0%

81 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-01-11 16:39 +0000

1import ast 

2import os 

3from pathlib import Path 

4from typing import Any, Literal, Optional, Union 

5 

6import pandas as pd 

7import yaml 

8 

9from wsimod.core import constants 

10 

11 

def evaluate_input_file(settings: Path) -> Literal["saved", "custom"]:
    """Decides what type of input file we are dealing with.

    "saved" corresponds to fully constructed models which have been saved,
    alongside any necessary data files. "custom" are input files constructed
    manually.

    Raises:
        ValueError: If the settings file does not exist.

    Return:
        If the input file is a saved model file or a custom input.
    """
    if settings.is_dir() or not settings.exists():
        raise ValueError(
            f"The settings file at {settings.absolute()} could not be found."
        )

    with settings.open("rb") as f:
        contents = yaml.safe_load(f)

    # Only custom input files declare any of these top-level sections.
    custom_markers = {"data", "inputs", "outputs"}
    if custom_markers.intersection(contents.keys()):
        return "custom"

    return "saved"

36 

37 

def validate_io_args(
    settings: Path, inputs: Optional[Path], outputs: Optional[Path]
) -> dict[str, Any]:
    """Validate the io arguments, including their definition in settings.

    This does not include validating the existence of data input files, which
    is done at a later stage.

    Args:
        settings (Path): The path to the YAML settings file containing all the
            configuration required for the simulation.
        inputs (Optional[Path]): Base directory for all input files. If present,
            overwrites value in the settings file.
        outputs (Optional[Path]): Base directory for all output files. If
            present, overwrites value in the settings file.

    Raises:
        ValueError: If the settings file does not exist.

    Returns:
        dict[str, Any]: The loaded settings file with validated inputs and
            outputs.
    """
    if settings.is_dir() or not settings.exists():
        raise ValueError(
            f"The settings file at {settings.absolute()} could not be found."
        )

    with settings.open("rb") as f:
        loaded: dict[str, Any] = yaml.safe_load(f)

    # The directory containing the settings file is the fallback for both
    # the inputs and the outputs base directories.
    fallback = settings.parent

    loaded["inputs"] = _validate_input_dir(
        inputs or loaded.get("inputs", None), default=fallback
    )
    loaded["outputs"] = _validate_output_dir(
        outputs or loaded.get("outputs", None), default=fallback
    )

    return loaded

76 

77 

78def _validate_input_dir(input_dir: Optional[Path], default: Path) -> Path: 

79 """Validates the directory of input files. 

80 

81 If not provided, the default directory is used. 

82 

83 Args: 

84 input_dir (Optional[Path]): The potential directory with the inputs. 

85 default (Path): Default input path if none provided. 

86 

87 Raises: 

88 ValueError: If the inputs base directory is not actually a directory. 

89 

90 Returns: 

91 Path: The validated path containing the inputs. 

92 """ 

93 if not input_dir: 

94 return default.absolute() 

95 

96 input_dir = Path(input_dir).absolute() 

97 if not input_dir.is_dir(): 

98 raise ValueError( 

99 f"The inputs base directory at {input_dir} is not a directory." 

100 ) 

101 return input_dir 

102 

103 

104def _validate_output_dir(output_dir: Optional[Path], default: Path) -> Path: 

105 """Validates the directory for output files. 

106 

107 If not provided, the default path is used. If it does not exist, it is created. 

108 

109 Args: 

110 output_dir (Optional[Path]): The potential directory for the outputs. 

111 default (Path): Defualt output path if none provided. 

112 

113 Raises: 

114 ValueError: If a file with the same name already exist. 

115 

116 Returns: 

117 Path: The validated path containing where outputs will be saved. 

118 """ 

119 if not output_dir: 

120 return default.absolute() 

121 

122 output_dir = Path(output_dir).absolute() 

123 if output_dir.exists() and not output_dir.is_dir(): 

124 raise ValueError(f"A file at {output_dir} exists and is not a directory.") 

125 

126 os.makedirs(output_dir, exist_ok=True) 

127 return output_dir 

128 

129 

def load_data_files(
    data_settings: dict[str, Any], input_dir: Path
) -> dict[str, Union[pd.DataFrame, pd.Series, dict]]:
    """Reads the settings data section and reads the required data from files.

    Args:
        data_settings (dict[str, Any]): The data section of the settings file.
        input_dir (Path): The directory where input files are located.

    Returns:
        dict[str, Union[pd.DataFrame, pd.Series, dict]]: Loaded dataframe,
            series or dictionary for each entry, keyed as "data:<name>".
    """
    loaded: dict[str, Union[pd.DataFrame, pd.Series, dict]] = {}
    for name, instructions in data_settings.items():
        loaded[f"data:{name}"] = read_data(instructions, input_dir)
    return loaded

146 

147 

def assign_data_to_settings(
    settings: dict[str, Any],
    data_settings: dict[str, Union[pd.DataFrame, pd.Series, dict]],
) -> dict[str, Any]:
    """Assigns the loaded data to the right variables in the settings dictionary.

    Search for data references ("data:<name>" strings) is done recursively,
    walking through the whole settings dictionary tree, including dictionaries
    and lists nested at any depth.

    Args:
        settings (dict[str, Any]): The settings dictionary.
        data_settings (dict[str, Union[pd.DataFrame, pd.Series, dict]]): The
            loaded data, keyed as "data:<name>".

    Raises:
        ValueError: If a "data:" reference has no matching entry in
            `data_settings`.

    Returns:
        dict[str, Any]: A new settings dictionary where data references have
            been replaced with the loaded data.
    """
    return {key: _resolve_value(value, data_settings) for key, value in settings.items()}


def _resolve_value(value: Any, data_settings: dict[str, Any]) -> Any:
    """Recursively resolves one settings value, replacing "data:" references."""
    if isinstance(value, dict):
        return assign_data_to_settings(value, data_settings)
    if isinstance(value, list):
        # Resolve each item individually so lists may mix plain scalars,
        # nested dictionaries and "data:" references. (The previous version
        # recursed into every list item as if it were a dict, which crashed
        # on lists of scalars.)
        return [_resolve_value(item, data_settings) for item in value]
    if isinstance(value, str) and value.startswith("data:"):
        try:
            return data_settings[value]
        except KeyError:
            raise ValueError(
                f"{value} could not be found. Did you configure loading that data in"
                " the data section of the settings file?"
            )
    return value

185 

186 

def read_data(
    instructions: dict[str, Any], inputs: Path
) -> Union[pd.DataFrame, pd.Series, dict]:
    """Uses the instructions to load tabular data.

    The instructions are a dictionary of options that define what file to load, how
    to load it and some simple manipulations to do to the loaded pandas Dataframe
    before returning it.

    The keys to control this process are:

    filename: Filename of the data to load, relative to the inputs directory.
    filters (optional): List of filters for the dataframe, each a dictionary in
        the form:
            where: column to filter
            is: value of that column
    scaling (optional): List of variable scaling, each a dictionary of the form:
        where: column to filter (optional)
        is: value of that column (optional)
        variable: name of the column to scale
        factor: unit conversion factor, as defined in `wsimod.core.constants`,
            eg. MM_TO_M
    format (optional): How the output should be provided. If format is `dict` then
        the output is provided as a dictionary, otherwise a Dataframe or a Series
        (if there is only 1 column) is output.
    index (optional): Column(s) to use as index.
    output (optional): Column to provide as output.
    options (optional): Options to pass to the `pandas.read_csv` function.

    The order in which operations are done is:

    read -> filter -> scale -> set_index -> select_output -> convert_format

    Only the `read` step will always happen. The others depend on the inputs.

    Args:
        instructions (dict[str, Any]): A dictionary with instructions to load
            the data.
        inputs (Path): Base directory of inputs.

    Returns:
        Union[pd.DataFrame, pd.Series, dict]: Loaded dataframe, series or dictionary
            following the instructions.
    """
    filename = inputs / Path(instructions["filename"])
    options_: dict[str, Any] = process_options(instructions.get("options", ""))
    # `filename` already includes the inputs base directory; joining it with
    # `inputs` a second time (as the previous version did) only worked by
    # accident when `inputs` was absolute, and broke for relative paths.
    data = pd.read_csv(filename, **options_)

    # `filter_` avoids shadowing the `filter` builtin.
    for filter_ in instructions.get("filters", []):
        data = data.loc[data[filter_["where"]] == filter_["is"]]

    for scaler in instructions.get("scaling", []):
        # Scale the whole column unless a row filter ("where"/"is") is given.
        idx = data[scaler["where"]] == scaler["is"] if "is" in scaler else slice(None)
        # The factor may be a number or the name of a constant in
        # `wsimod.core.constants`.
        factor = (
            getattr(constants, scaler["factor"])
            if isinstance(scaler["factor"], str)
            else scaler["factor"]
        )
        data.loc[idx, scaler["variable"]] *= factor

    if index := instructions.get("index", None):
        data = data.set_index(index)

    if output := instructions.get("output", None):
        data = data[output]

    # Collapse single-column frames to a Series for convenience.
    if isinstance(data, pd.DataFrame) and len(data.columns) == 1:
        data = data.squeeze()

    if instructions.get("format", "") == "dict":
        return data.to_dict()

    return data

259 

260 

def process_options(options: str) -> dict[str, Any]:
    """Formats the options string as keyword arguments.

    >>> process_options("sep=' ',index_col='datetime'")
    {'sep': ' ', 'index_col': 'datetime'}

    Args:
        options (str): The string with the arguments to process.

    Returns:
        dict[str, Any]: The dictionary with the processed keyword arguments.
    """
    if not options:
        return {}

    # Parse the string as if it were a function call, then safely evaluate
    # each keyword value as a Python literal (no arbitrary code execution).
    call = ast.parse(f"f({options})").body[0].value
    return {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}