Coverage for wsimod\validation.py: 0%

81 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-10-24 11:16 +0100

1import ast 

2import os 

3from pathlib import Path 

4from typing import Any, Literal, Optional, Union 

5 

6import pandas as pd 

7import yaml 

8 

9from wsimod.core import constants 

10 

11 

def evaluate_input_file(settings: Path) -> Literal["saved", "custom"]:
    """Decide what type of input file we are dealing with.

    "saved" corresponds to fully constructed models which have been saved, alongside
    any necessary data files. "custom" are input files constructed manually. A file
    is reported as "custom" as soon as it has at least one of the top-level keys
    ``data``, ``inputs`` or ``outputs``; otherwise it is reported as "saved".

    Args:
        settings (Path): Path to the YAML settings file.

    Raises:
        ValueError: If the settings file does not exist, is a directory, or does
            not contain a YAML mapping (for instance because it is empty).

    Returns:
        Literal["saved", "custom"]: "saved" if the input file is a saved model
            file, "custom" if it is a custom input.
    """
    if settings.is_dir() or not settings.exists():
        raise ValueError(
            f"The settings file at {settings.absolute()} could not be found."
        )

    with settings.open("rb") as f:
        settings_ = yaml.safe_load(f)

    # An empty document loads as None and a top-level list/scalar is not a mapping
    # either; fail with a clear message instead of an AttributeError further down.
    if not isinstance(settings_, dict):
        raise ValueError(
            f"The settings file at {settings.absolute()} does not contain a mapping."
        )

    if {"data", "inputs", "outputs"}.isdisjoint(settings_):
        return "saved"

    return "custom"

36 

37 

def validate_io_args(
    settings: Path, inputs: Optional[Path], outputs: Optional[Path]
) -> dict[str, Any]:
    """Validate the io arguments, including their definition in settings.

    This does not include validating the existence of data input files, which is
    done at a later stage.

    Args:
        settings (Path): The path to the file, in YAML format, containing all the
            configuration required for the simulation.
        inputs (Optional[Path]): Base directory for all input files. If present,
            overrides the value in the settings file. If neither is given, the
            directory containing the settings file is used.
        outputs (Optional[Path]): Base directory for all output files. If present,
            overrides the value in the settings file. If neither is given, the
            directory containing the settings file is used.

    Raises:
        ValueError: If the settings file does not exist, is a directory, or does
            not contain a YAML mapping (for instance because it is empty).

    Returns:
        dict[str, Any]: The loaded settings file with validated inputs and outputs.
    """
    if settings.is_dir() or not settings.exists():
        raise ValueError(
            f"The settings file at {settings.absolute()} could not be found."
        )

    with settings.open("rb") as f:
        settings_ = yaml.safe_load(f)

    # An empty document loads as None and a top-level list/scalar is not a mapping
    # either; fail with a clear message instead of crashing on ``.get`` below.
    if not isinstance(settings_, dict):
        raise ValueError(
            f"The settings file at {settings.absolute()} does not contain a mapping."
        )

    # Validate inputs folder: the explicit argument wins over the settings entry.
    settings_["inputs"] = _validate_input_dir(
        inputs or settings_.get("inputs"), default=settings.parent
    )

    # Validate outputs folder: the explicit argument wins over the settings entry.
    settings_["outputs"] = _validate_output_dir(
        outputs or settings_.get("outputs"), default=settings.parent
    )

    return settings_

76 

77 

def _validate_input_dir(input_dir: Optional[Path], default: Path) -> Path:
    """Resolve and check the base directory holding the input files.

    When no directory is given (``None`` or any other falsy value), the absolute
    form of ``default`` is returned without further checks.

    Args:
        input_dir (Optional[Path]): The potential directory with the inputs.
        default (Path): Directory to fall back to if none is provided.

    Raises:
        ValueError: If the inputs base directory is not actually a directory.

    Returns:
        Path: The absolute path of the directory containing the inputs.
    """
    if input_dir:
        candidate = Path(input_dir).absolute()
        if candidate.is_dir():
            return candidate
        raise ValueError(
            f"The inputs base directory at {candidate} is not a directory."
        )

    return default.absolute()

102 

103 

def _validate_output_dir(output_dir: Optional[Path], default: Path) -> Path:
    """Resolve the directory where output files will be written.

    When no directory is given (``None`` or any other falsy value), the absolute
    form of ``default`` is returned as is. A provided directory is created,
    including missing parents, if it does not exist yet.

    Args:
        output_dir (Optional[Path]): The potential directory for the outputs.
        default (Path): Directory to fall back to if none is provided.

    Raises:
        ValueError: If a file with the same name already exists.

    Returns:
        Path: The absolute path of the directory where outputs will be saved.
    """
    if not output_dir:
        return default.absolute()

    target = Path(output_dir).absolute()
    occupied_by_file = target.exists() and not target.is_dir()
    if occupied_by_file:
        raise ValueError(f"A file at {target} exists and is not a directory.")

    target.mkdir(parents=True, exist_ok=True)
    return target

128 

129 

def load_data_files(
    data_settings: dict[str, Any], input_dir: Path
) -> dict[str, Union[pd.DataFrame, pd.Series, dict]]:
    """Load every dataset described in the data section of the settings.

    Each entry of ``data_settings`` is handed to ``read_data`` and the result is
    stored under the entry name prefixed with ``data:``.

    Args:
        data_settings (dict[str, Any]): The data section of the settings file.
        input_dir (Path): The directory where input files are located.

    Returns:
        dict[str, Union[pd.DataFrame, pd.Series, dict]]: Loaded dataframe, series or
            dictionary for each entry, keyed by ``data:<entry name>``.
    """
    loaded: dict[str, Union[pd.DataFrame, pd.Series, dict]] = {}
    for name, instructions in data_settings.items():
        loaded[f"data:{name}"] = read_data(instructions, input_dir)
    return loaded

146 

147 

def assign_data_to_settings(
    settings: dict[str, Any],
    data_settings: dict[str, Union[pd.DataFrame, pd.Series, dict]],
) -> dict[str, Any]:
    """Assign the loaded data to the right variables in the settings dictionary.

    The search for references to data is done recursively, walking through the
    whole settings dictionary tree, including the items of any list. A reference is
    a string starting with ``data:``; it is replaced by the entry of
    ``data_settings`` stored under that exact string.

    Args:
        settings (dict[str, Any]): The settings dictionary.
        data_settings (dict[str, Union[pd.DataFrame, pd.Series, dict]]): The loaded
            data, keyed by ``data:<name>``.

    Raises:
        ValueError: If a ``data:`` reference has no entry in ``data_settings``.

    Returns:
        dict[str, Any]: A new settings dictionary where data references have been
            replaced by the loaded data. The input dictionary is not modified.
    """
    return {
        key: _resolve_data_references(value, data_settings)
        for key, value in settings.items()
    }


def _resolve_data_references(value: Any, data_settings: dict[str, Any]) -> Any:
    """Return ``value`` with every nested ``data:`` reference replaced by its data."""
    if isinstance(value, dict):
        return assign_data_to_settings(value, data_settings)

    if isinstance(value, list):
        # List items may be dictionaries, references or plain values, so each one
        # is resolved on its own rather than assumed to be a dictionary.
        return [_resolve_data_references(item, data_settings) for item in value]

    if isinstance(value, str) and value.startswith("data:"):
        try:
            return data_settings[value]
        except KeyError:
            # The message already names the missing key; drop the KeyError context.
            raise ValueError(
                f"{value} could not be found. Did you configure loading that data in"
                " the data section of the settings file?"
            ) from None

    return value

185 

186 

def read_data(
    instructions: dict[str, Any], inputs: Path
) -> Union[pd.DataFrame, pd.Series, dict]:
    """Use the instructions to load tabular data.

    The instructions are a dictionary of options that define what file to load, how
    to load it and some simple manipulations to do to the loaded pandas DataFrame
    before returning it.

    The keys to control this process are:

        filename: Filename of the data to load, relative to `inputs`.
        filters (optional): List of filters for the dataframe, each a dictionary in
            the form:
                where: column to filter
                is: value of that column
        scaling (optional): List of variable scaling, each a dictionary of the form:
                where: column to filter (optional)
                is: value of that column (optional)
                variable: name of the column to scale
                factor: unit conversion factor, either a number or the name of a
                    constant defined in `wsimod.core.constants`, eg. MM_TO_M
        format (optional): How the output should be provided. If format is `dict`
            then the output is provided as a dictionary, otherwise a DataFrame or a
            Series (if there is only 1 column) is output.
        index (optional): Column(s) to use as index.
        output (optional): Column to provide as output.
        options (optional): Options to pass to the `pandas.read_csv` function.

    The order in which operations are done is:

        read -> filter -> scale -> set_index -> select_output -> convert_format

    Only the `read` step will always happen. The others depend on the inputs.

    Args:
        instructions (dict[str, Any]): A dictionary with instructions to load the
            data.
        inputs (Path): Base directory of inputs.

    Returns:
        Union[pd.DataFrame, pd.Series, dict]: Loaded dataframe, series or dictionary
            following the instructions.
    """
    filename = inputs / Path(instructions["filename"])

    # With no options there is nothing to parse into keyword arguments.
    raw_options = instructions.get("options", "")
    options_: dict[str, Any] = process_options(raw_options) if raw_options else {}

    # `filename` already includes `inputs`; joining them again would duplicate the
    # base directory whenever `inputs` is a relative path.
    data = pd.read_csv(filename, **options_)

    # `or []` also covers keys that are present in the settings but left empty.
    for condition in instructions.get("filters") or []:
        data = data.loc[data[condition["where"]] == condition["is"]]

    for scaler in instructions.get("scaling") or []:
        # Without an "is" value the scaling applies to every row.
        idx = data[scaler["where"]] == scaler["is"] if "is" in scaler else slice(None)
        # A string factor is the name of a constant; anything else is used as is.
        factor = (
            getattr(constants, scaler["factor"])
            if isinstance(scaler["factor"], str)
            else scaler["factor"]
        )
        data.loc[idx, scaler["variable"]] *= factor

    if index := instructions.get("index", None):
        data = data.set_index(index)

    if output := instructions.get("output", None):
        data = data[output]

    # A single-column frame is collapsed with `squeeze`.
    if isinstance(data, pd.DataFrame) and len(data.columns) == 1:
        data = data.squeeze()

    if instructions.get("format", "") == "dict":
        return data.to_dict()

    return data

259 

260 

def process_options(options: str) -> dict[str, Any]:
    """Turn a string of keyword arguments into a dictionary.

    >>> process_options("sep=' ',index_col='datetime'")
    {'sep': ' ', 'index_col': 'datetime'}

    Args:
        options (str): The string with the arguments to process, written as they
            would appear inside a function call. Values must be Python literals.

    Returns:
        dict[str, Any]: The dictionary with the processed keyword arguments. Empty
            if no options are given.
    """
    if not options:
        return {}

    # Wrap the options in a dummy call so the Python parser splits the keyword
    # arguments; only literal values are evaluated, nothing is executed.
    call_node = ast.parse(f"f({options})").body[0].value

    parsed: dict[str, Any] = {}
    for keyword in call_node.keywords:
        parsed[keyword.arg] = ast.literal_eval(keyword.value)
    return parsed