ldcast_code / scripts /train_nowcaster.py

Upload folder using huggingface_hub

d2f661a verified 6 months ago

4.86 kB

	from datetime import timedelta
	import gc
	import gzip
	import os
	import pickle

	import numpy as np

	from ldcast.features import batch, patches, split, transform

	#file_dir = os.path.dirname(os.path.abspath(__file__))
	file_dir = os.path.dirname("/data/data_WF/ldcast_precipitation/ldcast/")

	def setup_data(
	use_obs=True,
	use_nwp=False,
	obs_vars=("RZC",),
	nwp_vars=(
	"cape", "cin", "rate-cp", "rate-tp", "t2m",
	"tclw", "tcwv", "u", "v"
	),
	nwp_lags=(0,12),
	target_var="RZC",
	batch_size=8,
	past_timesteps=4,
	future_timesteps=20,
	timestep_secs=300,
	nwp_timestep_secs=3600,
	sampler_file=None,
	#chunks_file="./preprocess_data/split_chunks.pkl.gz",
	chunks_file="./data/split_chunks.pkl.gz",
	sample_shape=(4,4)
	):
	target = target_var + "-T"
	predictors_obs = [v + "-O" for v in obs_vars]
	predictors = []
	if use_obs:
	predictors += predictors_obs
	if use_nwp:
	predictors.append("nwp")

	variables = {
	target: {
	"sources": [target_var],
	"timesteps": np.arange(1,future_timesteps+1),
	}
	}
	for (var, raw_var) in zip(predictors_obs, obs_vars):
	variables[var] = {
	"sources": [raw_var],
	"timesteps": np.arange(-past_timesteps+1,1)
	}
	nwp_t1 = int(np.ceil(future_timesteps*timestep_secs/nwp_timestep_secs)) + 2
	nwp_range = np.arange(nwp_t1)
	variables["nwp"] = {
	"sources": nwp_vars,
	"timesteps": nwp_range,
	"timestep_secs": nwp_timestep_secs
	}

	# determine which raw variables are needed, then load them
	raw_vars = set.union(
	*(set(variables[v]["sources"]) for v in predictors_obs+[target])
	)
	if use_nwp:
	for raw_var_base in variables["nwp"]["sources"]:
	raw_vars.update(f"{raw_var_base}-{lag}" for lag in nwp_lags)
	raw = {
	var: patches.load_all_patches(
	os.path.join(file_dir, f"./data/{var}/"), var
	#os.path.join(file_dir, f"./preprocess_data/{var}/"), var
	)
	for var in raw_vars
	}

	# Load pregenerated train/valid/test split data.
	# These can be generated with features.split.get_chunks()
	with gzip.open(os.path.join(file_dir, chunks_file), 'rb') as f:
	chunks = pickle.load(f)
	(raw, _) = split.train_valid_test_split(raw, var, chunks=chunks)

	transform_rain = lambda: transform.default_rainrate_transform(
	raw["train"]["RZC"]["scale"]
	)
	transform_cape = lambda: transform.normalize_threshold(
	log=True,
	threshold=1.0, fill_value=1.0,
	mean=1.530, std=0.859
	)
	transform_rate_tp = lambda: transform.normalize_threshold(
	log=True,
	threshold=1e-5, fill_value=1e-5,
	mean=-3.831, std=0.650
	)
	transform_wind = lambda: transform.normalize(std=9.44)

	transforms = {
	"RZC-T": transform_rain(),
	"RZC-O": transform_rain(),
	"cape": transform_cape(),
	"cin": transform_cape(),
	"rate-tp": transform_rate_tp(),
	"rate-cp": transform_rate_tp(),
	"t2m": transform.normalize(mean=286.069, std=7.323),
	"tclw": transform.normalize_threshold(
	log=True,
	threshold=0.001, fill_value=0.001,
	mean=-1.486, std=0.638
	),
	"tcwv": transform.normalize(std=17.307),
	"u": transform_wind(),
	"v": transform_wind()
	}
	transforms["nwp"] = transform.combine([transforms[v] for v in nwp_vars])
	for (var_name, var_data) in variables.items():
	var_data["transform"] = transforms[var_name]

	if sampler_file is None:
	sampler_file = {
	#"train": "../cache/sampler_nowcaster_train.pkl",
	#"valid": "../cache/sampler_nowcaster_valid.pkl",
	#"test": "../cache/sampler_nowcaster_test.pkl",
	"train": "/data/data_WF/ldcast_precipitation/ldcast/cache/sampler_nowcaster_train.pkl",
	"valid": "/data/data_WF/ldcast_precipitation/ldcast/cache/sampler_nowcaster_valid.pkl",
	"test": "/data/data_WF/ldcast_precipitation/ldcast/cache/sampler_nowcaster_test.pkl",
	}
	bins = np.exp(np.linspace(np.log(0.2), np.log(50), 10))
	datamodule = split.DataModule(
	variables, raw, predictors, target, target,
	forecast_raw_vars=nwp_vars,
	interval=timedelta(seconds=timestep_secs),
	batch_size=batch_size, sampling_bins=bins,
	time_range_sampling=(-past_timesteps+1,future_timesteps+1),
	sampler_file=sampler_file,
	sample_shape=sample_shape,
	valid_seed=1234, test_seed=2345,
	)

	gc.collect()
	return datamodule