Summary of Daily Rainfall Runoff Datasets

This notebook summarizes all rainfall-runoff datasets available in the package that provide hydrometeorological data at a daily timestep. It also shows how to access these datasets through the unified interface of the RainfallRunoff class.

At the time this notebook was run, the datasets had already been downloaded. If you run it for the first time, it may therefore take days to complete, or may even fail before reaching the end because of internet connection issues.

[1]:
import os
import site

# Walk four directory levels up from the path that '__file__' resolves to
# (relative to the current working directory) and make that directory
# importable. Shallower alternatives are 2 levels or 1 level up.
wd_dir = os.path.realpath('__file__')
for _level in range(4):
    wd_dir = os.path.dirname(wd_dir)
print(wd_dir)
site.addsitedir(wd_dir)

from datetime import datetime
import textwrap
import time
import matplotlib
# Font configuration applied to matplotlib's global rcParams.
# Options that are currently disabled:
#   "text.usetex": True
#   "font.serif": "Times New Roman"
nice_fonts = {"font.family": "sans-serif"}
matplotlib.rcParams.update(nice_fonts)

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

from easy_mpl.utils import despine_axes

from aqua_fetch.utils import print_info
from aqua_fetch import RainfallRunoff

print_info()
# Path where the data will be downloaded or has previously been downloaded.
# Set the AQUA_FETCH_DATA_PATH environment variable to point the notebook at a
# different location without editing this cell; the original absolute path
# remains the default.
DATA_PATH = os.environ.get(
    'AQUA_FETCH_DATA_PATH',
    '/mnt/storage1/atr/data/gscad_database/raw',
)
/home/atr/AquaFetch
numpy 1.26.4
pandas 2.1.4
aqua_fetch 1.0.1
python 3.12.12 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 20:16:04) [GCC 11.2.0]
os posix
matplotlib 3.8.4
shapefile 2.3.1
xarray 2024.7.0
netCDF4 1.7.4
scipy 1.17.0
fiona 1.10.1
Script Executed on:  04 February 2026 12:46:40
tot_cpus 48
avail_cpus 48
mem_gib 503.5233840942383
[2]:
# Map every dataset name to the directory that holds (or will receive) its files.
datasets = {
    "Arcticnet" : DATA_PATH,
    "Bull" : DATA_PATH,
    "CABra" : DATA_PATH,
    # GRDC Caravan is overshadowing the other datasets
    # so better put it at start
    # NOTE(review): it is currently the fourth entry, not the first -- confirm
    # the intended drawing order.
    "GRDCCaravan": DATA_PATH,
    # "CAMELS_AUS" : os.path.join(DATA_PATH, 'CAMELS_AUS_V1'),
    "CAMELS_AUS": os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_GB" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_BR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_COL" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_US" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CL" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DK" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CH" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_FR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_FI" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_SE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_SK" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_LUX" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_IND" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_NZ" : os.path.join(DATA_PATH, 'CAMELS'),
    "Caravan_DK": DATA_PATH,
    "LamaHCE" : DATA_PATH,
    "LamaHIce" : os.path.join(DATA_PATH, 'LamaHIce_daily'),
    "HYSETS": DATA_PATH,
    "CCAM": DATA_PATH,
    "Japan": DATA_PATH,
    "Ireland": DATA_PATH,
    "Finland": DATA_PATH,
    "Italy": DATA_PATH,
    "Poland": DATA_PATH,
    "Portugal": DATA_PATH,
    "Slovenia": DATA_PATH,
    "Simbi": DATA_PATH,
    "Spain": DATA_PATH,
    "Thailand": DATA_PATH,
    "USGS": DATA_PATH,
}

# One colour per dataset, indexed by the dataset's position in `datasets`.
colors = plt.cm.tab20.colors + plt.cm.tab20b.colors

rets = {}   # dataset name -> scatter artist, used as the legend handle
items = {}  # dataset name -> number of stations, shown in the legend label

# The legend is split into four boxes; each block lists the datasets of one box.
block1 = ['HYSETS', 'Italy', 'CAMELS_COL', 'LamaHCE', 'LamaHIce', "CABra", "CAMELS_US",
          "CAMELS_CL", 'Ireland', 'Spain', 'Poland', 'CAMELS_SE', 'USGS', "Bull", "CAMELS_BR"]

block2 = ['CAMELS_DK', 'CAMELS_FR', 'CAMELS_DE', 'Portugal',
          "CAMELS_GB", "CAMELS_CH", "Caravan_DK"]

block3 = ['Thailand', 'CCAM', 'CAMELS_LUX', 'Finland',
          'CAMELS_IND', "Simbi", #'GRDCCaravan'
          ]

block4 = ['CAMELS_NZ', 'CAMELS_AUS', 'CAMELS_SK', "Japan", 'Arcticnet',
          'CAMELS_FI', 'Slovenia']


def _add_station_legend(ax, sources, handles, counts, anchor, **legend_kws):
    """Draw one legend box for ``sources`` on ``ax`` and return it.

    Parameters
    ----------
    ax :
        Axes that receives the legend.
    sources :
        Dataset names, in the order they should appear in the box.
    handles :
        Mapping of dataset name to the scatter artist used as legend handle.
    counts :
        Mapping of dataset name to the station count shown in the label.
    anchor :
        ``bbox_to_anchor`` of the box; it is placed with ``loc="lower left"``.
    **legend_kws :
        Extra keyword arguments forwarded to ``ax.legend`` (e.g. ``framealpha``).
    """
    legend = ax.legend(
        [handles[src] for src in sources],
        [f"{src} (n={counts[src]})" for src in sources],
        markerscale=12,
        fontsize=8,
        borderpad=0.2,
        labelspacing=0.5,
        title_fontproperties={'weight': 'bold', 'size': 8+2},
        bbox_to_anchor=anchor,
        loc="lower left",
        **legend_kws,
    )
    # A later ax.legend() call replaces the axes' current legend, so every box
    # is also registered as a plain artist to keep it on the figure.
    ax.add_artist(legend)
    return legend


# collect the coords data
coords_data = {}
for src, path in datasets.items():
    ds = RainfallRunoff(src, path=path, verbosity=0)
    coords_data[src] = ds.stn_coords()

# draw the figure
_, ax = plt.subplots(figsize=(10, 12))

# Named `basemap` rather than `map` so the builtin `map` is not shadowed.
basemap = Basemap(ax=ax, resolution='l',
                  #llcrnrlon=-180, llcrnrlat=-60, urcrnrlon=180, urcrnrlat=90
                  )
basemap.drawcoastlines(linewidth=0.3, ax=ax, color="gray", zorder=0)

for idx, src in enumerate(datasets.keys()):

    coords = coords_data[src]
    n_stations = coords.shape[0]

    ret = basemap.scatter(coords['long'].values, coords['lat'].values,
                          marker=".",
                          s=2,
                          linewidths=0.0,
                          color=colors[idx],
                          alpha=1.0,
                          label=f"{src} (n={n_stations})")

    rets[src] = ret
    items[src] = n_stations

# Blocks 1 and 2 are listed alphabetically; blocks 3 and 4 keep their given order.
leg1 = _add_station_legend(ax, sorted(block1), rets, items, (0.001, 0.001), framealpha=0.6)
leg2 = _add_station_legend(ax, sorted(block2), rets, items, (0.34, 0.001))
leg3 = _add_station_legend(ax, block3, rets, items, (0.60, 0.001))
leg4 = _add_station_legend(ax, block4, rets, items, (0.79, 0.71), framealpha=0.5)

despine_axes(ax)
# plt.savefig("rr_stations.png", dpi=600, bbox_inches="tight")
plt.show()
/home/atr/AquaFetch/aqua_fetch/rr/_simbi.py:318: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/atr/AquaFetch/aqua_fetch/rr/_simbi.py:318: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
../_images/_notebooks_rr_summary_2_1.svg

Arcticnet

[3]:
# Open the Arcticnet dataset from the shared data directory and print its summary.
dataset = RainfallRunoff(
    'Arcticnet',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Arcticnet with 106 stations, 27 dynamic and 35 static features

The static features of Arcticnet are the same as those of GSHA.

[4]:
# List the static feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[5]:
# Load the static-feature table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(106, 35)
[6]:
# Print the total NaN count of the table, then show the count per column.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
22
[6]:
EVP_uncertainty(%)      9
HYRIV_ID                0
LRAD_uncertainty(%)     2
P_uncertainty(%)        0
SRAD_uncertainty(%)     0
T_uncertainty(%)        0
agency                  0
area_km2                0
cly_pc_uav              0
ele_mt_uav              0
ero_kh_uav              0
gla_pc_use              0
glc_cl_cmj              0
gwt_cm_cav              0
inu_pc_ult              0
lat                     0
lit_cl_cmj              0
long                    0
pet_uncertainty(%)     11
pnv_cl_cmj              0
prm_pc_use              0
sgr_dk_rav              0
slope_degrees           0
slt_pc_uav              0
snd_pc_uav              0
wet_pc_u01              0
wet_pc_u02              0
wet_pc_u03              0
wet_pc_u04              0
wet_pc_u05              0
wet_pc_u06              0
wet_pc_u07              0
wet_pc_u08              0
wet_pc_u09              0
wind_uncertainty(%)     0
dtype: int64

Find the columns that have at least one NaN value.

[7]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not rendered by the
# notebook, so the frame is handed to `display` (provided by IPython) explicitly.
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
[8]:
# NaN count per column, restricted to the columns that contain any NaN.
nan_mask = df.isna().any()
df.loc[:, nan_mask].isna().sum()
[8]:
EVP_uncertainty(%)      9
LRAD_uncertainty(%)     2
pet_uncertainty(%)     11
dtype: int64
[9]:
# List the dynamic feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature of Arcticnet.

[10]:
# _, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Bull

[11]:
# Open the Bull dataset from the shared data directory and print its summary.
dataset = RainfallRunoff(
    'Bull',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Bull with 484 stations, 55 dynamic and 214 static features
[12]:
# List the static feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
NSE, aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area_fraction_used_for_aggregation,
area_hydroatlas, area_km2, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj, cmi_ix_s01,
cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08, cmi_ix_s09,
cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn, dis_m3_pmx,
dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj, fmh_cl_smj,
for_pc_sse, frac_snow, gauge_name, gdp_ud_sav, gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01,
glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09,
glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17,
glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09,
hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse,
kar_pc_sse, lat, lit_cl_smj, lka_pc_sse, lkv_mc_usu, long, low_prec_dur, low_prec_freq,
moisture_index, nli_ix_sav, non-altered, p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02,
pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10,
pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04,
pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12,
pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03,
pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11,
pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu,
run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02,
snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10,
snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03,
swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11,
swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04,
tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12,
tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03,
wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[13]:
# Load the static-feature table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(484, 214)
[14]:
# Print the total NaN count of the table, then show the count per column.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[14]:
NSE           0
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 214, dtype: int64

Find the columns that have at least one NaN value.

[15]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not rendered by the
# notebook, so the frame is handed to `display` (provided by IPython) explicitly.
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
No NaN values
[16]:
# NaN count per column, restricted to the columns that contain any NaN.
nan_mask = df.isna().any()
df.loc[:, nan_mask].isna().sum()
[16]:
Series([], dtype: float64)
[17]:
# List the dynamic feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook), which could in turn change the feature order
# seen by the fetch call in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_AEMET_max, airtemp_C_AEMET_min, airtemp_C_EMO1arc_max,
airtemp_C_EMO1arc_min, airtemp_C_ERA5Land_max, airtemp_C_ERA5Land_min, airtemp_C_mean_2m,
airtemp_C_mean_AEMET, airtemp_C_mean_EMO1arc, airtemp_C_mean_ERA5Land, dptemp_C_max, dptemp_C_mean,
dptemp_C_min, pcp_mm_AEMET, pcp_mm_BULL, pcp_mm_EMO1arc, pcp_mm_ERA5Land, pet_mm_AEMET,
pet_mm_EMO1arc, pet_mm_ERA5Land, pevap_mm, q_cms_obs, solrad_wm2, solrad_wm2_max, solrad_wm2_min,
streamflow, surface_pressure_max_BULL, surface_pressure_mean_BULL, surface_pressure_min_BULL,
swe_mm, swe_mm_max, swe_mm_min, thermrad_wm2, thermrad_wm2_max, thermrad_wm2_min,
volumetric_soil_water_layer_1_max_BULL, volumetric_soil_water_layer_1_mean_BULL,
volumetric_soil_water_layer_1_min_BULL, volumetric_soil_water_layer_2_max_BULL,
volumetric_soil_water_layer_2_mean_BULL, volumetric_soil_water_layer_2_min_BULL,
volumetric_soil_water_layer_3_max_BULL, volumetric_soil_water_layer_3_mean_BULL,
volumetric_soil_water_layer_3_min_BULL, volumetric_soil_water_layer_4_max_BULL,
volumetric_soil_water_layer_4_mean_BULL, volumetric_soil_water_layer_4_min_BULL,
windspeedu_mps_max_10m, windspeedu_mps_mean_10m, windspeedu_mps_min_10m, windspeedv_mps_max_10m,
windspeedv_mps_mean_10m, windspeedv_mps_min_10m
[18]:
# Fetch all dynamic features for "all" stations; only the second returned
# item is kept and displayed.
_, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[18]:
<xarray.Dataset> Size: 6GB
Dimensions:           (time: 25932, dynamic_features: 55)
Coordinates:
  * time              (time) datetime64[ns] 207kB 1951-01-02 ... 2021-12-31
  * dynamic_features  (dynamic_features) object 440B 'airtemp_C_2m_max' ... '...
Data variables: (12/484)
    BULL_8029         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9177         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_5150         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_10123        (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9270         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_5138         (time, dynamic_features) float64 11MB nan nan ... nan nan
    ...                ...
    BULL_1617         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9137         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9036         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9268         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_2124         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_5025         (time, dynamic_features) float64 11MB nan nan ... nan nan
[19]:
# print total number of nans for each of dynamic feature of Bull.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

CABra

[20]:
# Open the CABra dataset from the shared data directory and print its summary.
dataset = RainfallRunoff(
    'CABra',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CABra with 735 stations, 13 dynamic and 87 static features
[21]:
# List the static feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
ANA_ID, aquif_name, aquif_type, area_km2, aridity_index, baseflow_index, catch_hand, catch_lith,
catch_order, catch_wtd, clim_et, clim_p, clim_pet, clim_quality, clim_rh, clim_srad, clim_tmax,
clim_tmin, clim_wind, cover_bare, cover_crops, cover_forest, cover_grass, cover_main, cover_moss,
cover_shrub, cover_snow, cover_urban, cover_waterp, cover_waters, dist_urban, elev_gauge, elev_max,
elev_mean, elev_min, fdc_slope, gauge_biome, gauge_hreg, gauge_state, hand_class, hdisturb_index,
lat, long, missing_data, ndvi_djf, ndvi_jja, ndvi_mam, ndvi_son, p_seasonality, q_1, q_5, q_95,
q_99, q_cv, q_elasticity, q_hcv, q_hd, q_hf, q_hfd, q_lcv, q_ld, q_lf, q_mean, q_zero,
quality_index, res_area, res_number, res_regulation, res_volume, runoff_coef, series_length,
slope_perc, soil_bulk, soil_carbon, soil_clay, soil_depth, soil_sand, soil_silt, soil_textclass,
soil_type, sub_hconduc, sub_permeability, sub_porosity, water_demand, well_dynamic, well_number,
well_static
[22]:
# Load the static-feature table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(735, 87)
[23]:
# Print the total NaN count of the table, then show the count per column.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[23]:
ANA_ID           0
aquif_name       0
aquif_type       0
area_km2         0
aridity_index    0
                ..
sub_porosity     0
water_demand     0
well_dynamic     0
well_number      0
well_static      0
Length: 87, dtype: int64

Find the columns that have at least one NaN value.

[24]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not rendered by the
# notebook, so the frame is handed to `display` (provided by IPython) explicitly.
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
No NaN values
[25]:
# NaN count per column, restricted to the columns that contain any NaN.
nan_mask = df.isna().any()
df.loc[:, nan_mask].isna().sum()
[25]:
Series([], dtype: float64)
[26]:
# List the dynamic feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook), which could in turn change the feature order
# seen by the fetch call in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
Quality, aet_mm_ens, airtemp_C_ens_max, airtemp_C_ens_min, airtemp_C_mean_ens, pcp_mm_ens,
pet_mm_hg, pet_mm_pm, pet_mm_pt, q_cms_obs, rh_%_ens, solrad_wm2_ens, windspeed_mps_ens
[27]:
# Fetch all dynamic features for "all" stations; only the second returned
# item is kept and displayed.
_, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[27]:
<xarray.Dataset> Size: 838MB
Dimensions:           (time: 10957, dynamic_features: 13)
Coordinates:
  * time              (time) datetime64[ns] 88kB 1980-10-01 ... 2010-09-30
  * dynamic_features  (dynamic_features) <U18 936B 'Quality' ... 'windspeed_m...
Data variables: (12/735)
    1                 (time, dynamic_features) float64 1MB ...
    2                 (time, dynamic_features) float64 1MB ...
    3                 (time, dynamic_features) float64 1MB ...
    4                 (time, dynamic_features) float64 1MB ...
    5                 (time, dynamic_features) float64 1MB ...
    6                 (time, dynamic_features) float64 1MB ...
    ...                ...
    730               (time, dynamic_features) float64 1MB ...
    731               (time, dynamic_features) float64 1MB ...
    732               (time, dynamic_features) float64 1MB ...
    733               (time, dynamic_features) float64 1MB ...
    734               (time, dynamic_features) float64 1MB ...
    735               (time, dynamic_features) float64 1MB ...
[28]:
# print total number of nans for each of dynamic feature of CABra.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_AUS

[29]:
# Open version 1 of CAMELS_AUS from its own sub-directory and print its summary.
camels_aus_v1_path = os.path.join(DATA_PATH, 'CAMELS_AUS_V1')
dataset = RainfallRunoff(
    'CAMELS_AUS',
    path=camels_aus_v1_path,
    version=1,
    verbosity=0,
)
print(dataset)
CAMELS_AUS with 222 stations, 28 dynamic and 166 static features
[30]:
# List the static feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, anngro_mega, anngro_meso, anngro_micro, area_km2, aridity, baseflow_index, carbnatesed,
catchment_di, claya, clayb, confinement, daystart, daystart_P, daystart_Q, distupdamw,
drainage_division, elev_max, elev_mean, elev_min, elev_range, elongratio, end_date, erosivity,
extract_ind_fac, flow_div_fac, flow_regime_di, frac_snow, geol_prim, geol_prim_prop, geol_sec,
geol_sec_prop, gromega_seas, gromeso_seas, gromicro_seas, hdf_mean, high_prec_dur, high_prec_freq,
high_prec_timing, high_q_dur, high_q_freq, igneous, impound_fac, infrastruc_fac, ksat, landuse_fac,
lat, lat_centroid, lc01_extracti, lc03_waterbo, lc04_saltlak, lc05_irrcrop, lc06_irrpast,
lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar, lc11_wetlands, lc14_tussclo, lc15_alpineg,
lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden, lc25_shrbope, lc31_forclos, lc32_foropen,
lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac, long, long_centroid, low_prec_dur,
low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, map_zone, mean_slope_pct, metamorph,
mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4, mrvbf_prop_5, mrvbf_prop_6,
mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status, next_station_ds, notes, npp_1, npp_10,
npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7, npp_8, npp_9, npp_ann, num_nested_within,
nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n, nvis_grasses_e, nvis_grasses_n,
nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n, nvis_woodlands_e, nvis_woodlands_n,
oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1, pop_gt_10, pop_max, pop_mean,
prop_forested, prop_missing_data, q_mean, q_uncert_n, q_uncert_num_curves, q_uncert_q10,
q_uncert_q10_lower, q_uncert_q10_upper, q_uncert_q50, q_uncert_q50_lower, q_uncert_q50_upper,
q_uncert_q90, q_uncert_q90_lower, q_uncert_q90_upper, relief, reliefratio, river_di, river_region,
runoff_ratio, sanda, sedvolc, settlement_fac, silicsed, slope_fdc, solpawhc, solum_thickness,
start_date, state_alt, state_outlet, station_name, strahler, strdensity, stream_elas, unconsoldted,
upsdist, zero_q_freq
[31]:
# Load the static-feature table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(222, 166)
[32]:
# Print the total NaN count of the table, then show the count per column.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
1175
[32]:
station_name         0
drainage_division    0
river_region         0
notes                0
lat                  0
                    ..
npp_8                0
npp_9                0
npp_10               0
npp_11               0
npp_12               0
Length: 166, dtype: int64

Find the columns that have at least one NaN value.

[33]:
# Display only the columns that contain at least one NaN value.
nan_mask = df.isna().any()
df.loc[:, nan_mask]
[33]:
state_alt next_station_ds q_uncert_num_curves q_uncert_n q_uncert_q10 q_uncert_q10_upper q_uncert_q10_lower q_uncert_q50 q_uncert_q50_upper q_uncert_q50_lower q_uncert_q90 q_uncert_q90_upper q_uncert_q90_lower
station_id
912101A NT NaN 3.0 15226.0 0.015122 25.07% -21.06% 0.027200 20.06% -17.82% 0.121670 18.46% -15.13%
912105A NT 912101A 1.0 15232.0 0.016572 196.84% -93.24% 0.031969 129.72% -77.38% 0.161384 49.79% -40.02%
915011A NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
917107A NaN NaN 2.0 15772.0 0.001552 143.47% -66.93% 0.036077 51.70% -37.00% 0.371124 26.85% -22.30%
919003A NaN NaN 1.0 14933.0 0.004731 21.65% -18.16% 0.053229 15.45% -13.59% 1.273285 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
312061 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
314207 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
314213 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
315450 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
318076 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

222 rows × 13 columns

[34]:
# NaN count per column, restricted to the columns that contain any NaN.
nan_mask = df.isna().any()
df.loc[:, nan_mask].isna().sum()
[34]:
state_alt              212
next_station_ds        192
q_uncert_num_curves     56
q_uncert_n              56
q_uncert_q10            56
q_uncert_q10_upper     118
q_uncert_q10_lower     118
q_uncert_q50            56
q_uncert_q50_upper      66
q_uncert_q50_lower      67
q_uncert_q90            56
q_uncert_q90_upper      61
q_uncert_q90_lower      61
dtype: int64
[35]:
# List the dynamic feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
airtemp_C_awap_max, airtemp_C_awap_min, airtemp_C_mean_awap, airtemp_C_mean_silo,
airtemp_C_silo_max, airtemp_C_silo_min, et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO,
evap_syn_SILO, mslp_SILO, pcp_mm_awap, pcp_mm_silo, precipitation_var_AWAP, q_cms_obs, q_mm_obs,
rh_%_silo_tmax, rh_%_silo_tmin, solrad_wm2_awap, solrad_wm2_silo, streamflow_MLd_inclInfilled,
vp_deficit_SILO, vp_hpa_awap, vp_hpa_silo

Print the total number of NaNs for each dynamic feature of CAMELS_AUS.

[36]:
# Fetch all dynamic features for "all" stations; only the second returned
# item is kept and displayed.
_, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[36]:
<xarray.Dataset> Size: 581MB
Dimensions:           (time: 23376, dynamic_features: 28)
Coordinates:
  * time              (time) datetime64[ns] 187kB 1951-01-01 ... 2014-12-31
  * dynamic_features  (dynamic_features) <U27 3kB 'q_cms_obs' ... 'airtemp_C_...
Data variables: (12/222)
    912101A           (time, dynamic_features) float32 3MB ...
    912105A           (time, dynamic_features) float32 3MB ...
    915011A           (time, dynamic_features) float32 3MB ...
    917107A           (time, dynamic_features) float32 3MB ...
    919003A           (time, dynamic_features) float32 3MB ...
    919201A           (time, dynamic_features) float32 3MB ...
    ...                ...
    308799            (time, dynamic_features) float32 3MB ...
    312061            (time, dynamic_features) float32 3MB ...
    314207            (time, dynamic_features) float32 3MB ...
    314213            (time, dynamic_features) float32 3MB ...
    315450            (time, dynamic_features) float32 3MB ...
    318076            (time, dynamic_features) float32 3MB ...
[37]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)
[38]:
# Open version 2 of CAMELS_AUS from the common CAMELS directory and print its summary.
camels_path = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff(
    'CAMELS_AUS',
    path=camels_path,
    version=2,
    verbosity=0,
)
print(dataset)
CAMELS_AUS with 561 stations, 28 dynamic and 187 static features
[39]:
# List the static feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
anngro_mega, anngro_meso, anngro_micro, area_km2, aridity, carbnatesed, catchment_di, claya, clayb,
confinement, daystart, daystart_P, daystart_Q, distupdamw, drainage_division, elev_max, elev_mean,
elev_min, elev_range, elongratio, end_date, erosivity, extract_ind_fac, flow_div_fac,
flow_regime_di, frac_snow, geol_prim, geol_prim_prop, geol_sec, geol_sec_prop, gromega_seas,
gromeso_seas, gromicro_seas, high_prec_dur, high_prec_freq, high_prec_timing, igneous, impound_fac,
infrastruc_fac, ksat, landuse_fac, lat, lat_centroid, lc01_extracti, lc03_waterbo, lc04_saltlak,
lc05_irrcrop, lc06_irrpast, lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar, lc11_wetlands,
lc14_tussclo, lc15_alpineg, lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden, lc25_shrbope,
lc31_forclos, lc32_foropen, lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac, long,
long_centroid, low_prec_dur, low_prec_freq, low_prec_timing, map_zone, mean_slope_pct, metamorph,
mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4, mrvbf_prop_5, mrvbf_prop_6,
mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status, next_station_ds, notes, npp_1, npp_10,
npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7, npp_8, npp_9, npp_ann, num_nested_within,
nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n, nvis_grasses_e, nvis_grasses_n,
nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n, nvis_woodlands_e, nvis_woodlands_n,
oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1, pop_gt_10, pop_max, pop_mean,
prop_forested, prop_missing_data, q_uncert_Q_above, q_uncert_days_above, q_uncert_rmse_all,
q_uncert_rmse_lower, q_uncert_rmse_upper, q_uncert_unique_curves, relief, reliefratio, river_di,
river_region, sanda, sedvolc, settlement_fac, sig_dur_RespTime, sig_dur_high_Q_dur,
sig_dur_low_Q_dur, sig_dur_zero_Q_dur, sig_freq_high_Q_freq, sig_freq_low_Q_freq,
sig_freq_zero_Q_freq, sig_mag_BFI, sig_mag_BaseMag, sig_mag_Q5, sig_mag_Q95, sig_mag_Q_7_day_max,
sig_mag_Q_7_day_min, sig_mag_Q_CoV, sig_mag_Q_mean, sig_mag_Q_skew, sig_mag_Q_var, sig_mag_VarIdx,
sig_other_EventRR, sig_other_PeakDistribution, sig_other_PeakDistribution_low,
sig_other_QP_elasticity, sig_other_RR_seasonality, sig_other_SnowDayRatio, sig_other_SnowStorage,
sig_other_Spearmans_rho, sig_other_StorageFromBase, sig_other_TotalRR,
sig_other_ratio_Event_TotalRR, sig_roc_AC1, sig_roc_AC1_low, sig_roc_BaseRecesK, sig_roc_FDC_slope,
sig_roc_FlashIdx, sig_roc_RLD, sig_roc_RecesK_early, sig_roc_RecesVarSeasonality,
sig_timing_HFD_mean, sig_timing_HFI_mean, silicsed, solpawhc, solum_thickness, start_date,
state_alt, state_outlet, station_name, strahler, strdensity, unconsoldted, upsdist
[40]:
# Load the static-feature table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(561, 187)
[41]:
# Print the total NaN count of the table, then show the count per column.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
1643
[41]:
station_name         0
drainage_division    0
river_region         0
notes                0
lat                  0
                    ..
npp_8                0
npp_9                0
npp_10               0
npp_11               0
npp_12               0
Length: 187, dtype: int64

Find the columns that have at least one NaN value.

[42]:
# Display only the columns that contain at least one NaN value.
nan_mask = df.isna().any()
df.loc[:, nan_mask]
[42]:
state_alt next_station_ds q_uncert_unique_curves q_uncert_rmse_all q_uncert_rmse_lower q_uncert_rmse_upper q_uncert_days_above q_uncert_Q_above sig_mag_VarIdx sig_roc_FDC_slope sig_other_PeakDistribution_low
station_id
912101A NT NaN NaN NaN NaN NaN NaN NaN 0.292867 -1.916733 -2.180623
912105A NT 912101A NaN NaN NaN NaN NaN NaN 0.304694 -1.795139 -1.254491
915011A NaN NaN NaN NaN NaN NaN NaN NaN 1.083646 NaN -6.090788
915206A NaN NaN 25.0 25.172244 6.506520 20.955888 0.078362 19.011459 1.009843 NaN -8.491230
917107A NaN NaN 16.0 53.380009 1168.007627 21.192680 0.132802 12.283859 0.641856 -3.957062 -3.631162
... ... ... ... ... ... ... ... ... ... ... ...
318150 NaN 318181 8.0 13.679565 13.569136 10.168856 0.000000 0.000000 0.459891 -3.489307 -5.351701
318181 NaN NaN 24.0 8.209045 23.363542 5.920785 0.004200 0.450893 0.507649 -3.661257 -5.290249
318191 NaN 318150 11.0 8.226708 12.538870 6.093167 0.000000 0.000000 0.514683 -3.525028 -8.555535
318311 NaN 318150 10.0 19.588965 34.652832 14.517310 0.121428 11.333069 0.678704 -4.723863 -7.717046
319204 NaN NaN 5.0 6.379150 20.465465 4.794664 0.005493 0.536084 0.683732 -5.213989 -6.477004

561 rows × 11 columns

[43]:
# NaN count per column, restricted to the columns that contain any NaN.
nan_mask = df.isna().any()
df.loc[:, nan_mask].isna().sum()
[43]:
state_alt                         544
next_station_ds                   391
q_uncert_unique_curves            102
q_uncert_rmse_all                 102
q_uncert_rmse_lower               102
q_uncert_rmse_upper               102
q_uncert_days_above               102
q_uncert_Q_above                  102
sig_mag_VarIdx                      2
sig_roc_FDC_slope                  91
sig_other_PeakDistribution_low      3
dtype: int64
[44]:
# List the dynamic feature names alphabetically, wrapped to 100 characters.
# sorted() builds a new list; list.sort() would reorder in place whatever list
# the dataset property returned (possibly the dataset's own list -- not
# visible from this notebook).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
airtemp_C_agcd_max, airtemp_C_agcd_min, airtemp_C_mean_agcd, airtemp_C_mean_silo,
airtemp_C_silo_max, airtemp_C_silo_min, et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO,
evap_syn_SILO, mslp_SILO, pcp_mm_agcd, pcp_mm_silo, precipitation_var_AGCD, q_cms_obs, q_mm_obs,
rh_%_silo_tmax, rh_%_silo_tmin, solrad_wm2_silo, streamflow_MLd_inclInfilled, vp_deficit_SILO,
vp_hpa_agcd_h09, vp_hpa_agcd_h15, vp_hpa_silo
[45]:
# Pull every dynamic feature for all stations. fetch() returns a pair; only the
# second item (the dynamic data) is kept and shown as the cell output.
_ignored, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)
dyn_ds
[45]:
<xarray.Dataset> Size: 2GB
Dimensions:           (time: 26388, dynamic_features: 28)
Coordinates:
  * time              (time) datetime64[ns] 211kB 1950-01-01 ... 2022-03-31
  * dynamic_features  (dynamic_features) <U27 3kB 'q_cms_obs' ... 'airtemp_C_...
Data variables: (12/561)
    912101A           (time, dynamic_features) float32 3MB ...
    912105A           (time, dynamic_features) float32 3MB ...
    915011A           (time, dynamic_features) float32 3MB ...
    915206A           (time, dynamic_features) float32 3MB ...
    917107A           (time, dynamic_features) float32 3MB ...
    919003A           (time, dynamic_features) float32 3MB ...
    ...                ...
    318076            (time, dynamic_features) float32 3MB ...
    318150            (time, dynamic_features) float32 3MB ...
    318181            (time, dynamic_features) float32 3MB ...
    318191            (time, dynamic_features) float32 3MB ...
    318311            (time, dynamic_features) float32 3MB ...
    319204            (time, dynamic_features) float32 3MB ...

Print the total number of NaNs for each dynamic feature of CAMELS_AUS version 2 (the loop is left commented out): `for feat, nans in zip(dyn_ds.dynamic_features.data.tolist(), dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()):`

[46]:
#     print(feat, nans)

CAMELS_BR

[47]:
# Open the CAMELS_BR dataset stored in the CAMELS sub-folder of DATA_PATH
# and print its one-line summary.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_BR', path=camels_root, verbosity=0)
print(dataset)
CAMELS_BR with 897 stations, 11 dynamic and 67 static features
[48]:
# Print the names of the static features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.static_features``
# is not reordered in place (the original ``.sort()`` mutated it).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area_ana, area_gsim, area_gsim_quality, area_km2, aridity, asynchronicity, barren_perc,
baseflow_index, bedrock_depth, carb_rocks_perc, clay_perc, consumptive_use, consumptive_use_perc,
crop_mosaic_perc, crop_perc, dom_land_cover, dom_land_cover_perc, elev_gauge, elev_mean, et_mean,
forest_perc, frac_snow, gauge_name, gauge_region, geol_class_1st, geol_class_1st_perc,
geol_class_2nd, geol_class_2nd_perc, geol_permeability, geol_porosity, grass_perc, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, imperv_perc, lat, long,
low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, org_carbon_content, p_mean,
p_seasonality, pet_mean, q_mean, q_quality_control_perc, q_stream_stage_perc, regulation_degree,
reservoirs_vol, runoff_ratio, sand_perc, shrub_perc, silt_perc, slope_degrees, slope_fdc, snow_perc,
stream_elas, water_table_depth, wet_perc, zero_q_freq
[49]:
# Load the static-feature table of the current dataset and report its
# (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(897, 67)
[50]:
# Count missing values per column once, print the grand total, then show the
# ten columns with the most missing values.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column.sort_values(ascending=False).head(10)
133
[50]:
geol_class_2nd            47
area_ana                  43
baseflow_index            18
slope_fdc                 16
frac_snow                  5
high_prec_timing           4
elev_gauge                 0
q_stream_stage_perc        0
q_quality_control_perc     0
area_gsim_quality          0
dtype: int64

Find those columns which have at least one NaN value (not executed for this dataset):

`df.loc[:, (df.isna().sum()>0)]`

`df.loc[:, (df.isna().sum()>0)].isna().sum()`

[51]:
# Print the names of the dynamic features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.dynamic_features``
# is not reordered in place (the original ``.sort()`` mutated it).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_mgb, airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cpc,
pcp_mm_mswep, pet_mm_gleam, q_cms_obs, q_mm_obs
[52]:
# Pull every dynamic feature for all stations. fetch() returns a pair; only the
# second item (the dynamic data) is kept and shown as the cell output.
_ignored, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)
dyn_ds
[52]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 14245, dynamic_features: 11)
Coordinates:
  * time              (time) datetime64[ns] 114kB 1980-01-01 ... 2018-12-31
  * dynamic_features  (dynamic_features) <U14 616B 'q_mm_obs' ... 'q_cms_obs'
Data variables: (12/897)
    58030000          (time, dynamic_features) float64 1MB ...
    57170000          (time, dynamic_features) float64 1MB ...
    39580000          (time, dynamic_features) float64 1MB ...
    41818000          (time, dynamic_features) float64 1MB ...
    58870000          (time, dynamic_features) float64 1MB ...
    42546000          (time, dynamic_features) float64 1MB ...
    ...                ...
    53880000          (time, dynamic_features) float64 1MB ...
    26720000          (time, dynamic_features) float64 1MB ...
    65925000          (time, dynamic_features) float64 1MB ...
    39560000          (time, dynamic_features) float64 1MB ...
    71550000          (time, dynamic_features) float64 1MB ...
    41539998          (time, dynamic_features) float64 1MB ...

Print the total number of NaNs for each dynamic feature of CAMELS-BR (the loop is left commented out): `for feat, nans in zip(dyn_ds.dynamic_features.data.tolist(), dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()):`

[53]:
#     print(feat, nans)

CAMELS_COL

[54]:
# Open the CAMELS_COL dataset stored in the CAMELS sub-folder of DATA_PATH
# and print its one-line summary.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_COL', path=camels_root, verbosity=0)
print(dataset)
CAMELS_COL with 347 stations, 6 dynamic and 255 static features
[55]:
# Print the names of the static features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.static_features``
# is not reordered in place (the original ``.sort()`` mutated it).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aquaculture, Beach, dune and sand spot, C-Pi, C-Sctm, C2P-Sm, D2D3-Sctm, DC-Sctm, DC1-Mmg, E1-Mlg,
E1-Pi, E1-Pm, E1-Sc, E1-St, E1E2-Hi, E1E2-Pi, E1E2-Pm, E1E2-VCm, E2-Pi, E2-Pm, E3-Pi, E3-Sc, E3-Sm,
E3-St, E3N1-Sct, E3N1-Stm, Flooded forest, Forest, Forest plantation, Glacier, Grasslands /
herbaceous, Infrastructure, J-Hf, J-Mlg, J-Pi, J-VCc, J-Vf, J-Vi, J1-Sct, J1J2-VCc, J1J2-VCct,
J1J2-Vf, J2J3-Sm, J3-Sc, J3K1?-Mlg, J?-Mhg, K1-Mlg, K1-Mmg, K1-Pf, K1-Pi, K1-Pm, K1-Pu, K1-Sct,
K1-Sctm, K1-Sm, K1-VCm, K2-Mhg, K2-Mlg, K2-Pf, K2-Pi, K2-Pm, K2-Pu, K2-Vf, K2-Vm, K2-Vu, MP-Mvlg,
MP-Pf, MP3NP1-Mhg, Mangrove, Mining, Mosaic of agriculture and pasture, N1-Sc, N1-Sct, N1-Sm, N1-St,
N1-VCc, N2-Sc, N2-Sm, N2-VCc, N2-Vi, N2-py, N2Q1-Sc, N2Q1-VCc, N2Q1-Vi, NP-Mmg, NP-Pm, NP-VCc,
NP3-Pf, NP3ε-Pm, O-Pf, O-Sm, O-Vf, O1-Pf, OS1-Mlg, OS1-Mmg, OS1-Pf, Other non forest formation,
Other non-vegetated area, P-Pf, P-Pi, P-Sctm, PP3PP4-Mmg, PP4-Pf, PZ-Sm, Palm oil, Q-Vi, Q-Vm, Q-af,
Q-al, Q-d, Q-gl, Q-py, Q-t, Q-vc, Q1-Hi, Q1-Sm, Q1-p, Q2-p, Q2-sw, Q2-vc, Q5, Q95, River, lake or
ocean, Rocky outcrop, S4D1-Mlg, T-Mhg, T-Mlg, T-Mm, T-Mmg, T-Mvlg, T-Pf, T-Pi, T-Pm, T-Pu, T2J1-VCm,
T3-Sm, T3-VCct, T3J-Pi, T?-Sc, Wetland, Wooded sand vegetation, alfisols_perc, andisols_perc,
area_km2, aridisols_perc, aridity, b1-Sct, b1-Sctm, b1?b4-Sct, b1b2-Sctm, b1b2-Stm, b1b5-Stm,
b1k1-Sm, b2b5-Sctm, b2b6-Sm, b2b6-Stm, b2k1-Sm, b2k5-Pm, b4?b6-Stm, b4b5-Mhp, b4b6-Sm, b4k1-Sm,
b5?k6-Sctm, b5b6-Sctm, b5k1-Sm, b5k4-VCm, b5k6-Sm, b6-Vf, b6?k1-Sm, b6k1-Stm, b6k1?-Sctm, b6k2-Mvlg,
b6k5-Sm, b6k6-Stm, baseflow_index, cn_catchment, coal_mine_pit_perc, e3e4-Sm, e5e6-Stm, e6e7-Stm,
e6e7-VCm, e6e8-Sc, e6e9-Sc, e6e9-Sct, e8n2-Sm, e8n2-St, e8n3-Sc, elev_gauge_m, entisols_perc,
equi_slope, eroded_misce_perc, factor_form, gauge_department, gravelius_index , high_prec_dur,
high_prec_freq, high_q_dur, high_q_freq, histosols_perc, inceptisols_perc, k1?k5-Sm, k1k4-Sm,
k1k6-Stm, k2k6-Sm, k5E1-Stm, k6E1-Sm, k6E1-Stm, k6e2-Mhp, lat, long, low_prec_dur, low_prec_freq,
low_q_dur, low_q_freq, mollisols_perc, n1?n5?-VCc, n1n2-Pi, n1n2-Sc, n1n2-St, n2n3-Hi, n3n4-Sm,
n3n5-Sc, n3n5-Sm, n3n5-St, n4n5-Pi, n4n5-Vm, n4n6-Hi, n4n6-Sc, n5n6-Sm, n5n7-Sct, n5n7-VCc, n6n7-Sc,
n6n7-Sm, n6n7-St, non_identi_land_perc, oxisols_perc, perimeter_km, perpetual_snow_perc, q_mean,
rocky_misce_perc, runoff_ratio, slope_fdc, spodosols_perc, stream_elas, streng_chanel ,
tc_Engi_Corps, tc_Johnstone, tc_chow, tc_kirpich, ultisols_perc, urban_perc, vertisols_perc,
water_bodies_perc, ¿K?-Sm, εO-Mlg, εO-Sm
[56]:
# Load the static-feature table of the current dataset and report its
# (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(347, 255)
[57]:
# Count missing values per column once, print the grand total, then show the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
68620
[57]:
gauge_department      0
lat                   0
long                  0
elev_gauge_m          0
aridity               0
                   ...
n1?n5?-VCc          338
N1-VCc              332
n5n7-VCc            333
N2-VCc              317
N2Q1-VCc            330
Length: 255, dtype: int64

find those columns which have at least one NaN value

[58]:
# Keep only the static-feature columns that contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan]
[58]:
urban_perc alfisols_perc andisols_perc aridisols_perc water_bodies_perc entisols_perc spodosols_perc coal_mine_pit_perc histosols_perc inceptisols_perc ... J-VCc K1-VCm b5k4-VCm E1E2-VCm e6e7-VCm n1?n5?-VCc N1-VCc n5n7-VCc N2-VCc N2Q1-VCc
52057060 0.1 NaN 54.0 NaN NaN 5.6 NaN NaN NaN 29.8 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
26207030 1.1 2.4 46.2 NaN 0.6 6.8 NaN NaN 0.5 25.4 ... NaN NaN 5.724778 NaN NaN 3.043919 NaN 2.148854 0.007610 NaN
21247050 NaN NaN 79.4 NaN 0.1 2.0 NaN NaN NaN 13.5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
22057040 NaN NaN 25.3 NaN 0.3 37.7 NaN NaN NaN 33.4 ... 4.931911 NaN NaN NaN NaN NaN NaN NaN NaN NaN
16057020 NaN NaN 13.7 NaN NaN 3.5 NaN NaN NaN 12.5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44017060 0.1 NaN 96.4 NaN NaN NaN NaN NaN 2.6 1.0 ... NaN NaN 0.449921 NaN NaN NaN NaN NaN 6.280313 NaN
52017120 0.1 NaN 28.7 NaN 0.4 4.4 NaN NaN NaN 25.7 ... 2.331993 NaN NaN NaN NaN NaN NaN NaN NaN NaN
16027010 NaN NaN NaN NaN NaN 4.8 NaN NaN NaN 95.2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
52017100 0.4 NaN 38.9 NaN 0.3 NaN NaN NaN NaN 27.2 ... 2.524602 NaN NaN NaN NaN NaN NaN NaN NaN NaN
37017050 0.1 NaN 14.3 NaN 0.1 18.6 NaN NaN 0.3 65.8 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

347 rows × 222 columns

[59]:
# Missing-value count of every column that has at least one NaN,
# kept in the original column order.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[59]:
urban_perc           152
alfisols_perc        221
andisols_perc         56
aridisols_perc       342
water_bodies_perc     85
                    ...
n1?n5?-VCc           338
N1-VCc               332
n5n7-VCc             333
N2-VCc               317
N2Q1-VCc             330
Length: 222, dtype: int64
[60]:
# Print the names of the dynamic features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.dynamic_features``
# is not reordered in place (the original ``.sort()`` mutated it).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs
[61]:
# Pull every dynamic feature for all stations. fetch() returns a pair; only the
# second item (the dynamic data) is kept and shown as the cell output.
_ignored, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)
dyn_ds
[61]:
<xarray.Dataset> Size: 256MB
Dimensions:           (time: 15340, dynamic_features: 6)
Coordinates:
  * time              (time) datetime64[ns] 123kB 1981-01-01 ... 2022-12-31
  * dynamic_features  (dynamic_features) <U14 336B 'pcp_mm' ... 'q_cms_obs'
Data variables: (12/347)
    52057060          (time, dynamic_features) float64 736kB ...
    26207030          (time, dynamic_features) float64 736kB ...
    21247050          (time, dynamic_features) float64 736kB ...
    22057040          (time, dynamic_features) float64 736kB ...
    16057020          (time, dynamic_features) float64 736kB ...
    21057050          (time, dynamic_features) float64 736kB ...
    ...                ...
    54027030          (time, dynamic_features) float64 736kB ...
    44017060          (time, dynamic_features) float64 736kB ...
    52017120          (time, dynamic_features) float64 736kB ...
    16027010          (time, dynamic_features) float64 736kB ...
    52017100          (time, dynamic_features) float64 736kB ...
    37017050          (time, dynamic_features) float64 736kB ...

Print the total number of NaNs for each dynamic feature of CAMELS_COL (the loop is left commented out): `for feat, nans in zip(dyn_ds.dynamic_features.data.tolist(), dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()):`

[62]:
#     print(feat, nans)

CAMELS_CL

[63]:
# Open the CAMELS_CL dataset stored in the CAMELS sub-folder of DATA_PATH
# and print its one-line summary.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CL', path=camels_root, verbosity=0)
print(dataset)
CAMELS_CL with 516 stations, 12 dynamic and 104 static features
[64]:
# Print the names of the static features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.static_features``
# is not reordered in place (the original ``.sort()`` mutated it).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area_km2, aridity_chirps, aridity_cr2met, aridity_mswep, aridity_tmpa, baseflow_index,
big_dam, carb_rocks_frac, crop_frac, dom_land_cover, dom_land_cover_frac, elev_gauge, elev_max,
elev_mean, elev_med, elev_min, forest_frac, fp_frac, fp_nf_index, frac_snow_chirps,
frac_snow_cr2met, frac_snow_mswep, frac_snow_tmpa, gauge_name, geol_class_1st, geol_class_1st_frac,
geol_class_2nd, geol_class_2nd_frac, grass_frac, gw_rights_flow, gw_rights_n, hfd_mean,
high_prec_dur_chirps, high_prec_dur_cr2met, high_prec_dur_mswep, high_prec_dur_tmpa,
high_prec_freq_chirps, high_prec_freq_cr2met, high_prec_freq_mswep, high_prec_freq_tmpa,
high_prec_timing_chirps, high_prec_timing_cr2met, high_prec_timing_mswep, high_prec_timing_tmpa,
high_q_dur, high_q_freq, imp_frac, interv_degree, land_cover_missing, lat, lc_barren, lc_glacier,
location_type, long, low_prec_dur_chirps, low_prec_dur_cr2met, low_prec_dur_mswep,
low_prec_dur_tmpa, low_prec_freq_chirps, low_prec_freq_cr2met, low_prec_freq_mswep,
low_prec_freq_tmpa, low_prec_timing_chirps, low_prec_timing_cr2met, low_prec_timing_mswep,
low_prec_timing_tmpa, low_q_dur, low_q_freq, n_obs, nested_inner, nested_outer, nf_frac,
p_mean_chirps, p_mean_cr2met, p_mean_mswep, p_mean_spread, p_mean_tmpa, p_seasonality_chirps,
p_seasonality_cr2met, p_seasonality_mswep, p_seasonality_tmpa, pet_mean, q_mean, record_period_end,
record_period_start, runoff_ratio_chirps, runoff_ratio_cr2met, runoff_ratio_mswep,
runoff_ratio_tmpa, shrub_frac, slope_fdc, slope_mkm-1, snow_frac, stream_elas_chirps,
stream_elas_cr2met, stream_elas_mswep, stream_elas_tmpa, sur_rights_flow, sur_rights_n, swe_ratio,
wet_frac, zero_q_freq
[65]:
# Load the static-feature table of the current dataset and report its
# (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(516, 104)
[66]:
# Count missing values per column once, print the grand total, then show the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
12185
[66]:
gauge_id
Q5                 278
Q95                278
area_km2             0
aridity_chirps      43
aridity_cr2met       0
                  ...
sur_rights_flow      0
sur_rights_n         0
swe_ratio          397
wet_frac             0
zero_q_freq        278
Length: 104, dtype: int64

find those columns which have at least one NaN value

[67]:
# Keep only the static-feature columns that contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan]
[67]:
gauge_id Q5 Q95 aridity_chirps aridity_tmpa baseflow_index frac_snow_chirps frac_snow_tmpa geol_class_2nd hfd_mean high_prec_dur_chirps ... runoff_ratio_cr2met runoff_ratio_mswep runoff_ratio_tmpa slope_fdc stream_elas_chirps stream_elas_cr2met stream_elas_mswep stream_elas_tmpa swe_ratio zero_q_freq
5411002 NaN NaN 4.3981422 NaN NaN 0.0011156589333 NaN Unconsolidated sediments NaN 1.430267 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1610004 NaN NaN 4.2814881 NaN NaN 0.0962529109889 NaN Intermediate volcanic rocks NaN 1.753666 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
10322003 NaN NaN 0.4007878 NaN NaN 0.0250541627241 NaN Acid plutonic rocks NaN 1.277628 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7308001 NaN NaN 0.8128522 NaN NaN 0.0241605573015 NaN Acid plutonic rocks NaN 1.497126 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5746001 0.22086357569 1.580334352 3.1408824 NaN 0.7183103 0.0000000000000 NaN Unconsolidated sediments 124.6000 1.510638 ... 0.729703067 0.561616773 NaN 1.4288599 0.830534426 0.578472034 0.71464879 NaN 0.003368185140 0.0000000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4503001 0.00347988392 0.341498878 6.6343242 NaN 0.6953424 0.0215145922980 NaN Basic volcanic rocks 145.2222 1.362275 ... 0.317950344 0.130049041 NaN 2.4462387 1.892133343 1.081428663 1.29668444 NaN 0.846802087198 0.0000000000
7320003 NaN NaN 0.6129590 NaN NaN 0.1517049787595 NaN Acid plutonic rocks NaN 1.447368 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11147001 NaN NaN 0.5422394 NaN NaN 0.1853538657177 NaN Unconsolidated sediments NaN 1.179191 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2105005 NaN NaN 14.1817230 NaN NaN 0.0029313804245 NaN Pyroclastics NaN 1.665236 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11335002 NaN NaN 1.0463489 NaN NaN 0.1835252052010 NaN Mixed sedimentary rocks NaN 1.160000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

516 rows × 42 columns

[68]:
# Missing-value count of every column that has at least one NaN,
# kept in the original column order.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[68]:
gauge_id
Q5                         278
Q95                        278
aridity_chirps              43
aridity_tmpa               516
baseflow_index             278
frac_snow_chirps            43
frac_snow_tmpa             516
geol_class_2nd              16
hfd_mean                   278
high_prec_dur_chirps        43
high_prec_dur_tmpa         516
high_prec_freq_chirps       43
high_prec_freq_tmpa        516
high_prec_timing_chirps     43
high_prec_timing_tmpa      516
high_q_dur                 278
high_q_freq                278
location_type              386
low_prec_dur_chirps         43
low_prec_dur_tmpa          516
low_prec_freq_chirps        43
low_prec_freq_tmpa         516
low_prec_timing_chirps      43
low_prec_timing_tmpa       516
low_q_dur                  278
low_q_freq                 278
p_mean_chirps               43
p_mean_tmpa                516
p_seasonality_chirps        43
p_seasonality_tmpa         516
q_mean                     278
runoff_ratio_chirps        297
runoff_ratio_cr2met        278
runoff_ratio_mswep         278
runoff_ratio_tmpa          516
slope_fdc                  278
stream_elas_chirps         297
stream_elas_cr2met         278
stream_elas_mswep          278
stream_elas_tmpa           516
swe_ratio                  397
zero_q_freq                278
dtype: int64
[69]:
# Print the names of the dynamic features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.dynamic_features``
# is not reordered in place (the original ``.sort()`` mutated it).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cr2met, pcp_mm_mswep,
pcp_mm_tmpa, pet_mm_hargreaves, pet_mm_modis, q_cms_obs, q_mm_obs, swe

Print the total number of NaNs for each dynamic feature.

[70]:
# Pull every dynamic feature for all stations. fetch() returns a pair; only the
# second item (the dynamic data) is kept and shown as the cell output.
_ignored, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)
dyn_ds
[70]:
<xarray.Dataset> Size: 2GB
Dimensions:           (time: 38374, dynamic_features: 12)
Coordinates:
  * time              (time) datetime64[ns] 307kB 1913-02-15 ... 2018-03-09
  * dynamic_features  (dynamic_features) <U17 816B 'q_cms_obs' ... 'swe'
Data variables: (12/516)
    5411002           (time, dynamic_features) float64 4MB ...
    1610004           (time, dynamic_features) float64 4MB ...
    10322003          (time, dynamic_features) float64 4MB ...
    7308001           (time, dynamic_features) float64 4MB ...
    5746001           (time, dynamic_features) float64 4MB ...
    5410002           (time, dynamic_features) float64 4MB ...
    ...                ...
    12291001          (time, dynamic_features) float64 4MB ...
    4503001           (time, dynamic_features) float64 4MB ...
    7320003           (time, dynamic_features) float64 4MB ...
    11147001          (time, dynamic_features) float64 4MB ...
    2105005           (time, dynamic_features) float64 4MB ...
    11335002          (time, dynamic_features) float64 4MB ...
[71]:
# Print the number of missing values of each dynamic feature, summed over all
# stations and all time steps.
#
# ``Dataset.to_array`` stacks the per-station variables along a new leading
# dimension (named "station" here), giving dims (station, time, dynamic_features).
# Reducing over "station" and "time" therefore leaves exactly one count per
# entry of the "dynamic_features" coordinate. Reducing over
# ["time", "dynamic_features"] instead yields one count per *station*, which
# must not be paired with feature names.
nan_counts = dyn_ds.to_array(dim="station").isnull().sum(dim=["station", "time"])

for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
q_cms_obs 332807
q_mm_obs 355349
pcp_mm_cr2met 353555
pcp_mm_chirps 337159
pcp_mm_mswep 325973
pcp_mm_tmpa 295225
airtemp_C_min 312397
airtemp_C_max 348927
airtemp_C_mean 313383
pet_mm_modis 326345
pet_mm_hargreaves 338171
swe 342287

CAMELS_CH

[72]:
# Open the CAMELS_CH dataset stored in the CAMELS sub-folder of DATA_PATH
# and print its one-line summary.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CH', path=camels_root, verbosity=0)
print(dataset)
CAMELS_CH with 331 stations, 9 dynamic and 209 static features
[73]:
# Print the names of the static features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.static_features``
# is not reordered in place (the original ``.sort()`` mutated it).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, aap, acid_plutonic, acid_volcanic, amk, api, area_km2, aridity, baseflow_index_landson,
basic_plutonic, basic_volcanic, bulk_dens, bulk_dens_25, bulk_dens_5, bulk_dens_50, bulk_dens_75,
bulk_dens_90, bulk_dens_missing, bulk_dens_skewness, carbonate_sedimentary, clay_perc, clay_perc_25,
clay_perc_5, clay_perc_50, clay_perc_75, clay_perc_90, clay_perc_missing, clay_perc_skewness,
coarse_fragm_perc, coarse_fragm_perc_25, coarse_fragm_perc_5, coarse_fragm_perc_50,
coarse_fragm_perc_75, coarse_fragm_perc_90, coarse_fragm_perc_missing, coarse_fragm_perc_skewness,
conductivity, conductivity_25, conductivity_5, conductivity_50, conductivity_75, conductivity_90,
conductivity_missing, conductivity_skewness, country, crop_perc, dens_inhabitants, dom_land_cover,
dup, dwood_perc, elev_max, elev_mean, elev_min, elev_percentile10, elev_percentile25,
elev_percentile50, elev_percentile75, elev_percentile90, ewood_perc, ext_area_perc, fju,
flat_area_perc, frac_snow, gauge_easting, gauge_elevation, gauge_name, gauge_northing,
geo_log10_permeability, geo_porosity, glac_area, glac_area_neighbours, glac_mass, glac_vol,
grass_perc, hardrock_imperm_perc, hardrock_perc, hes, hfd_mean, high_prec_dur, high_prec_freq,
high_prec_timing, high_q_dur, high_q_freq, hp_count, hp_inst_turb, hp_max_power, hp_qturb, ice_geo,
ice_perc, id6, ind_end_date, ind_number_of_years, ind_start_date, intermediate_plutonic,
inwater_perc, karst_perc, lat, long, loose_rock_perc, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, metamorphics, mixed_sedimentary, mixed_wood_perc, mpk, mps, n_inhabitants,
null_perc, num_reservoir, omm, ood, oos, ops, organic_perc, organic_perc_25, organic_perc_5,
organic_perc_50, organic_perc_75, organic_perc_90, organic_perc_missing, organic_perc_skewness, osm,
p_mean, p_seasonality, pet_mean, porosity, porosity_25, porosity_5, porosity_50, porosity_75,
porosity_90, porosity_missing, porosity_skewness, pyroclastic, q_mean, qua, reservoir_cap,
reservoir_fs, reservoir_he, reservoir_irr, reservoir_nousedata, reservoir_year_first,
reservoir_year_last, rock_perc, root_depth, root_depth_25, root_depth_5, root_depth_50,
root_depth_75, root_depth_90, root_depth_missing, root_depth_skewness, runoff_ratio, sal, sand_perc,
sand_perc_25, sand_perc_5, sand_perc_50, sand_perc_75, sand_perc_90, sand_perc_missing,
sand_perc_skewness, scrub_perc, sign_end_date, sign_number_of_years, sign_start_date,
siliciclastic_sedimentary, silt_perc, silt_perc_25, silt_perc_5, silt_perc_50, silt_perc_75,
silt_perc_90, silt_perc_missing, silt_perc_skewness, slope_degrees, slope_fdc, steep_area_perc,
stream_elas, sus, tie, tot_avail_water, tot_avail_water_25, tot_avail_water_5, tot_avail_water_50,
tot_avail_water_75, tot_avail_water_90, tot_avail_water_missing, tot_avail_water_skewness, ukd,
unconsol_coarse_perc, unconsol_fine_perc, unconsol_imperm_perc, unconsol_medium_perc,
unconsol_sediments, uod, ups, urban_perc, usm, water_body_name, water_body_type, water_geo,
water_perc, wetlands_perc, zero_q_freq
[74]:
# Load the static-feature table of the current dataset and report its
# (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(331, 209)
[75]:
# Count missing values per column once, print the grand total, then show the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2097
[75]:
ind_start_date         0
ind_end_date           0
ind_number_of_years    0
p_mean                 0
pet_mean               0
                      ..
elev_percentile90      0
elev_max               0
slope_degrees          0
flat_area_perc         0
steep_area_perc        0
Length: 209, dtype: int64

find those columns which have at least one NaN value

[76]:
# Keep only the static-feature columns that contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan]
[76]:
p_seasonality frac_snow high_prec_timing low_prec_timing reservoir_he reservoir_fs reservoir_irr reservoir_nousedata reservoir_year_first reservoir_year_last ... baseflow_index_landson hfd_mean Q5 Q95 high_q_freq high_q_dur low_q_freq low_q_dur zero_q_freq silt_perc_skewness
gauge_id
2004 0.159 0.039 jja son NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN -0.252
2007 -0.118 0.170 djf son NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.635
2009 0.078 0.436 jja son 0.999 0.0 0.001 0.0 1914.0 1989.0 ... 0.787 243.282 1.279 6.207 0.000 0.000 0.051 2.000 0.0 0.285
2011 0.106 0.474 son son 0.998 0.0 0.002 0.0 1927.0 1989.0 ... 0.751 263.667 0.821 6.681 0.051 1.000 0.436 1.000 0.0 0.267
2014 0.279 0.223 jja son 1.000 0.0 0.000 0.0 1910.0 2015.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.421
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6007 0.228 0.379 son djf 1.000 0.0 0.000 0.0 2010.0 2010.0 ... 0.715 211.333 1.298 8.789 2.005 1.714 2.451 7.333 0.0 0.393
6008 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.602 188.875 0.632 10.751 4.385 2.593 29.315 8.069 0.0 -0.603
6009 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.318 191.714 0.127 15.376 26.897 2.667 155.228 12.056 0.0 0.310
6010 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.494 198.400 1.002 12.617 12.195 2.103 4.998 3.571 0.0 -0.744
6011 0.272 0.110 NaN djf 1.000 0.0 0.000 0.0 1918.0 2010.0 ... 0.697 204.250 1.165 10.371 0.000 0.000 0.000 0.000 0.0 0.272

331 rows × 26 columns

[77]:
# Missing-value count of every column that has at least one NaN,
# kept in the original column order.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[77]:
p_seasonality              54
frac_snow                  54
high_prec_timing           13
low_prec_timing             5
reservoir_he              223
reservoir_fs              223
reservoir_irr             223
reservoir_nousedata       223
reservoir_year_first      223
reservoir_year_last       223
sign_start_date            42
sign_end_date              42
q_mean                     42
runoff_ratio               42
stream_elas                44
slope_fdc                  42
baseflow_index_landson     42
hfd_mean                   42
Q5                         42
Q95                        42
high_q_freq                42
high_q_dur                 42
low_q_freq                 42
low_q_dur                  42
zero_q_freq                42
silt_perc_skewness          1
dtype: int64
[78]:
# Print the names of the dynamic features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.dynamic_features``
# is not reordered in place (the original ``.sort()`` mutated it).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, q_cms_obs, q_mm_obs, rel_sun_dur(%), swe_mm,
waterlevel(m)

Print the total number of NaNs for each dynamic feature.

[79]:
# Pull every dynamic feature for all stations. fetch() returns a pair; only the
# second item (the dynamic data) is kept and shown as the cell output.
_ignored, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)
dyn_ds
[79]:
<xarray.Dataset> Size: 174MB
Dimensions:           (time: 14610, dynamic_features: 9)
Coordinates:
  * time              (time) datetime64[ns] 117kB 1981-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U14 504B 'airtemp_C_max' ... 'water...
Data variables: (12/331)
    2004              (time, dynamic_features) float32 526kB ...
    2007              (time, dynamic_features) float32 526kB ...
    2009              (time, dynamic_features) float32 526kB ...
    2011              (time, dynamic_features) float32 526kB ...
    2014              (time, dynamic_features) float32 526kB ...
    2016              (time, dynamic_features) float32 526kB ...
    ...                ...
    6006              (time, dynamic_features) float32 526kB ...
    6007              (time, dynamic_features) float32 526kB ...
    6008              (time, dynamic_features) float32 526kB ...
    6009              (time, dynamic_features) float32 526kB ...
    6010              (time, dynamic_features) float32 526kB ...
    6011              (time, dynamic_features) float32 526kB ...
[80]:
# Print the number of missing values of each dynamic feature, summed over all
# stations and all time steps.
#
# ``Dataset.to_array`` stacks the per-station variables along a new leading
# dimension (named "station" here), giving dims (station, time, dynamic_features).
# Reducing over "station" and "time" therefore leaves exactly one count per
# entry of the "dynamic_features" coordinate. Reducing over
# ["time", "dynamic_features"] instead yields one count per *station*, which
# must not be paired with feature names.
nan_counts = dyn_ds.to_array(dim="station").isnull().sum(dim=["station", "time"])

for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
airtemp_C_max 35673
airtemp_C_mean 36038
airtemp_C_min 6453
pcp_mm 6453
q_cms_obs 45534
q_mm_obs 6453
rel_sun_dur(%) 35673
swe_mm 6453
waterlevel(m) 6453

CAMELS_DE

[81]:
# Open the CAMELS_DE dataset stored in the CAMELS sub-folder of DATA_PATH
# and print its one-line summary.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DE', path=camels_root, verbosity=0)
print(dataset)
CAMELS_DE with 1582 stations, 21 dynamic and 111 static features
[82]:
# Print the names of the static features in sorted order, wrapped at 100 characters.
# sorted() builds a new list, so the object returned by ``dataset.static_features``
# is not reordered in place (the original ``.sort()`` mutated it).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
NSE_hbv, NSE_lstm, Q5, Q95, agricultural_areas_perc, aquifer_aquitard_mixed_perc, aquifer_perc,
aquitard_perc, area_km2, area_metadata, artificial_surfaces_perc, bulk_density_0_30cm_mean,
bulk_density_100_200cm_mean, bulk_density_30_100cm_mean, cavity_fissure_karst_perc,
cavity_fissure_perc, cavity_fissure_pores_perc, cavity_pores_perc, clay_0_30cm_mean,
clay_100_200cm_mean, clay_30_100cm_mean, coarse_fragments_0_30cm_mean,
coarse_fragments_100_200cm_mean, coarse_fragments_30_100cm_mean, consolidation_solid_rock_perc,
consolidation_unconsolidated_rock_perc, dams_names, dams_num, dams_purposes, dams_river_names,
dams_total_lake_area, dams_total_lake_volume, dams_year_first, dams_year_last, elev_5, elev_50,
elev_95, elev_max, elev_mean, elev_min, federal_state, flow_perc_complete, flow_period_end,
flow_period_start, forests_and_seminatural_areas_perc, frac_snow, gauge_easting, gauge_elev,
gauge_elev_metadata, gauge_name, gauge_northing,
geochemical_rocktype_anthropogenically_modified_through_filling_perc,
geochemical_rocktype_carbonatic_perc, geochemical_rocktype_halitic_perc,
geochemical_rocktype_silicate_carbonatic_perc,
geochemical_rocktype_silicate_organic_components_perc, geochemical_rocktype_silicate_perc,
geochemical_rocktype_sulfatic_halitic_perc, geochemical_rocktype_sulfatic_perc, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, kf_extremely_low_perc,
kf_high_perc, kf_highly_variable_perc, kf_low_perc, kf_low_to_extremely_low_perc, kf_medium_perc,
kf_medium_to_moderate_perc, kf_moderate_perc, kf_moderate_to_low_perc, kf_very_high_perc,
kf_very_high_to_high_perc, kf_very_low_perc, lat, long, low_prec_dur, low_prec_freq,
low_prec_timing, low_q_dur, low_q_freq, no_data_perc, p_mean, p_seasonality, provider_id, q_mean,
rocktype_magmatite_perc, rocktype_metamorphite_perc, rocktype_sediment_perc, runoff_ratio,
sand_0_30cm_mean, sand_100_200cm_mean, sand_30_100cm_mean, silt_0_30cm_mean, silt_100_200cm_mean,
silt_30_100cm_mean, slope_, soil_organic_carbon_0_30cm_mean, soil_organic_carbon_100_200cm_mean,
soil_organic_carbon_30_100cm_mean, testing_perc_complete, training_perc_complete,
validation_perc_complete, water_bodies_perc, water_body_name, waterbody_perc, wetlands_perc,
zero_q_freq
[83]:
# Load the static-feature table of the current dataset and report its
# (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(1582, 111)
[84]:
# Count missing values per column once, print the grand total, then show the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
6914
[84]:
p_mean            0
p_seasonality     0
frac_snow         0
high_prec_freq    0
high_prec_dur     0
                 ..
elev_min          0
elev_5            0
elev_50           0
elev_95           0
elev_max          0
Length: 111, dtype: int64

find those columns which have at least one NaN value

[85]:
# Keep only the static-feature columns that contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan]
[85]:
high_prec_timing low_prec_timing dams_names dams_river_names dams_year_first dams_year_last dams_total_lake_area dams_total_lake_volume dams_purposes NSE_lstm NSE_hbv gauge_elev_metadata
gauge_id
DE210420 jja son NaN NaN NaN NaN 0.0 0.0 NaN 0.940 0.727 501.73
DE110460 djf son NaN NaN NaN NaN 0.0 0.0 NaN 0.950 0.866 169.69
DEG10560 jja son NaN NaN NaN NaN 0.0 0.0 NaN 0.814 0.705 170.22
DEC10240 djf jja NaN NaN NaN NaN 0.0 0.0 NaN 0.875 0.809 275.21
DE911960 jja mam NaN NaN NaN NaN 0.0 0.0 NaN 0.895 0.837 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
DEE11120 jja mam NaN NaN NaN NaN 0.0 0.0 NaN 0.778 NaN 27.17
DE112020 jja mam NaN NaN NaN NaN 0.0 0.0 NaN -0.528 -0.854 202.57
DEA11880 djf mam NaN NaN NaN NaN 0.0 0.0 NaN 0.862 0.782 126.68
DE410230 jja mam NaN NaN NaN NaN 0.0 0.0 NaN 0.718 -0.062 77.25
DE411410 jja mam NaN NaN NaN NaN 0.0 0.0 NaN 0.326 -0.449 42.33

1582 rows × 12 columns

[86]:
# Missing-value count of every column that has at least one NaN,
# kept in the original column order.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[86]:
high_prec_timing             8
low_prec_timing              3
dams_names                1263
dams_river_names          1263
dams_year_first           1274
dams_year_last            1274
dams_total_lake_area        41
dams_total_lake_volume       2
dams_purposes             1264
NSE_lstm                    43
NSE_hbv                    157
gauge_elev_metadata        322
dtype: int64
[87]:
# List the dynamic feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.dynamic_features`` returns is not reordered in
# place before it is passed to ``dataset.fetch`` in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_max, pcp_mm_mean, pcp_mm_median, pcp_mm_min,
pcp_mm_std, q_cms_obs, q_mm_obs, rh_%, rh_%_max, rh_%_med, rh_%_min, rh_%_std, solrad_wm2_max,
solrad_wm2_mean, solrad_wm2_med, solrad_wm2_min, solrad_wm2_std, water_level_obs

Print the total number of NaNs for each dynamic feature.

[88]:
# Fetch every dynamic feature for "all" stations. Only the second element of
# the returned pair is kept; the first is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# Trailing expression so the notebook displays the dataset repr.
dyn_ds
[88]:
<xarray.Dataset> Size: 7GB
Dimensions:           (time: 25568, dynamic_features: 21)
Coordinates:
  * time              (time) datetime64[ns] 205kB 1951-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U15 1kB 'q_cms_obs' ... 'airtemp_C_...
Data variables: (12/1582)
    DE210420          (time, dynamic_features) float64 4MB ...
    DE110460          (time, dynamic_features) float64 4MB ...
    DEG10560          (time, dynamic_features) float64 4MB ...
    DEC10240          (time, dynamic_features) float64 4MB ...
    DE911960          (time, dynamic_features) float64 4MB ...
    DE211460          (time, dynamic_features) float64 4MB ...
    ...                ...
    DE710050          (time, dynamic_features) float64 4MB ...
    DEE11120          (time, dynamic_features) float64 4MB ...
    DE112020          (time, dynamic_features) float64 4MB ...
    DEA11880          (time, dynamic_features) float64 4MB ...
    DE410230          (time, dynamic_features) float64 4MB ...
    DE411410          (time, dynamic_features) float64 4MB ...
[89]:
# Count NaNs per dynamic feature, pooled over all stations and all time steps.
# ``to_array`` stacks the per-station data variables along a new leading
# dimension (named "station" here), so the reduction has to run over "station"
# and "time". Reducing over "dynamic_features" instead would leave one value
# per station, and zip() would silently pair the first stations' totals with
# the feature names.
nan_counts = dyn_ds.to_array(dim="station").isnull().sum(dim=["station", "time"])

for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
):
    print(feat, nans)
q_cms_obs 25973
q_mm_obs 12477
water_level_obs 18445
pcp_mm_mean 20831
pcp_mm_min 38169
pcp_mm_median 16985
pcp_mm_max 14796
pcp_mm_std 19907
rh_% 20637
rh_%_min 23941
rh_%_med 24627
rh_%_max 19817
rh_%_std 34512
solrad_wm2_mean 29768
solrad_wm2_min 38933
solrad_wm2_med 26116
solrad_wm2_max 55704
solrad_wm2_std 11139
airtemp_C_mean 35612
airtemp_C_min 37074
airtemp_C_max 32089

CAMELS_DK

[90]:
# Create the CAMELS_DK dataset object (data directory: DATA_PATH/CAMELS) and
# print its summary line.
dataset = RainfallRunoff('CAMELS_DK', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_DK with 304 stations, 13 dynamic and 119 static features
[91]:
# List the static feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.static_features`` returns is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
FC, HCC, KS, MRC, THS, WP, area_km2, aridity, bulk_density, catch_accum_number, catch_flow_dir,
chalk_d, dem_max, dem_mean, dem_median, dem_min, frac_snow_daily, gauge_record_pct, gauged_type,
high_prec_dur, high_prec_freq, high_prec_timing, lat, long, low_prec_dur, low_prec_freq,
low_prec_timing, p_mean, p_seasonality, pct_aeolain_sand, pct_agriculture_corine_1990,
pct_agriculture_corine_2000, pct_agriculture_corine_2006, pct_agriculture_corine_2012,
pct_agriculture_corine_2018, pct_agriculture_levin_2011, pct_agriculture_levin_2016,
pct_agriculture_levin_2018, pct_agriculture_levin_2021, pct_beach, pct_clay, pct_claynor_100,
pct_claynor_200, pct_claynor_30, pct_claynor_60, pct_down_sand, pct_flat_area,
pct_forest_corine_1990, pct_forest_corine_2000, pct_forest_corine_2006, pct_forest_corine_2012,
pct_forest_corine_2018, pct_forest_levin_2011, pct_forest_levin_2016, pct_forest_levin_2018,
pct_forest_levin_2021, pct_fsandno_100, pct_fsandno_200, pct_fsandno_30, pct_fsandno_60,
pct_glaf_sand, pct_glal_clay, pct_glam_clay, pct_gravel, pct_gsandno_100, pct_gsandno_200,
pct_gsandno_30, pct_gsandno_60, pct_marine_sand, pct_marsh, pct_naturedry_levin_2011,
pct_naturedry_levin_2016, pct_naturedry_levin_2018, pct_naturedry_levin_2021,
pct_naturewet_levin_2011, pct_naturewet_levin_2016, pct_naturewet_levin_2018,
pct_naturewet_levin_2021, pct_organic, pct_sand, pct_sandy_till, pct_silt, pct_till,
pct_urban_corine_1990, pct_urban_corine_2000, pct_urban_corine_2006, pct_urban_corine_2012,
pct_urban_corine_2018, pct_urban_levin_2011, pct_urban_levin_2016, pct_urban_levin_2018,
pct_urban_levin_2021, pct_water_corine_1990, pct_water_corine_2000, pct_water_corine_2006,
pct_water_corine_2012, pct_water_corine_2018, pct_water_deposit, pct_water_levin_2011,
pct_water_levin_2016, pct_water_levin_2018, pct_water_levin_2021, pct_wetlands_corine_1990,
pct_wetlands_corine_2000, pct_wetlands_corine_2006, pct_wetlands_corine_2012,
pct_wetlands_corine_2018, pet_mean, root_depth, slope_max, slope_median, slope_min, slope_mkm-1,
t_mean, tawc, uaquifer_d, uaquifer_t, uclay_t, usand_t
[92]:
# Fetch the table of static features and print its shape as (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(304, 119)
[93]:
# Total number of NaN cells in the whole table, then the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
23
[93]:
FC            0
HCC           0
KS            0
MRC           0
THS           0
             ..
tawc          0
uaquifer_d    3
uaquifer_t    3
uclay_t       3
usand_t       3
Length: 119, dtype: int64

Find those columns which have at least one NaN value.

[94]:
# Keep only the columns in which at least one entry is NaN.
df.loc[:, df.isna().any()]
[94]:
chalk_d gauge_record_pct uaquifer_d uaquifer_t uclay_t usand_t
42320511 52.478255 100.000000 8.454527 5.234260 5.499425 1.980980
37450497 106.906431 100.000000 2.380543 25.008347 1.601028 11.624057
71220004 10.041812 64.511570 10.011749 49.907923 9.872605 0.011285
43600043 467.241006 86.680798 149.076457 8.253941 13.736824 0.778939
32400556 57.685100 54.840134 20.096662 12.086162 14.430148 0.596264
... ... ... ... ... ... ...
62230402 66.995727 96.776188 24.909207 17.469369 21.827138 0.413106
13230734 504.407802 100.000000 3.142049 26.288778 2.588668 6.955276
31100765 78.142761 100.000000 12.946495 5.115810 12.266228 0.411408
32160004 116.768897 58.063946 23.504958 11.199836 22.411797 0.890127
35321391 494.303796 100.000000 0.819501 47.968283 0.667293 35.446263

304 rows × 6 columns

[95]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[95]:
chalk_d             3
gauge_record_pct    8
uaquifer_d          3
uaquifer_t          3
uclay_t             3
usand_t             3
dtype: int64
[96]:
# List the dynamic feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.dynamic_features`` returns is not reordered in
# place before it is passed to ``dataset.fetch`` in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
Abstraction, DKM_dtp, DKM_gwh, DKM_irr, DKM_sdr, DKM_sre, DKM_wcr, Qdkm, aet_mm, airtemp_C_mean,
pcp_mm, pet_mm, q_cms_obs
[97]:
# Fetch every dynamic feature for "all" stations. Only the second element of
# the returned pair is kept; the first is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# Trailing expression so the notebook displays the dataset repr.
dyn_ds
[97]:
<xarray.Dataset> Size: 202MB
Dimensions:           (time: 12782, dynamic_features: 13)
Coordinates:
  * time              (time) datetime64[ns] 102kB 1989-01-02 ... 2023-12-31
  * dynamic_features  (dynamic_features) <U14 728B 'Abstraction' ... 'q_cms_obs'
Data variables: (12/304)
    42320511          (time, dynamic_features) float32 665kB ...
    37450497          (time, dynamic_features) float32 665kB ...
    71220004          (time, dynamic_features) float32 665kB ...
    43600043          (time, dynamic_features) float32 665kB ...
    32400556          (time, dynamic_features) float32 665kB ...
    41200020          (time, dynamic_features) float32 665kB ...
    ...                ...
    71270810          (time, dynamic_features) float32 665kB ...
    62230402          (time, dynamic_features) float32 665kB ...
    13230734          (time, dynamic_features) float32 665kB ...
    31100765          (time, dynamic_features) float32 665kB ...
    32160004          (time, dynamic_features) float32 665kB ...
    35321391          (time, dynamic_features) float32 665kB ...
[98]:
# Print the total number of NaNs for each dynamic feature.
# NOTE(review): the loop is commented out in this cell. Before re-enabling it,
# check the reduction: summing over ["time", "dynamic_features"] yields one
# value per station variable, not one per dynamic feature -- TODO confirm.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_FI

[99]:
# Create the CAMELS_FI dataset object (data directory: DATA_PATH) and print
# its summary line.
dataset = RainfallRunoff('CAMELS_FI', path=DATA_PATH, verbosity=0)
print(dataset)
CAMELS_FI with 320 stations, 16 dynamic and 106 static features
[100]:
# List the static feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.static_features`` returns is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area_km2, aridity, bares_perc_2000, bares_perc_2006, bares_perc_2012, bares_perc_2018,
baseflow_index_ladson, baseflow_index_lfstat, basin_id, basin_name, bedrock_perc, clay_perc,
coarse_perc, crop_frac_2000, crop_frac_2006, crop_frac_2012, crop_frac_2018, cross_border_perc,
dwood_perc_2000, dwood_perc_2006, dwood_perc_2012, dwood_perc_2018, elev_10, elev_90,
elev_catch_max_m, elev_gauge_m, elev_max, elev_mean, elev_min, elev_range, ewood_perc_2000,
ewood_perc_2006, ewood_perc_2012, ewood_perc_2018, frac_snow, gauge_easting, gauge_name,
gauge_northing, grass_frac_2000, grass_frac_2006, grass_frac_2012, grass_frac_2018, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, ice_correction,
inwater_perc_2000, inwater_perc_2006, inwater_perc_2012, inwater_perc_2018, lat, long, low_prec_dur,
low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, nestedness, num_dam, num_inhabitants,
num_regulation_other, num_reservoir, owner_id, owner_name, p_mean, p_seasonality, peat_perc,
pet_mean, pop_density_km2, q_mean, reference_gauge, regulation_level, reservoir_cap, runoff_ratio,
shrub_perc_2000, shrub_perc_2006, shrub_perc_2012, shrub_perc_2018, sign_end_date,
sign_number_of_obs, sign_number_of_years, sign_start_date, silt_perc, slope_fdc, slope_percent,
soil_depth_m, stream_elas, temperature_mean, till_perc, timeseries_number_of_years, urban_frac_2000,
urban_frac_2006, urban_frac_2012, urban_frac_2018, water_region_code, water_region_name,
wetland_perc_2000, wetland_perc_2006, wetland_perc_2012, wetland_perc_2018, zero_q_freq
[101]:
# Fetch the table of static features and print its shape as (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(320, 106)
[102]:
# Total number of NaN cells in the whole table, then the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
22
[102]:
crop_frac_2000          0
grass_frac_2000         0
shrub_perc_2000         0
dwood_perc_2000         0
ewood_perc_2000         0
                       ..
num_dam                 0
num_reservoir           0
reservoir_cap           0
num_regulation_other    0
regulation_level        0
Length: 106, dtype: int64

Find those columns which have at least one NaN value.

[103]:
# Keep only the columns in which at least one entry is NaN.
df.loc[:, df.isna().any()]
[103]:
slope_fdc baseflow_index_ladson baseflow_index_lfstat high_prec_timing low_prec_timing
gauge_id
1210-2685-2687 3.82 0.41 0.34 jja mam
1289 2.41 0.69 0.80 jja mam
930 3.44 0.56 0.72 jja mam
3218 3.51 0.54 0.55 jja mam
1095 2.88 0.50 0.46 jja mam
... ... ... ... ... ...
1307 1.67 0.74 0.61 jja mam
2726 2.55 0.64 0.63 jja mam
3602 1.55 0.80 0.88 jja mam
1223 8.76 0.41 0.37 jja mam
3096 3.03 0.53 0.58 jja mam

320 rows × 5 columns

[104]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[104]:
slope_fdc                4
baseflow_index_ladson    2
baseflow_index_lfstat    2
high_prec_timing         5
low_prec_timing          9
dtype: int64
[105]:
# List the dynamic feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.dynamic_features`` returns is not reordered in
# place before it is passed to ``dataset.fetch`` in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pe_era5_land, pet_fmi, pet_mm, q_cms_obs,
q_mm_obs, radiation_global, rh_%, snow_evaporation, snowdepth_m, swe_mm_cci3-1, swe_mm_era5,
temperature_gmin
[106]:
# Fetch every dynamic feature for "all" stations. Only the second element of
# the returned pair is kept; the first is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# Trailing expression so the notebook displays the dataset repr.
dyn_ds
[106]:
<xarray.Dataset> Size: 943MB
Dimensions:           (time: 23010, dynamic_features: 16)
Coordinates:
  * time              (time) datetime64[ns] 184kB 1961-01-01 ... 2023-12-31
  * dynamic_features  (dynamic_features) <U16 1kB 'q_cms_obs' ... 'radiation_...
Data variables: (12/320)
    1210-2685-2687    (time, dynamic_features) float64 3MB ...
    1289              (time, dynamic_features) float64 3MB ...
    930               (time, dynamic_features) float64 3MB ...
    3218              (time, dynamic_features) float64 3MB ...
    1095              (time, dynamic_features) float64 3MB ...
    2716              (time, dynamic_features) float64 3MB ...
    ...                ...
    1046              (time, dynamic_features) float64 3MB ...
    1307              (time, dynamic_features) float64 3MB ...
    2726              (time, dynamic_features) float64 3MB ...
    3602              (time, dynamic_features) float64 3MB ...
    1223              (time, dynamic_features) float64 3MB ...
    3096              (time, dynamic_features) float64 3MB ...
[107]:
# Print the total number of NaNs for each dynamic feature.
# NOTE(review): the loop is commented out in this cell. Before re-enabling it,
# check the reduction: summing over ["time", "dynamic_features"] yields one
# value per station variable, not one per dynamic feature -- TODO confirm.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_FR

[108]:
# Create the CAMELS_FR dataset object (data directory: DATA_PATH) and print
# its summary line.
dataset = RainfallRunoff('CAMELS_FR', path=DATA_PATH, verbosity=0)
print(dataset)
CAMELS_FR with 654 stations, 22 dynamic and 344 static features
[109]:
# List the static feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.static_features`` returns is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_km2, clc_1990_lvl1_1, clc_1990_lvl1_2, clc_1990_lvl1_3, clc_1990_lvl1_4, clc_1990_lvl1_5,
clc_1990_lvl1_dom_class, clc_1990_lvl1_na, clc_1990_lvl2_11, clc_1990_lvl2_12, clc_1990_lvl2_13,
clc_1990_lvl2_14, clc_1990_lvl2_21, clc_1990_lvl2_22, clc_1990_lvl2_23, clc_1990_lvl2_24,
clc_1990_lvl2_31, clc_1990_lvl2_32, clc_1990_lvl2_33, clc_1990_lvl2_41, clc_1990_lvl2_42,
clc_1990_lvl2_51, clc_1990_lvl2_52, clc_1990_lvl2_dom_class, clc_1990_lvl2_na, clc_1990_lvl3_111,
clc_1990_lvl3_112, clc_1990_lvl3_121, clc_1990_lvl3_122, clc_1990_lvl3_123, clc_1990_lvl3_124,
clc_1990_lvl3_131, clc_1990_lvl3_132, clc_1990_lvl3_133, clc_1990_lvl3_141, clc_1990_lvl3_142,
clc_1990_lvl3_211, clc_1990_lvl3_212, clc_1990_lvl3_213, clc_1990_lvl3_221, clc_1990_lvl3_222,
clc_1990_lvl3_223, clc_1990_lvl3_231, clc_1990_lvl3_241, clc_1990_lvl3_242, clc_1990_lvl3_243,
clc_1990_lvl3_244, clc_1990_lvl3_311, clc_1990_lvl3_312, clc_1990_lvl3_313, clc_1990_lvl3_321,
clc_1990_lvl3_322, clc_1990_lvl3_323, clc_1990_lvl3_324, clc_1990_lvl3_331, clc_1990_lvl3_332,
clc_1990_lvl3_333, clc_1990_lvl3_334, clc_1990_lvl3_335, clc_1990_lvl3_411, clc_1990_lvl3_412,
clc_1990_lvl3_421, clc_1990_lvl3_422, clc_1990_lvl3_423, clc_1990_lvl3_511, clc_1990_lvl3_512,
clc_1990_lvl3_521, clc_1990_lvl3_522, clc_1990_lvl3_523, clc_1990_lvl3_dom_class, clc_1990_lvl3_na,
clc_2018_lvl1_1, clc_2018_lvl1_2, clc_2018_lvl1_3, clc_2018_lvl1_4, clc_2018_lvl1_5,
clc_2018_lvl1_dom_class, clc_2018_lvl1_na, clc_2018_lvl2_11, clc_2018_lvl2_12, clc_2018_lvl2_13,
clc_2018_lvl2_14, clc_2018_lvl2_21, clc_2018_lvl2_22, clc_2018_lvl2_23, clc_2018_lvl2_24,
clc_2018_lvl2_31, clc_2018_lvl2_32, clc_2018_lvl2_33, clc_2018_lvl2_41, clc_2018_lvl2_42,
clc_2018_lvl2_51, clc_2018_lvl2_52, clc_2018_lvl2_dom_class, clc_2018_lvl2_na, clc_2018_lvl3_111,
clc_2018_lvl3_112, clc_2018_lvl3_121, clc_2018_lvl3_122, clc_2018_lvl3_123, clc_2018_lvl3_124,
clc_2018_lvl3_131, clc_2018_lvl3_132, clc_2018_lvl3_133, clc_2018_lvl3_141, clc_2018_lvl3_142,
clc_2018_lvl3_211, clc_2018_lvl3_212, clc_2018_lvl3_213, clc_2018_lvl3_221, clc_2018_lvl3_222,
clc_2018_lvl3_223, clc_2018_lvl3_231, clc_2018_lvl3_241, clc_2018_lvl3_242, clc_2018_lvl3_243,
clc_2018_lvl3_244, clc_2018_lvl3_311, clc_2018_lvl3_312, clc_2018_lvl3_313, clc_2018_lvl3_321,
clc_2018_lvl3_322, clc_2018_lvl3_323, clc_2018_lvl3_324, clc_2018_lvl3_331, clc_2018_lvl3_332,
clc_2018_lvl3_333, clc_2018_lvl3_334, clc_2018_lvl3_335, clc_2018_lvl3_411, clc_2018_lvl3_412,
clc_2018_lvl3_421, clc_2018_lvl3_422, clc_2018_lvl3_423, clc_2018_lvl3_511, clc_2018_lvl3_512,
clc_2018_lvl3_521, clc_2018_lvl3_522, clc_2018_lvl3_523, clc_2018_lvl3_dom_class, clc_2018_lvl3_na,
cli_aridity_ou, cli_aridity_pe, cli_aridity_pm, cli_assync_ou, cli_assync_pe, cli_assync_pm,
cli_pet_ou_mean, cli_pet_ou_yr, cli_pet_pe_mean, cli_pet_pe_yr, cli_pet_pm_mean, cli_pet_pm_yr,
cli_prec_date_max, cli_prec_dur_high, cli_prec_dur_low, cli_prec_freq_high, cli_prec_freq_low,
cli_prec_intensity, cli_prec_max, cli_prec_mean, cli_prec_mean_yr, cli_prec_season_pet_ou,
cli_prec_season_pet_pe, cli_prec_season_pet_pm, cli_prec_season_temp, cli_prec_timing_high,
cli_prec_timing_low, cli_psol_frac_berghuijs, cli_psol_frac_safran, cli_temp_mean, dam_influence,
dam_n, dam_volume, geo_dom_class, geo_ev, geo_ig, geo_mt, geo_nd, geo_pa, geo_pb, geo_pi, geo_py,
geo_sc, geo_sm, geo_ss, geo_su, geo_va, geo_vb, geo_vi, geo_wb, hgl_krs_karstic,
hgl_krs_not_karstic, hgl_krs_unknown, hgl_permeability, hgl_porosity, hgl_thm_alluvial,
hgl_thm_bedrock, hgl_thm_intense_folded, hgl_thm_sedimentary, hgl_thm_unknown, hgl_thm_volcanism,
hyc_jay_pet_ou, hyc_jay_pet_pe, hyc_jay_pet_pm, hyc_jay_prec_mean, hyc_jay_ratio_prec_pet_ou,
hyc_jay_ratio_prec_pet_pe, hyc_jay_ratio_prec_pet_pm, hyc_jay_ratio_q_prec, hyd_bfi_ladson,
hyd_bfi_lfstat, hyd_bfi_pelletier_pet_ou, hyd_hfd_mean, hyd_q_date_max, hyd_q_date_qmna,
hyd_q_dur_high, hyd_q_dur_low, hyd_q_freq_high, hyd_q_freq_low, hyd_q_freq_zero, hyd_q_max,
hyd_q_mean, hyd_q_mean_yr, hyd_q_qmna_min, hyd_stream_elas, hym_q_anomaly_inrae, hym_q_date_end,
hym_q_date_start, hym_q_low_uncertainty_inrae, hym_q_n_year, hym_q_na_period, hym_q_na_total,
hym_q_questionable, hym_q_unqualified, lat, long, sit_altitude, sit_altitude_datum, sit_area_hydro,
sit_city, sit_code_h3, sit_comment, sit_comment_impact_gene, sit_crs, sit_date_start,
sit_date_update, sit_entity, sit_flood_duration, sit_impact, sit_kp_down, sit_kp_up, sit_label,
sit_label_add, sit_label_usual, sit_latitude, sit_longitude, sit_mnemonic, sit_month1_low_water,
sit_month1_year, sit_publication_rights, sit_section, sit_section_vigilance, sit_status,
sit_test_site, sit_type, sit_type_add, sit_tz, sit_waterbody, sit_watercourse_acc, sit_zone_hydro,
slope_, sta_altitude_snap, sta_altitude_staff_gauge, sta_area_snap, sta_city, sta_code_child,
sta_code_h2, sta_code_parent, sta_comment, sta_comment_impact_local, sta_crs, sta_date_altitude_ref,
sta_date_end, sta_date_start, sta_date_update, sta_display_level, sta_dual_staff_gauge, sta_epsg,
sta_impact_local, sta_kp, sta_label, sta_label_add, sta_main_prod_code, sta_main_prod_name,
sta_main_prod_name_short, sta_monitor, sta_publication_right, sta_purpose, sta_qual_highflow,
sta_qual_lowflow, sta_qual_meanflow, sta_territory, sta_test_station, sta_time_data_gap,
sta_time_discontinuity, sta_type, sta_x_l2e, sta_x_l2e_snap, sta_x_l93, sta_x_l93_snap,
sta_x_w84_snap, sta_y_l2e, sta_y_l2e_snap, sta_y_l93, sta_y_l93_snap, sta_y_w84_snap,
top_altitude_mean, top_dist_outlet_mean, top_drainage_density, top_itopo_mean, top_mor_circ_ratio,
top_mor_compact_coef, top_mor_elong_ratio_catchment, top_mor_elong_ratio_circ,
top_mor_form_factor_horton, top_mor_form_factor_square, top_mor_relief_ratio, top_mor_shape_factor,
top_slo_flat, top_slo_gentle, top_slo_mean, top_slo_moderate, top_slo_ori_e, top_slo_ori_n,
top_slo_ori_ne, top_slo_ori_nw, top_slo_ori_s, top_slo_ori_se, top_slo_ori_sw, top_slo_ori_w,
top_slo_steep, top_slo_strong, top_slo_very_steep
[110]:
# Fetch the table of static features and print its shape as (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(654, 344)
[111]:
# Total number of NaN cells in the whole table, then the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
12253
[111]:
area_km2              7
clc_1990_lvl1_1       0
clc_1990_lvl1_2       0
clc_1990_lvl1_3       0
clc_1990_lvl1_4       0
                     ..
top_slo_ori_sw        0
top_slo_ori_w         0
top_slo_steep         0
top_slo_strong        0
top_slo_very_steep    0
Length: 344, dtype: int64

Find those columns which have at least one NaN value.

[112]:
# Keep only the columns in which at least one entry is NaN.
df.loc[:, df.isna().any()]
[112]:
area_km2 clc_1990_lvl1_dom_class clc_1990_lvl2_dom_class clc_1990_lvl3_dom_class clc_2018_lvl1_dom_class clc_2018_lvl2_dom_class clc_2018_lvl3_dom_class cli_prec_timing_high cli_prec_timing_low hyd_bfi_ladson ... sta_code_h2 sta_code_parent sta_comment sta_comment_impact_local sta_date_altitude_ref sta_date_end sta_display_level sta_kp sta_label_add sta_purpose
A105003001 233.0 2.0 31.0 211.0 2.0 31.0 211.0 jja son 0.56723 ... A1050310 NaN Mise à l'heure TU le 05/11/2009. - Remplacemen... NaN 2022-02-24 08:21:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A107020001 70.0 2.0 21.0 211.0 2.0 21.0 211.0 son son 0.56320 ... A1072010 NaN Nivellement de juillet 2002, géomètre Faber-Sc... NaN 2020-12-14 11:19:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A112020001 129.0 2.0 31.0 211.0 2.0 31.0 211.0 jja son 0.44951 ... A1122010 NaN Arrêt des observations le 10/01/2008. - Nivell... NaN NaN 2008-01-10 11:20:00 NaN NaN NaN Low flow monitoring - Flood forecasting
A116003002 666.0 2.0 21.0 211.0 2.0 21.0 211.0 jja son 0.53010 ... A1080320 NaN Echelle et pont arrachés en mai 1983. Seuil re... NaN 2018-12-05 07:24:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A140202001 7.6 3.0 31.0 311.0 3.0 31.0 311.0 djf son 0.50286 ... A1402020 NaN Passage à l'heure TU le 29/10/2009. - Nivellé ... NaN 2020-12-14 11:20:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Y781000101 129.0 3.0 32.0 333.0 3.0 32.0 333.0 son jja 0.37525 ... Y7804010 NaN Station du réseau de base sur seuil naturel, é... Pompages Manso et Galeria 2021-04-15 08:18:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
Y862000101 331.0 3.0 31.0 311.0 3.0 31.0 311.0 djf jja NaN ... Y8624010 NaN Courbes de tarage à partir du 31/12/1979 revue... NaN NaN NaN NaN NaN NaN Low flow monitoring - Flood forecasting
Y881000102 130.0 3.0 31.0 311.0 3.0 31.0 311.0 djf jja 0.55820 ... Y8814020 NaN NaN NaN NaN 2012-04-30 12:00:00 NaN NaN Zoza ancien Flood forecasting - Streamflow monitoring
Y902000101 147.0 3.0 31.0 312.0 3.0 31.0 312.0 son jja 0.51000 ... Y9025010 NaN NaN Influence forte des barrages de baigneurs en é... NaN NaN NaN NaN Pont de Noceta Low flow monitoring - Flood forecasting - Stre...
Y960000102 99.7 3.0 32.0 323.0 3.0 31.0 313.0 djf jja 0.34639 ... Y9605230 NaN STATION EN REMPLACEMENT DE CELLE DE TAFONATO Y... Pompages amont ? 2017-09-13 09:38:00 NaN NaN NaN Canniciu Streamflow monitoring

654 rows × 57 columns

[113]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[113]:
area_km2                      7
clc_1990_lvl1_dom_class       4
clc_1990_lvl2_dom_class       5
clc_1990_lvl3_dom_class       5
clc_2018_lvl1_dom_class       5
clc_2018_lvl2_dom_class       6
clc_2018_lvl3_dom_class       7
cli_prec_timing_high         15
cli_prec_timing_low           2
hyd_bfi_ladson               42
hyd_bfi_lfstat               42
hyd_bfi_pelletier_pet_ou     42
sit_altitude                 10
sit_altitude_datum           10
sit_area_hydro              641
sit_city                      2
sit_comment                 515
sit_comment_impact_gene     630
sit_crs                       2
sit_date_start              654
sit_date_update               2
sit_entity                    2
sit_flood_duration          654
sit_impact                    6
sit_kp_down                 590
sit_kp_up                   654
sit_label                     2
sit_label_add               464
sit_label_usual             326
sit_latitude                  2
sit_longitude                 2
sit_mnemonic                619
sit_month1_low_water          2
sit_month1_year               2
sit_publication_rights        2
sit_section                   2
sit_section_vigilance       127
sit_status                    2
sit_test_site                 2
sit_type                      2
sit_type_add                  2
sit_tz                        2
sit_waterbody               654
sit_watercourse_acc         632
sit_zone_hydro                2
sta_altitude_staff_gauge    120
sta_code_child              654
sta_code_h2                  13
sta_code_parent             654
sta_comment                 305
sta_comment_impact_local    624
sta_date_altitude_ref       120
sta_date_end                580
sta_display_level           654
sta_kp                      583
sta_label_add               527
sta_purpose                  17
dtype: int64
[114]:
# List the dynamic feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.dynamic_features`` returns is not reordered in
# place before it is passed to ``dataset.fetch`` in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, lwdownrad_wm2, pcp_mm, pcp_mm_solfrac, pet_mm_ou,
pet_mm_pe, pet_mm_pm, q_cms_obs, q_mm_obs, solrad_wm2, spechum_gkg, tsd_swe_isba, tsd_swi_gr,
tsd_swi_isba, tsd_val_c, tsd_val_i, tsd_val_m, tsd_val_q, tsd_val_s, windspeed_mps
[115]:
# Fetch every dynamic feature for "all" stations. Only the second element of
# the returned pair is kept; the first is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# Trailing expression so the notebook displays the dataset repr.
dyn_ds
[115]:
<xarray.Dataset> Size: 2GB
Dimensions:           (time: 18993, dynamic_features: 22)
Coordinates:
  * time              (time) datetime64[ns] 152kB 1970-01-01 ... 2021-12-31
  * dynamic_features  (dynamic_features) <U14 1kB 'airtemp_C_max' ... 'windsp...
Data variables: (12/654)
    A105003001        (time, dynamic_features) float64 3MB ...
    A107020001        (time, dynamic_features) float64 3MB ...
    A112020001        (time, dynamic_features) float64 3MB ...
    A116003002        (time, dynamic_features) float64 3MB ...
    A140202001        (time, dynamic_features) float64 3MB ...
    A202030001        (time, dynamic_features) float64 3MB ...
    ...                ...
    Y661401001        (time, dynamic_features) float64 3MB ...
    Y781000101        (time, dynamic_features) float64 3MB ...
    Y862000101        (time, dynamic_features) float64 3MB ...
    Y881000102        (time, dynamic_features) float64 3MB ...
    Y902000101        (time, dynamic_features) float64 3MB ...
    Y960000102        (time, dynamic_features) float64 3MB ...
[116]:
# Print the total number of NaNs for each dynamic feature.
# NOTE(review): the loop is commented out in this cell. Before re-enabling it,
# check the reduction: summing over ["time", "dynamic_features"] yields one
# value per station variable, not one per dynamic feature -- TODO confirm.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_GB

[117]:
# Create the CAMELS_GB dataset object (data directory: DATA_PATH/CAMELS) and
# print its summary line.
dataset = RainfallRunoff('CAMELS_GB', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_GB with 671 stations, 10 dynamic and 145 static features
[118]:
# List the static feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.static_features`` returns is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, abs_agriculture_perc, abs_amenities_perc, abs_energy_perc, abs_environmental_perc,
abs_industry_perc, abs_watersupply_perc, area_km2, aridity, bankfull_flow, bares_perc,
baseflow_index, baseflow_index_ceh, benchmark_catch, bulkdens, bulkdens_5, bulkdens_50, bulkdens_95,
bulkdens_missing, clay_perc, clay_perc_missing, conductivity_cosby, conductivity_cosby_5,
conductivity_cosby_50, conductivity_cosby_95, conductivity_cosby_missing, conductivity_hypres,
conductivity_hypres_5, conductivity_hypres_50, conductivity_hypres_95, conductivity_hypres_missing,
crop_perc, discharges, dom_land_cover, dpsbar, dwood_perc, elev_10, elev_50, elev_90, elev_max,
elev_mean, elev_min, ewood_perc, flow_perc_complete, flow_period_end, flow_period_start,
frac_high_perc, frac_low_perc, frac_mod_perc, frac_snow, gauge_easting, gauge_elev, gauge_name,
gauge_northing, grass_perc, groundwater_abs, hfd_mean, high_prec_dur, high_prec_freq,
high_prec_timing, high_q_dur, high_q_freq, inter_high_perc, inter_low_perc, inter_mod_perc,
inwater_perc, lat, long, low_nsig_perc, low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur,
low_q_freq, no_gw_perc, nsig_low_perc, num_reservoir, organic_perc, organic_perc_missing, p_mean,
p_seasonality, pet_mean, porosity_cosby, porosity_cosby_5, porosity_cosby_50, porosity_cosby_95,
porosity_cosby_missing, porosity_hypres, porosity_hypres_5, porosity_hypres_50, porosity_hypres_95,
porosity_hypres_missing, q25_uncert_lower, q25_uncert_upper, q50_uncert_lower, q50_uncert_upper,
q5_uncert_lower, q5_uncert_upper, q75_uncert_lower, q75_uncert_upper, q95_uncert_lower,
q95_uncert_upper, q99_uncert_lower, q99_uncert_upper, q_mean, quncert_meta, reservoir_cap,
reservoir_drain, reservoir_env, reservoir_fs, reservoir_he, reservoir_nav, reservoir_nousedata,
reservoir_wr, reservoir_year_first, reservoir_year_last, root_depth, root_depth_5, root_depth_50,
root_depth_95, root_depth_missing, runoff_ratio, sand_perc, sand_perc_missing, shrub_perc,
silt_perc, silt_perc_missing, slope_, soil_depth_pelletier, soil_depth_pelletier_5,
soil_depth_pelletier_50, soil_depth_pelletier_95, soil_depth_pelletier_missing, station_type,
stream_elas, structurefull_flow, surfacewater_abs, tawc, tawc_5, tawc_50, tawc_95, tawc_missing,
urban_perc, zero_q_freq
[119]:
# Fetch the table of static features and print its shape as (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(671, 145)
[120]:
# Total number of NaN cells in the whole table, then the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
10316
[120]:
Q5                        0
Q95                       0
abs_agriculture_perc    313
abs_amenities_perc      313
abs_energy_perc         313
                       ...
tawc_50                   0
tawc_95                   0
tawc_missing              0
urban_perc                0
zero_q_freq               0
Length: 145, dtype: int64

Find those columns which have at least one NaN value.

[121]:
# Keep only the columns in which at least one entry is NaN.
df.loc[:, df.isna().any()]
[121]:
abs_agriculture_perc abs_amenities_perc abs_energy_perc abs_environmental_perc abs_industry_perc abs_watersupply_perc bankfull_flow discharges dpsbar elev_mean ... reservoir_he reservoir_nav reservoir_nousedata reservoir_wr reservoir_year_first reservoir_year_last slope_ station_type structurefull_flow surfacewater_abs
gauge_id
50008 NaN NaN NaN NaN NaN NaN 50.0 0.001 81.5 175.0 ... NaN NaN NaN NaN NaN NaN 4.03 FVVA NaN 0.000
28040 0.00 0.00 0.00 0.00 100.00 0.00 NaN 0.000 68.3 185.0 ... NaN NaN NaN NaN NaN NaN 2.38 C 45.0 0.002
27009 9.47 0.01 34.61 0.01 2.03 53.88 240.0 0.024 69.1 185.0 ... 0.00 0.0 0.00 90.29 1901.0 1936.0 2.78 US NaN 0.123
4006 NaN NaN NaN NaN NaN NaN 80.5 NaN 164.6 351.0 ... NaN NaN NaN NaN NaN NaN 3.58 VA NaN NaN
40033 0.00 0.00 0.00 0.00 0.00 100.00 NaN 0.000 91.9 120.0 ... NaN NaN NaN NaN NaN NaN 1.92 FV NaN 0.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27041 58.32 0.12 14.58 0.00 2.23 24.74 NaN 0.014 76.8 128.0 ... NaN NaN NaN NaN NaN NaN 2.04 C US 74.8 0.047
41022 100.00 0.00 0.00 0.00 0.00 0.00 NaN 0.011 78.6 82.0 ... NaN NaN NaN NaN NaN NaN 3.47 C 41.0 0.002
57009 NaN NaN NaN NaN NaN NaN NaN NaN 81.9 104.0 ... NaN NaN NaN NaN NaN NaN 2.72 FVVA NaN NaN
15012 NaN NaN NaN NaN NaN NaN NaN NaN 166.1 485.0 ... 72.43 0.0 27.57 0.00 1950.0 1962.0 2.44 VA NaN NaN
16004 NaN NaN NaN NaN NaN NaN 130.0 NaN 155.5 274.0 ... 78.01 0.0 0.00 21.99 1958.0 1964.0 2.81 VA NaN NaN

671 rows × 38 columns

[122]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[122]:
abs_agriculture_perc      313
abs_amenities_perc        313
abs_energy_perc           313
abs_environmental_perc    313
abs_industry_perc         313
abs_watersupply_perc      313
bankfull_flow             310
discharges                231
dpsbar                      2
elev_mean                   2
groundwater_abs           229
high_prec_timing           15
low_prec_timing             3
q25_uncert_lower          173
q25_uncert_upper          173
q50_uncert_lower          168
q50_uncert_upper          168
q5_uncert_lower           235
q5_uncert_upper           235
q75_uncert_lower          170
q75_uncert_upper          170
q95_uncert_lower          195
q95_uncert_upper          195
q99_uncert_lower          250
q99_uncert_upper          250
reservoir_drain           509
reservoir_env             509
reservoir_fs              509
reservoir_he              509
reservoir_nav             509
reservoir_nousedata       509
reservoir_wr              509
reservoir_year_first      530
reservoir_year_last       530
slope_                      3
station_type                1
structurefull_flow        408
surfacewater_abs          229
dtype: int64
[123]:
# List the dynamic feature names alphabetically. ``sorted`` builds a new list,
# so whatever list ``dataset.dynamic_features`` returns is not reordered in
# place before it is passed to ``dataset.fetch`` in the next cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_mean, lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_intercep, q_cms_obs, q_mm_obs, rh_%,
solrad_wm2, windspeed_mps
[124]:
# Fetch every dynamic feature for "all" stations. Only the second element of
# the returned pair is kept; the first is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# Trailing expression so the notebook displays the dataset repr.
dyn_ds
[124]:
<xarray.Dataset> Size: 882MB
Dimensions:           (time: 16436, dynamic_features: 10)
Coordinates:
  * time              (time) datetime64[ns] 131kB 1970-10-01 ... 2015-09-30
  * dynamic_features  (dynamic_features) <U15 600B 'pcp_mm' ... 'windspeed_mps'
Data variables: (12/671)
    50008             (time, dynamic_features) float64 1MB ...
    28040             (time, dynamic_features) float64 1MB ...
    27009             (time, dynamic_features) float64 1MB ...
    4006              (time, dynamic_features) float64 1MB ...
    40033             (time, dynamic_features) float64 1MB ...
    77003             (time, dynamic_features) float64 1MB ...
    ...                ...
    22007             (time, dynamic_features) float64 1MB ...
    27041             (time, dynamic_features) float64 1MB ...
    41022             (time, dynamic_features) float64 1MB ...
    57009             (time, dynamic_features) float64 1MB ...
    15012             (time, dynamic_features) float64 1MB ...
    16004             (time, dynamic_features) float64 1MB ...

Print the total number of NaNs for each dynamic feature of CAMELS-GB: `for feat, nans in zip(dyn_ds.dynamic_features.data.tolist(), dyn_ds.to_array().isnull().sum(dim=["time", "variable"]).data.tolist()):`

[125]:
#     print(feat, nans)

CAMELS_IND

[126]:
# Instantiate CAMELS_IND from the 'CAMELS' folder under DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('CAMELS_IND', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_IND with 472 stations, 20 dynamic and 210 static features
[127]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
aet_gleam_mean, ai_mean, annual_max_1day, annual_max_30day, annual_max_3day, annual_max_7day,
annual_max_90day, annual_min_7day, annual_q, area_km2, aridity_p_pet, aridity_pet_aet,
asynchronicity, bare_frac, bfi, built_area_frac, bulkdens_sub_major, bulkdens_sub_mean,
bulkdens_top_major, bulkdense_top_mean, carb_rocks_frac, cen_time, clay_frac_sub, clay_frac_top,
crops_frac, crops_frac_1985, crops_frac_1995, crops_frac_2005, cv_apr_flow, cv_aug_flow,
cv_dec_flow, cv_feb_flow, cv_jan_flow, cv_jul_flow, cv_jun_flow, cv_mar_flow, cv_may_flow,
cv_nov_flow, cv_oct_flow, cv_sep_flow, cwc_river, cwc_site_name, dom_land_cover,
dom_land_cover_frac, doy_max_flow, doy_max_flow_7, doy_min_flow, doy_min_flow_7, drinking_frac,
dspbar, elev_max, elev_mean, elev_median, elev_min, evap_canopy_anum, evap_canopy_max,
evap_canopy_mean, evap_canopy_min, evap_surface_anum, evap_surface_max, evap_surface_mean,
evap_surface_min, fall_days, fall_rate_mean, fall_rate_median, first_dam_year, flood_frac,
flooded_veg_frac, flow_availability, freq_q_high, freq_q_low, gauge_elevation, geol_class_1st,
geol_class_1st_frac, geol_class_2nd, geol_class_2nd_frac, geol_permeability, geol_porosity,
ghi_area, ghi_group, ghi_lat, ghi_lon, ghi_stn_id, gini_flow, gravel_frac_sub, gravel_frac_top,
high_prec_dur, high_prec_freq, high_prec_timing, hsg_major, hydroelec_frac, irrigation_frac,
lai_diff, lai_max, lai_mean, lai_min, last_dam_year, lat, long, low_prec_dur, low_prec_freq,
low_prec_timing, max_high_prec_dur, max_low_prec_dur, mean_anum_flow, mean_apr_flow, mean_atmn_flow,
mean_aug_flow, mean_dec_flow, mean_feb_flow, mean_jan_flow, mean_jul_flow, mean_jun_flow,
mean_mar_flow, mean_may_flow, mean_nov_flow, mean_oct_flow, mean_sep_flow, mean_sumr_flow,
mean_swmn_flow, mean_wint_flow, month_1day_max, month_1day_min, n_dams, navigation_frac, num_dams,
num_hyd_alt, org_carb_sub_major, org_carb_sub_mean, org_carb_top_major, org_carb_top_mean,
organic_frac_sub, organic_frac_top, overflow_frac, p_annual_variability, p_max, p_mean, p_mean_anum,
p_monthly_variability, p_unif, pet_gleam_mean, pet_max, pet_mean, pet_mean_anum, pet_min,
pop_density_2000, pop_density_2005, pop_density_2010, pop_density_2015, pop_density_2020, q_10,
q_25, q_25_swmn, q_50, q_50_swmn, q_5_swmn, q_75, q_75_swmn, q_90, q_95_swmn, q_cv, q_high_days,
q_low_days, q_mean, q_mean_swmn, q_zero, range_frac, rel_hum_mean, res_store_sum, reservoir_index,
rise_days, rise_rate_mean, rise_rate_median, river_basin, runoff_ratio, sand_frac_sub,
sand_frac_top, silt_frac_sub, silt_frac_top, slope_degrees, slope_fdc, slope_max, slope_median,
slope_min, sm_lvl1_mean, sm_lvl2_mean, sm_lvl3_mean, sm_lvl4_mean, soil_awc_sub, soil_awc_top,
soil_awsc_major, soil_awsc_max, soil_awsc_min, soil_conductivity_sub, soil_conductivity_top,
soil_depth, srad_lw_mean, srad_sw_mean, streamflow_elas, tailing_frac, tmax_mean, tmin_mean,
total_storage, trees_frac, urban_frac_1985, urban_frac_1995, urban_frac_2005, water_frac, wind_mean,
wtd
[128]:
# Fetch the static features as a DataFrame and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(472, 210)
[129]:
# Count NaNs per column once; print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
20322
[129]:
aet_gleam_mean        0
ai_mean               0
annual_max_1day     300
annual_max_30day    300
annual_max_3day     300
                   ...
urban_frac_1995       0
urban_frac_2005       0
water_frac            0
wind_mean             0
wtd                   0
Length: 210, dtype: int64

find those columns which have at least one NaN value

[130]:
# Keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[130]:
annual_max_1day annual_max_30day annual_max_3day annual_max_7day annual_max_90day annual_min_7day annual_q bfi bulkdens_sub_major bulkdens_sub_mean ... q_mean_swmn q_zero reservoir_index rise_days rise_rate_mean rise_rate_median runoff_ratio slope_fdc streamflow_elas tailing_frac
gauge_id
3001 NaN NaN NaN NaN NaN NaN NaN NaN 1.33 1.291356 ... 0.568 NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3002 756.807 136.155 485.599 284.301 90.494 0.000 828.584 0.372 1.45 1.450000 ... 4.587 86.667 0.000688 62.00 38.509 3.07 0.472 NaN 3.744 0.000000
3003 NaN NaN NaN NaN NaN NaN NaN NaN 1.21 1.210000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3004 NaN NaN NaN NaN NaN NaN NaN NaN 1.21 1.211649 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3005 14640.524 3780.572 11975.203 8184.747 2305.069 5.293 21163.952 0.385 1.21 1.218816 ... 3.028 0.000 0.507788 131.75 324.324 5.46 0.331 2.859 1.925 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17021 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.291081 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
17022 370.817 98.813 269.619 191.648 60.805 0.060 322.091 0.254 NaN 1.318424 ... 0.005 148.950 1.030649 78.20 12.834 0.43 0.034 NaN 2.049 0.117647
17023 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.211304 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
17024 609.535 154.046 460.329 312.861 81.274 0.000 356.873 0.169 NaN 1.320490 ... 0.001 306.800 0.942509 21.65 55.641 5.30 0.031 NaN 2.977 0.117647
17025 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.330000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

472 rows × 86 columns

[131]:
# NaN count per column, limited to the columns that contain at least one missing value
df.loc[:, df.isna().any()].isna().sum()
[131]:
annual_max_1day     300
annual_max_30day    300
annual_max_3day     300
annual_max_7day     300
annual_max_90day    300
                   ...
rise_rate_median    299
runoff_ratio        244
slope_fdc           331
streamflow_elas     271
tailing_frac         66
Length: 86, dtype: int64
[132]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, airtemp_C_max, airtemp_C_mean, airtemp_C_min, evap_canopy(kg/m2/s),
evap_surface(kg/m2/s), lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_gleam, q_cms_obs, rh_%, sm_lvl1(kg/m2),
sm_lvl2(kg/m2), sm_lvl3(kg/m2), sm_lvl4(kg/m2), solrad_wm2, windspeed_mps, windspeedu_mps,
windspeedv_mps
[133]:
# Call fetch() with "all" and the complete list of dynamic features. The call
# returns a pair; only its second element (displayed below) is kept.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[133]:
<xarray.Dataset> Size: 566MB
Dimensions:           (time: 14976, dynamic_features: 20)
Coordinates:
  * time              (time) datetime64[ns] 120kB 1980-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U21 2kB 'aet_mm_gleam' ... 'windspe...
Data variables: (12/472)
    3001              (time, dynamic_features) float32 1MB ...
    3002              (time, dynamic_features) float32 1MB ...
    3003              (time, dynamic_features) float32 1MB ...
    3004              (time, dynamic_features) float32 1MB ...
    3005              (time, dynamic_features) float32 1MB ...
    3006              (time, dynamic_features) float32 1MB ...
    ...                ...
    17020             (time, dynamic_features) float32 1MB ...
    17021             (time, dynamic_features) float32 1MB ...
    17022             (time, dynamic_features) float32 1MB ...
    17023             (time, dynamic_features) float32 1MB ...
    17024             (time, dynamic_features) float32 1MB ...
    17025             (time, dynamic_features) float32 1MB ...
[134]:
# Print the total number of NaNs for each dynamic feature (disabled).
# NOTE: to_array() stacks the per-station data variables along a new "variable"
# dimension, so the reduction has to run over "time" and "variable" to leave
# one count per dynamic feature.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "variable"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_LUX

[135]:
# Instantiate CAMELS_LUX from the 'CAMELS' folder under DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('CAMELS_LUX', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_LUX with 56 stations, 25 dynamic and 61 static features
[136]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
AI_Oudin, AI_PM, IDPR_MAX, IDPR_MEAN, IDPR_MIN, IDPR_RANGE, IDPR_STD, Kc_Gravelius, PET_Oudin_sum,
PET_PM_sum, Qspec_sum, SLOPE_MAX, SLOPE_MIN, SLOPE_RANGE, SLOPE_STD, Station, TWI_MAX, TWI_MEAN,
TWI_MIN, TWI_RANGE, TWI_STD, VRM_MAX, VRM_MEAN, VRM_MIN, VRM_RANGE, VRM_STD, XLuref, YLuref, Z_MAX,
Z_MIN, Z_RANGE, Z_STD, agency, area_km2, catchment, crop_frac, elev_catch_m, end,
forests_naturalareas, grass_frac, impermeable_formations, lat, limestone_dolomites, long,
marl_claystone, perimeter_km, permeable_formations, prad_sum, pstn_sum, runoffratio,
sandstone_conglomerates, schists_quartzites, slope_degree, start, stream, surface_deposits, t2m_max,
t2m_mean, t2m_min, urban_frac, watercourses_waterbodies_wetlands
[137]:
# Fetch the static features as a DataFrame and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(56, 61)
[138]:
# Count NaNs per column once; print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[138]:
Z_MIN           0
Z_MAX           0
Z_RANGE         0
elev_catch_m    0
Z_STD           0
               ..
start           0
end             0
area_km2        0
perimeter_km    0
Kc_Gravelius    0
Length: 61, dtype: int64

find those columns which have at least one NaN value

[139]:
# Keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[139]:
gauge_id
ID_01
ID_02
ID_03
ID_04
ID_05
ID_06
ID_07
ID_08
ID_09
ID_10
ID_11
ID_12
ID_13
ID_14
ID_15
ID_16
ID_17
ID_18
ID_19
ID_20
ID_21
ID_22
ID_23
ID_24
ID_25
ID_26
ID_27
ID_28
ID_29
ID_30
ID_31
ID_32
ID_33
ID_34
ID_35
ID_36
ID_37
ID_38
ID_39
ID_40
ID_41
ID_42
ID_43
ID_44
ID_45
ID_46
ID_47
ID_48
ID_49
ID_50
ID_51
ID_52
ID_53
ID_54
ID_55
ID_56
[140]:
# NaN count per column, limited to the columns that contain at least one missing value
df.loc[:, df.isna().any()].isna().sum()
[140]:
Series([], dtype: float64)
[141]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
Qflag, RR_flag_rad, RR_max_rad, RR_min_rad, airtemp_C_mean, cape, cin, dls, kx, lls, pcp_mm_era5,
pcp_mm_radar, pcp_mm_station, pet_mm_oudin, pet_mm_pm, q_cms_obs, q_mm_obs, rh_%, sml1, sml2, sml3,
sml4, spechum_gkg, tcwv, windspeed_mps
[142]:
# Call fetch() with "all" and the complete list of dynamic features. The call
# returns a pair; only its second element (displayed below) is kept.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[142]:
<xarray.Dataset> Size: 70MB
Dimensions:           (time: 6209, dynamic_features: 25)
Coordinates:
  * time              (time) datetime64[ns] 50kB 2004-11-01 ... 2021-10-31
  * dynamic_features  (dynamic_features) <U14 1kB 'q_cms_obs' ... 'sml4'
Data variables: (12/56)
    ID_01             (time, dynamic_features) float64 1MB ...
    ID_02             (time, dynamic_features) float64 1MB ...
    ID_03             (time, dynamic_features) float64 1MB ...
    ID_04             (time, dynamic_features) float64 1MB ...
    ID_05             (time, dynamic_features) float64 1MB ...
    ID_06             (time, dynamic_features) float64 1MB ...
    ...                ...
    ID_51             (time, dynamic_features) float64 1MB ...
    ID_52             (time, dynamic_features) float64 1MB ...
    ID_53             (time, dynamic_features) float64 1MB ...
    ID_54             (time, dynamic_features) float64 1MB ...
    ID_55             (time, dynamic_features) float64 1MB ...
    ID_56             (time, dynamic_features) float64 1MB ...
[143]:
# Print the total number of NaNs for each dynamic feature (disabled).
# NOTE: to_array() stacks the per-station data variables along a new "variable"
# dimension, so the reduction has to run over "time" and "variable" to leave
# one count per dynamic feature.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "variable"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_NZ

[144]:
# Instantiate CAMELS_NZ from the 'CAMELS' folder under DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('CAMELS_NZ', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_NZ with 369 stations, 5 dynamic and 40 static features
[145]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Climate Zone, DIST_SEA, DSDamAffected, FlowRegime, Geology, Influence, IsAbstracted, IsDam,
IsEphemeral, IsGWinfluenced, IsRatingOk, IsSnowInfluenced, IsWeir, LANDCOVER, Latitude (WGS 84),
Longitude(WGS 84), Mean Annual Rainfall, RID, Records, Region, SRC_OF_FLW, Station Name,
StationName, Stream_Order, UpStreamLakes, area_km2, elev_gauge_m, lat, long, slope_degrees,
usAnRainVar, usCalc, usDaysRainGT25, usHard, usLake, usLowGrad, usPET, usParticleSize, usRainDays10,
usSteep
[146]:
# Fetch the static features as a DataFrame and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(369, 40)
[147]:
# Count NaNs per column once; print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
17
[147]:
RID                      0
StationName              0
lat                      0
long                     0
usParticleSize           0
usHard                   0
usCalc                   0
Geology                  0
usDaysRainGT25           0
usRainDays10             0
Mean Annual Rainfall     0
usAnRainVar              0
usPET                    0
Climate Zone             0
IsDam                    0
IsWeir                   0
IsAbstracted             0
IsEphemeral              0
IsGWinfluenced           0
IsRatingOk               0
IsSnowInfluenced         0
DSDamAffected            1
FlowRegime               4
Influence                0
Station Name             0
Latitude (WGS 84)        0
Longitude(WGS 84)        0
area_km2                 0
Region                   0
UpStreamLakes           12
usLake                   0
Stream_Order             0
elev_gauge_m             0
usSteep                  0
usLowGrad                0
slope_degrees            0
DIST_SEA                 0
SRC_OF_FLW               0
Records                  0
LANDCOVER                0
dtype: int64

find those columns which have at least one NaN value

[148]:
# Keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[148]:
DSDamAffected FlowRegime UpStreamLakes
Station_ID
802 0.0 ExceptFlood 0.0
1316 0.0 All 0.0
1335 0.0 ExceptFlood NaN
1903 0.0 ExceptFlood 0.0
3506 0.0 ExceptFlood 0.0
... ... ... ...
3043448 0.0 All 0.0
3043449 0.0 All 0.0
3043451 0.0 All 0.0
3043490 1.0 All 0.0
3043491 0.0 All 0.0

369 rows × 3 columns

[149]:
# NaN count per column, limited to the columns that contain at least one missing value
df.loc[:, df.isna().any()].isna().sum()
[149]:
DSDamAffected     1
FlowRegime        4
UpStreamLakes    12
dtype: int64
[150]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_mean, pcp_mm, pet_mm, q_cms_obs, rh_%
[151]:
# Call fetch() with "all" and the complete list of dynamic features. The call
# returns a pair; only its second element (displayed below) is kept.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[151]:
<xarray.Dataset> Size: 142MB
Dimensions:           (time: 19208, dynamic_features: 5)
Coordinates:
  * time              (time) datetime64[ns] 154kB 1972-01-01 ... 2024-08-02
  * dynamic_features  (dynamic_features) <U14 280B 'pet_mm' ... 'q_cms_obs'
Data variables: (12/369)
    802               (time, dynamic_features) float32 384kB ...
    1316              (time, dynamic_features) float32 384kB ...
    1335              (time, dynamic_features) float32 384kB ...
    1903              (time, dynamic_features) float32 384kB ...
    3506              (time, dynamic_features) float32 384kB ...
    3722              (time, dynamic_features) float32 384kB ...
    ...                ...
    3043407           (time, dynamic_features) float32 384kB ...
    3043448           (time, dynamic_features) float32 384kB ...
    3043449           (time, dynamic_features) float32 384kB ...
    3043451           (time, dynamic_features) float32 384kB ...
    3043490           (time, dynamic_features) float32 384kB ...
    3043491           (time, dynamic_features) float32 384kB ...
[152]:
# Print the total number of NaNs for each dynamic feature (disabled).
# NOTE: to_array() stacks the per-station data variables along a new "variable"
# dimension, so the reduction has to run over "time" and "variable" to leave
# one count per dynamic feature.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "variable"]).data.tolist()
#     ):

#     print(feat, nans)

CAMELS_SE

[153]:
# Instantiate CAMELS_SE from the 'CAMELS' folder under DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('CAMELS_SE', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_SE with 50 stations, 4 dynamic and 76 static features
[154]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Agriculture_percentage, Bedrock_percentage_sc, Clayey_till_and_clay_till_percentage_sc, DOR,
Elevation_mabsl, Forest_percentage, Glacier_percentage_sc, Glaciers_percentage,
Glaciofluvial_sediment_percentage_sc, Name, Open_land_percentage, Peat_percentage_sc, Pmean_mm_year,
Postglacial_sand_and_gravel_percentage_sc, RegVol_m3, S01_Qmean_CNP_61_90, S01_Qmean_CNP_91_20,
S01_Qmean_hs, S02_Qcoeff_CNP_61_90, S02_Qcoeff_CNP_91_20, S02_Qcoeff_hs, S03_COM_CNP_61_90,
S03_COM_CNP_91_20, S03_COM_hs, S04_SPD_CNP_61_90, S04_SPD_CNP_91_20, S04_SPD_hs,
S05_Qmean_spring_CNP_61_90, S05_Qmean_spring_CNP_91_20, S05_Qmean_spring_hs,
S06_Qmean_summer_CNP_61_90, S06_Qmean_summer_CNP_91_20, S06_Qmean_summer_hs,
S07_Qmean_autumn_CNP_61_90, S07_Qmean_autumn_CNP_91_20, S07_Qmean_autumn_hs,
S08_Qmean_winter_CNP_61_90, S08_Qmean_winter_CNP_91_20, S08_Qmean_winter_hs, S09_LFfreq_CNP_61_90,
S09_LFfreq_CNP_91_20, S09_LFfreq_hs, S10_T_minQ_d30_CNP_61_90, S10_T_minQ_d30_CNP_91_20,
S10_T_minQ_d30_hs, S11_minQ_d7_CNP_61_90, S11_minQ_d7_CNP_91_20, S11_minQ_d7_hs,
S12_minQ_d30_CNP_61_90, S12_minQ_d30_CNP_91_20, S12_minQ_d30_hs, S13_HFfreq_CNP_61_90,
S13_HFfreq_CNP_91_20, S13_HFfreq_hs, S14_T_maxQ_d1_CNP_61_90, S14_T_maxQ_d1_CNP_91_20,
S14_T_maxQ_d1_hs, S15_maxQ_d30_CNP_61_90, S15_maxQ_d30_CNP_91_20, S15_maxQ_d30_hs,
S16_maxQ_d1_CNP_61_90, S16_maxQ_d1_CNP_91_20, S16_maxQ_d1_hs, Shrubs_and_grassland_percentage,
Silt_percentage_sc, Slope_mean_degree, Till_and_weathered_deposit_percentage_sc, Till_percentage_sc,
Tmean_C, Urban_percentage, Water_percentage, Water_percentage_sc, Wetlands_percentage, area_km2,
lat, long
[155]:
# Fetch the static features as a DataFrame and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(50, 76)
[156]:
# Count NaNs per column once; print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[156]:
Agriculture_percentage                     0
Bedrock_percentage_sc                      0
Clayey_till_and_clay_till_percentage_sc    0
DOR                                        0
Elevation_mabsl                            0
                                          ..
Water_percentage_sc                        0
Wetlands_percentage                        0
area_km2                                   0
lat                                        0
long                                       0
Length: 76, dtype: int64

find those columns which have at least one NaN value

[157]:
# Show the columns that contain at least one NaN, or say that there are none.
# Jupyter only auto-displays the last top-level expression of a cell; a bare
# expression nested inside an ``if`` block is silently discarded, so the frame
# is passed to display() (provided by the IPython kernel) explicitly.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[158]:
# NaN count per column, limited to the columns that contain at least one missing value
df.loc[:, df.isna().any()].isna().sum()
[158]:
Series([], dtype: float64)
[159]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_mean, pcp_mm, q_cms_obs, q_mm_obs

Print the total number of NaNs for each dynamic feature.

[160]:
# Call fetch() with "all" and the complete list of dynamic features. The call
# returns a pair; only its second element (displayed below) is kept.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[160]:
<xarray.Dataset> Size: 18MB
Dimensions:           (time: 21915, dynamic_features: 4)
Coordinates:
  * time              (time) datetime64[ns] 175kB 1961-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U14 224B 'airtemp_C_mean' ... 'q_mm...
Data variables: (12/50)
    5                 (time, dynamic_features) float32 351kB ...
    20                (time, dynamic_features) float32 351kB ...
    37                (time, dynamic_features) float32 351kB ...
    97                (time, dynamic_features) float32 351kB ...
    138               (time, dynamic_features) float32 351kB ...
    186               (time, dynamic_features) float32 351kB ...
    ...                ...
    1740              (time, dynamic_features) float32 351kB ...
    1762              (time, dynamic_features) float32 351kB ...
    1780              (time, dynamic_features) float32 351kB ...
    2020              (time, dynamic_features) float32 351kB ...
    2203              (time, dynamic_features) float32 351kB ...
    20002             (time, dynamic_features) float32 351kB ...
[161]:
# Count NaNs per dynamic feature.
# to_array() stacks the per-station data variables along a new "variable"
# dimension, giving dims (variable, time, dynamic_features). Reducing over
# "time" and "variable" therefore leaves exactly one count per dynamic feature.
# Reducing over "dynamic_features" instead would leave one count per station,
# which would then be paired with feature names by zip().
nan_counts = dyn_ds.to_array().isnull().sum(dim=["time", "variable"])
for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
airtemp_C_mean 0
pcp_mm 0
q_cms_obs 0
q_mm_obs 0

CAMELS_US

[162]:
# Instantiate CAMELS_US from the 'CAMELS' folder under DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('CAMELS_US', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_US with 671 stations, 8 dynamic and 59 static features
[163]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_geospa_fabric, area_km2, aridity, baseflow_index, carbonate_rocks_frac, clay_frac,
dom_land_cover, dom_land_cover_frac, elev_mean, frac_forest, frac_snow, gauge_name, geol_1st_class,
geol_2nd_class, geol_permeability, geol_porostiy, glim_1st_class_frac, glim_2nd_class_frac,
gvf_diff, gvf_max, hfd_mean, high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur,
high_q_freq, huc_02, lai_diff, lai_max, lat, long, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, max_water_content, organic_frac, other_frac, p_mean, p_seasonality, pet_mean,
q5, q95, q_mean, root_depth_50, root_depth_99, runoff_ratio, sand_frac, silt_frac, slope_fdc,
slope_mkm-1, soil_conductivity, soil_depth_pelletier, soil_depth_statsgo, soil_porosity,
stream_elas, water_frac, zero_q_freq
[164]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_min, dayl(s), pcp_mm, q_cms_obs, solrad_wm2, swe_mm, vp_hpa
[165]:
# Call fetch() with "all" and the complete list of dynamic features. The call
# returns a pair; only its second element (displayed below) is kept.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[165]:
<xarray.Dataset> Size: 549MB
Dimensions:           (time: 12784, dynamic_features: 8)
Coordinates:
  * time              (time) datetime64[ns] 102kB 1980-01-01 ... 2014-12-31
  * dynamic_features  (dynamic_features) <U13 416B 'dayl(s)' ... 'q_cms_obs'
Data variables: (12/671)
    09484000          (time, dynamic_features) float64 818kB ...
    09484600          (time, dynamic_features) float64 818kB ...
    09505800          (time, dynamic_features) float64 818kB ...
    09497800          (time, dynamic_features) float64 818kB ...
    09505350          (time, dynamic_features) float64 818kB ...
    09492400          (time, dynamic_features) float64 818kB ...
    ...                ...
    07359610          (time, dynamic_features) float64 818kB ...
    07291000          (time, dynamic_features) float64 818kB ...
    07373000          (time, dynamic_features) float64 818kB ...
    07295000          (time, dynamic_features) float64 818kB ...
    07362587          (time, dynamic_features) float64 818kB ...
    07376000          (time, dynamic_features) float64 818kB ...
[166]:
# Print the total number of NaNs for each dynamic feature (disabled).
# NOTE: to_array() stacks the per-station data variables along a new "variable"
# dimension, so the reduction has to run over "time" and "variable" to leave
# one count per dynamic feature.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "variable"]).data.tolist()
#     ):

#     print(feat, nans)

Caravan_DK

[167]:
# Instantiate Caravan_DK directly from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('Caravan_DK', path=DATA_PATH, verbosity=0)
print(dataset)
Caravan_DK with 308 stations, 39 dynamic and 211 static features
[168]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area_fraction_used_for_aggregation,
area_km2, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj, cmi_ix_s01, cmi_ix_s02,
cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08, cmi_ix_s09, cmi_ix_s10,
cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn, dis_m3_pmx, dis_m3_pyr,
dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj, fmh_cl_smj, for_pc_sse,
frac_snow, gauge_name, gdp_ud_sav, gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02,
glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10,
glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18,
glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93,
high_prec_dur, high_prec_freq, inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lat,
lit_cl_smj, lka_pc_sse, lkv_mc_usu, long, low_prec_dur, low_prec_freq, moisture_index, nli_ix_sav,
p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02, pet_mm_s03, pet_mm_s04, pet_mm_s05,
pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10, pet_mm_s11, pet_mm_s12, pet_mm_syr,
pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04, pnv_pc_s05, pnv_pc_s06, pnv_pc_s07,
pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12, pnv_pc_s13, pnv_pc_s14, pnv_pc_s15,
pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03, pre_mm_s04, pre_mm_s05, pre_mm_s06,
pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11, pre_mm_s12, pre_mm_syr, prm_pc_sse,
pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu, run_mm_syr, seasonality, sgr_dk_sav,
slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02, snw_pc_s03, snw_pc_s04, snw_pc_s05,
snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10, snw_pc_s11, snw_pc_s12, snw_pc_smx,
snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03, swc_pc_s04, swc_pc_s05, swc_pc_s06,
swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11, swc_pc_s12, swc_pc_syr, tbi_cl_smj,
tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04, tmp_dc_s05, tmp_dc_s06, tmp_dc_s07,
tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12, tmp_dc_smn, tmp_dc_smx, tmp_dc_syr,
urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03, wet_pc_s04, wet_pc_s05, wet_pc_s06,
wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[169]:
# Fetch the static features as a DataFrame and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(308, 211)
[170]:
# Count NaNs per column once; print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[170]:
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
aet_mm_s05    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 211, dtype: int64

find those columns which have at least one NaN value

[171]:
# Keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[171]:
520076
310027
600024
420022
250090
...
620017
270002
250092
590008
230087

308 rows × 0 columns

[172]:
# NaN count per column, limited to the columns that contain at least one missing value
df.loc[:, df.isna().any()].isna().sum()
[172]:
Series([], dtype: float64)
[173]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
dewpoint_temperature_2m_max, dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min,
potential_evaporation_sum, q_cms_obs, snow_depth_water_equivalent_max,
snow_depth_water_equivalent_mean, snow_depth_water_equivalent_min, surface_net_solar_radiation_max,
surface_net_solar_radiation_mean, surface_net_solar_radiation_min,
surface_net_thermal_radiation_max, surface_net_thermal_radiation_mean,
surface_net_thermal_radiation_min, surface_pressure_max, surface_pressure_mean,
surface_pressure_min, temperature_2m_max, temperature_2m_mean, temperature_2m_min,
total_precipitation_sum, u_component_of_wind_10m_max, u_component_of_wind_10m_mean,
u_component_of_wind_10m_min, v_component_of_wind_10m_max, v_component_of_wind_10m_mean,
v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max, volumetric_soil_water_layer_1_mean,
volumetric_soil_water_layer_1_min, volumetric_soil_water_layer_2_max,
volumetric_soil_water_layer_2_mean, volumetric_soil_water_layer_2_min,
volumetric_soil_water_layer_3_max, volumetric_soil_water_layer_3_mean,
volumetric_soil_water_layer_3_min, volumetric_soil_water_layer_4_max,
volumetric_soil_water_layer_4_mean, volumetric_soil_water_layer_4_min
[174]:
# Call fetch() with "all" and the complete list of dynamic features. The call
# returns a pair; only its second element (displayed below) is kept.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[174]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 14609, dynamic_features: 39)
Coordinates:
  * time              (time) datetime64[ns] 117kB 1981-01-02 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U34 5kB 'dewpoint_temperature_2m_ma...
Data variables: (12/308)
    520076            (time, dynamic_features) float64 5MB ...
    310027            (time, dynamic_features) float64 5MB ...
    600024            (time, dynamic_features) float64 5MB ...
    420022            (time, dynamic_features) float64 5MB ...
    250090            (time, dynamic_features) float64 5MB ...
    410017            (time, dynamic_features) float64 5MB ...
    ...                ...
    570068            (time, dynamic_features) float64 5MB ...
    620017            (time, dynamic_features) float64 5MB ...
    270002            (time, dynamic_features) float64 5MB ...
    250092            (time, dynamic_features) float64 5MB ...
    590008            (time, dynamic_features) float64 5MB ...
    230087            (time, dynamic_features) float64 5MB ...
[175]:
# Print the total number of NaNs for each dynamic feature (disabled).
# NOTE: to_array() stacks the per-station data variables along a new "variable"
# dimension, so the reduction has to run over "time" and "variable" to leave
# one count per dynamic feature.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "variable"]).data.tolist()
#     ):

#     print(feat, nans)

CCAM

[176]:
# Instantiate CCAM directly from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff('CCAM', path=DATA_PATH, verbosity=0)
print(dataset)
CCAM with 102 stations, 16 dynamic and 124 static features
[177]:
# sorted() builds a new list, so the list object handed out by the dataset
# is not reordered in place as a side effect of printing it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_km2, barren, bdticm, bldfie_sl1, bldfie_sl2, bldfie_sl3, bldfie_sl4, bldfie_sl5, bldfie_sl6,
bldfie_sl7, cecsol_sl1, cecsol_sl2, cecsol_sl3, cecsol_sl4, cecsol_sl5, cecsol_sl6, cecsol_sl7,
circulatory_ratio, clay, closed_shrubland, compactness_coefficient, cropland,
cropland_natural_vegetaion, deciduous_broadleaf_tree, deciduous_needleleaf_tree, elev,
elongation_ratio, ev, evergreen_broadleaf_tree, evergreen_needleleaf_tree, evp_mean, form_factor,
frac_snow_daily, geol_permeability, geol_porosity, grassland, grav, gst_mean, high_prec_dur,
high_prec_freq, high_prec_timing, ig, lai_dif, lai_max, lat, length, length_continuous_runoff,
log_k_s_l1, log_k_s_l2, log_k_s_l3, log_k_s_l4, log_k_s_l5, log_k_s_l6, long, low_prec_dur,
low_prec_freq, low_prec_timing, mixed_forest, mt, nd, ndvi_mean, open_shrubland, orcdrc_sl1,
orcdrc_sl2, orcdrc_sl3, orcdrc_sl4, orcdrc_sl5, orcdrc_sl6, orcdrc_sl7, pa, pb, pdep,
permanent_wetland, pet_mean, phihox_sl1, phihox_sl2, phihox_sl3, phihox_sl4, phihox_sl5, phihox_sl6,
phihox_sl7, pi, pop, pop_dnsty, por, pre_mean, prs_mean, py, rhu_mean, root_depth_50, root_depth_99,
sand, savanna, sc, shape_factor, silt, slope_mkm-1, sm, snow_and_ice, som, ss, ssd_mean, su,
tem_mean, theta_s_l1, theta_s_l2, theta_s_l3, theta_s_l4, theta_s_l5, theta_s_l6, tksatu_l1,
tksatu_l2, tksatu_l3, tksatu_l4, tksatu_l5, tksatu_l6, urban_and_built-up_land, va, vb, vi,
water_bodies, wb, win_mean, woody_savanna
[178]:
# Fetch the static features as a DataFrame and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(102, 124)
[179]:
print(df.isna().sum().sum())
df.isna().sum()
0
[179]:
area_km2         0
barren           0
bdticm           0
bldfie_sl1       0
bldfie_sl2       0
                ..
vi               0
water_bodies     0
wb               0
win_mean         0
woody_savanna    0
Length: 124, dtype: int64

find those columns which have at least one NaN value

[180]:
# Show only the columns that contain at least one NaN. A bare expression nested
# inside an if-branch is not rendered as cell output, so print it explicitly.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    print(nan_columns)
else:
    print('No NaN values')
No NaN values
[181]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[181]:
Series([], dtype: float64)
[182]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, evap_mm, gtemp_C, gtemp_C_max, gtemp_C_min, pcp_mm,
prs_max, prs_mean, prs_min, q_cms_obs, rh_%, ssd_hr, windspeed_mps, windspeed_mps_max

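Print the total number of NaNs for each dynamic feature. This requires fetching the dynamic data first with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`; the loop in the next cell is left commented out.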

[183]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Finland

[184]:
dataset = RainfallRunoff('Finland', path=DATA_PATH, verbosity=0)
print(dataset)
Finland with 669 stations, 10 dynamic and 214 static features

The static features of Finland are the same as those of EStreams.

[185]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[186]:
df = dataset.fetch_static_features()
print(df.shape)
(669, 214)
[187]:
print(df.isna().sum().sum())
df.isna().sum()
10509
[187]:
static_features
area_flag            0
area_km2             0
area_official      126
area_rel           126
aridity            176
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        196
Length: 214, dtype: int64

find those columns which have at least one NaN value

[188]:
# Show only the columns that contain at least one NaN. A bare expression nested
# inside an if-branch is not rendered as cell output, so print it explicitly.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    print(nan_columns)
else:
    print('No NaN values')
[189]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[189]:
static_features
area_official          126
area_rel               126
aridity                176
baseflow_index         199
dam_yr_first           590
                      ...
soil_tawc_p90            1
start_date               6
start_date_climatic    176
start_date_hydro       196
zero_q_freq            196
Length: 109, dtype: int64
[190]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

GRDCCaravan

[191]:
dataset = RainfallRunoff('GRDCCaravan', path=DATA_PATH, verbosity=0)
print(dataset)
GRDCCaravan with 5356 stations, 41 dynamic and 215 static features
[192]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area_fraction_used_for_aggregation,
area_km2, ari_ix_sav, aridity_ERA5_LAND, aridity_FAO_PM, cls_cl_smj, cly_pc_sav, clz_cl_smj,
cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08,
cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn,
dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj,
fmh_cl_smj, for_pc_sse, frac_snow, gauge_name, gdp_ud_sav, gdp_ud_ssu, gla_pc_sse, glc_cl_smj,
glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06, glc_pc_s07, glc_pc_s08,
glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14, glc_pc_s15, glc_pc_s16,
glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22, gwt_cm_sav, hdi_ix_sav,
hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt, inu_pc_smn, inu_pc_smx,
ire_pc_sse, kar_pc_sse, lat, lit_cl_smj, lka_pc_sse, lkv_mc_usu, long, low_prec_dur, low_prec_freq,
moisture_index_ERA5_LAND, moisture_index_FAO_PM, nli_ix_sav, p_mean, pac_pc_sse, pet_mean_ERA5_LAND,
pet_mean_FAO_PM, pet_mm_s01, pet_mm_s02, pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07,
pet_mm_s08, pet_mm_s09, pet_mm_s10, pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01,
pnv_pc_s02, pnv_pc_s03, pnv_pc_s04, pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09,
pnv_pc_s10, pnv_pc_s11, pnv_pc_s12, pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav,
pre_mm_s01, pre_mm_s02, pre_mm_s03, pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08,
pre_mm_s09, pre_mm_s10, pre_mm_s11, pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav,
rev_mc_usu, ria_ha_usu, riv_tc_usu, run_mm_syr, seasonality_ERA5_LAND, seasonality_FAO_PM,
sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02, snw_pc_s03, snw_pc_s04,
snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10, snw_pc_s11, snw_pc_s12,
snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03, swc_pc_s04, swc_pc_s05,
swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11, swc_pc_s12, swc_pc_syr,
tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04, tmp_dc_s05, tmp_dc_s06,
tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12, tmp_dc_smn, tmp_dc_smx,
tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03, wet_pc_s04, wet_pc_s05,
wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[193]:
df = dataset.fetch_static_features()
print(df.shape)
(5356, 215)
[194]:
print(df.isna().sum().sum())
df.isna().sum()
0
[194]:
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
aet_mm_s05    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 215, dtype: int64

find those columns which have at least one NaN value

[195]:
# Show only the columns that contain at least one NaN. A bare expression nested
# inside an if-branch is not rendered as cell output, so print it explicitly.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    print(nan_columns)
else:
    print('No NaN values')
No NaN values
[196]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[196]:
Series([], dtype: float64)
[197]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_mean_2m, dewpoint_temperature_2m_max,
dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min, pcp_mm,
potential_evaporation_sum_ERA5_LAND, potential_evaporation_sum_FAO_PENMAN_MONTEITH, q_cms_obs,
q_mm_obs, snow_depth_water_equivalent_max, snow_depth_water_equivalent_mean,
snow_depth_water_equivalent_min, surface_net_solar_radiation_max, surface_net_solar_radiation_mean,
surface_net_solar_radiation_min, surface_net_thermal_radiation_max,
surface_net_thermal_radiation_mean, surface_net_thermal_radiation_min, surface_pressure_max,
surface_pressure_mean, surface_pressure_min, u_component_of_wind_10m_max,
u_component_of_wind_10m_mean, u_component_of_wind_10m_min, v_component_of_wind_10m_max,
v_component_of_wind_10m_mean, v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max,
volumetric_soil_water_layer_1_mean, volumetric_soil_water_layer_1_min,
volumetric_soil_water_layer_2_max, volumetric_soil_water_layer_2_mean,
volumetric_soil_water_layer_2_min, volumetric_soil_water_layer_3_max,
volumetric_soil_water_layer_3_mean, volumetric_soil_water_layer_3_min,
volumetric_soil_water_layer_4_max, volumetric_soil_water_layer_4_mean,
volumetric_soil_water_layer_4_min

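Print the total number of NaNs for each dynamic feature of GRDCCaravan. This requires fetching the dynamic data first with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`; the loop in the next cell is left commented out.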

[198]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

HYSETS

[199]:
dataset = RainfallRunoff('HYSETS', path=DATA_PATH, verbosity=0)
print(dataset)
HYSETS with 14425 stations, 20 dynamic and 30 static features
[200]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Drainage_Area_GSIM_km2, Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries,
Flag_Land_Use_Extraction, Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction,
Gravelius, Hydrometric_station_latitude, Hydrometric_station_longitude, Land_Use_Crops_frac,
Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac, Land_Use_Snow_Ice_frac,
Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name, Official_ID, Perimeter,
Permeability_logk_m2, Porosity_frac, Source, area_km2, lat, long, slope_degrees
[201]:
df = dataset.fetch_static_features()
print(df.shape)
(14425, 30)
[202]:
print(df.isna().sum().sum())
df.isna().sum()
20179
[202]:
Source                               0
Name                                 0
Official_ID                          0
lat                                  0
long                                 0
area_km2                             0
Drainage_Area_GSIM_km2           13561
Flag_GSIM_boundaries                 0
Flag_Artificial_Boundaries           0
Elevation_m                          6
slope_degrees                        6
Gravelius                         1633
Perimeter                         1633
Flag_Shape_Extraction                0
Aspect_deg                           6
Flag_Terrain_Extraction              0
Land_Use_Forest_frac                13
Land_Use_Grass_frac                 13
Land_Use_Wetland_frac               13
Land_Use_Water_frac                 13
Land_Use_Urban_frac                 13
Land_Use_Shrubs_frac                13
Land_Use_Crops_frac                 13
Land_Use_Snow_Ice_frac              13
Flag_Land_Use_Extraction             0
Permeability_logk_m2              1615
Porosity_frac                     1615
Flag_Subsoil_Extraction              0
Hydrometric_station_latitude         0
Hydrometric_station_longitude        0
dtype: int64

find those columns which have at least one NaN value

[203]:
df.loc[:, (df.isna().sum()>0)]
[203]:
Drainage_Area_GSIM_km2 Elevation_m slope_degrees Gravelius Perimeter Aspect_deg Land_Use_Forest_frac Land_Use_Grass_frac Land_Use_Wetland_frac Land_Use_Water_frac Land_Use_Urban_frac Land_Use_Shrubs_frac Land_Use_Crops_frac Land_Use_Snow_Ice_frac Permeability_logk_m2 Porosity_frac
Watershed_ID
1 NaN 362.3 3.5329 2.7834 1194.505 130.4023 0.7869 0.0147 0.0645 0.0258 0.0089 0.0749 0.0242 0.0 -14.719327 0.180905
2 NaN 353.4 4.6633 2.0656 269.164 91.7329 0.8452 0.0102 0.0228 0.0219 0.0174 0.0410 0.0414 0.0 -14.056491 0.206450
3 2693.814 293.3 4.4690 2.0620 381.994 223.9510 0.8207 0.0093 0.0032 0.0487 0.0230 0.0351 0.0600 0.0 -14.537390 0.165357
4 NaN 276.5 4.1819 2.4682 413.839 120.7400 0.6837 0.0226 0.1024 0.0630 0.0115 0.0641 0.0528 0.0 -14.687869 0.170597
5 NaN 201.8 2.8061 NaN NaN 56.8902 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14421 NaN 1987.9 17.1982 2.0752 208.852 28.9860 0.5356 0.0330 0.0000 0.0000 0.0202 0.0170 0.3941 0.0 -13.160658 0.096755
14422 NaN 769.5 6.5921 1.5715 325.714 110.5607 0.1348 0.3106 0.0025 0.0024 0.0305 0.0300 0.4874 0.0 -12.698509 0.119993
14423 NaN 1883.2 14.7005 2.5953 1621.229 224.3422 0.8674 0.0437 0.0000 0.0026 0.0027 0.0429 0.0408 0.0 -12.976926 0.090284
14424 NaN 1791.2 12.1021 2.4269 1288.932 184.5177 0.7720 0.1524 0.0000 0.0013 0.0029 0.0474 0.0241 0.0 -12.968686 0.094042
14425 NaN 2179.1 5.9444 2.0769 165.762 112.0832 0.1605 0.5639 0.0000 0.0012 0.0091 0.1116 0.1536 0.0 -12.792099 0.168963

14425 rows × 16 columns

[204]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[204]:
Drainage_Area_GSIM_km2    13561
Elevation_m                   6
slope_degrees                 6
Gravelius                  1633
Perimeter                  1633
Aspect_deg                    6
Land_Use_Forest_frac         13
Land_Use_Grass_frac          13
Land_Use_Wetland_frac        13
Land_Use_Water_frac          13
Land_Use_Urban_frac          13
Land_Use_Shrubs_frac         13
Land_Use_Crops_frac          13
Land_Use_Snow_Ice_frac       13
Permeability_logk_m2       1615
Porosity_frac              1615
dtype: int64
[205]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpa, airtemp_C_2m_max, airtemp_C_2m_min, cloudcover, dptemp_C_mean_2m, evap_mm,
evap_mm_snow, lwdownrad_wm2, lwnetrad_wm2, pcp_mm, q_cms_obs, q_mm_obs, snowdensity_kgm3,
snowfall_mm, snowmelt_mm, solrad_wm2, solradnet_wm2, swe_mm, windspeedu_mps, windspeedv_mps

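Print the total number of NaNs for each dynamic feature. This requires fetching the dynamic data first with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`; the loop in the next cell is left commented out.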

[206]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Ireland

[207]:
dataset = RainfallRunoff('Ireland', path=DATA_PATH, verbosity=0)
print(dataset)
Ireland with 464 stations, 10 dynamic and 214 static features

The static features of Ireland are the same as those of EStreams.

[208]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[209]:
df = dataset.fetch_static_features()
print(df.shape)
(464, 214)
[210]:
print(df.isna().sum().sum())
df.isna().sum()
9313
[210]:
static_features
area_flag            0
area_km2             0
area_official       16
area_rel            16
aridity            208
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        208
Length: 214, dtype: int64

find those columns which have at least one NaN value

[211]:
# Show only the columns that contain at least one NaN. A bare expression nested
# inside an if-branch is not rendered as cell output, so print it explicitly.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    print(nan_columns)
else:
    print('No NaN values')
[212]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[212]:
static_features
area_official           16
area_rel                16
aridity                208
baseflow_index         208
bedrk_dep                1
                      ...
soil_tawc_p90            1
start_date             137
start_date_climatic    208
start_date_hydro       204
zero_q_freq            208
Length: 109, dtype: int64
[213]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

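Print the total number of NaNs for each dynamic feature. This requires fetching the dynamic data first with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`; the loop in the next cell is left commented out.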

[214]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Italy

[215]:
dataset = RainfallRunoff('Italy', path=DATA_PATH, verbosity=0)
print(dataset)
Italy with 294 stations, 10 dynamic and 214 static features

The static features of Italy are the same as those of EStreams.

[216]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[217]:
df = dataset.fetch_static_features()
print(df.shape)
(294, 214)
[218]:
print(df.isna().sum().sum())
df.isna().sum()
3695
[218]:
static_features
area_flag            0
area_km2             0
area_official      106
area_rel           106
aridity             46
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq         86
Length: 214, dtype: int64

find those columns which have at least one NaN value

[219]:
# Show only the columns that contain at least one NaN. A bare expression nested
# inside an if-branch is not rendered as cell output, so print it explicitly.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    print(nan_columns)
else:
    print('No NaN values')
[220]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[220]:
static_features
area_official                106
area_rel                     106
aridity                       46
baseflow_index                87
dam_yr_first                 265
dam_yr_last                  265
duplicated_suspect           277
elevation                    219
end_date_climatic             46
end_date_hydro                85
frac_snow                     46
hfd_mean                      98
hfd_std                      105
hp_dur                        46
hp_freq                       46
hp_time                       46
hq_dur                       107
hq_freq                      107
lp_dur                        46
lp_freq                       46
lp_time                       46
lq_dur                       112
lq_freq                      112
num_years_climatic            45
num_years_hydro               45
p_mean                        46
p_seasonality                 46
pet_mean                      46
q_5                           86
q_95                          86
q_elas_Sankarasubramanian     86
q_mean                        86
q_runoff_ratio                86
res_tot_sto                  265
slope_no_unit                 90
start_date_climatic           46
start_date_hydro              85
zero_q_freq                   86
dtype: int64
[221]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Japan

[222]:
dataset = RainfallRunoff('Japan', path=DATA_PATH, verbosity=0)
print(dataset)
Japan with 751 stations, 27 dynamic and 35 static features

The static features of Japan are the same as those of GSHA.

[223]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[224]:
df = dataset.fetch_static_features()
print(df.shape)
(751, 35)
[225]:
print(df.isna().sum().sum())
df.isna().sum()
265
[225]:
EVP_uncertainty(%)      78
HYRIV_ID                 0
LRAD_uncertainty(%)     66
P_uncertainty(%)         0
SRAD_uncertainty(%)      0
T_uncertainty(%)         0
agency                   0
area_km2                 0
cly_pc_uav               0
ele_mt_uav               0
ero_kh_uav               0
gla_pc_use               0
glc_cl_cmj               0
gwt_cm_cav               0
inu_pc_ult               0
lat                      0
lit_cl_cmj               0
long                     0
pet_uncertainty(%)     121
pnv_cl_cmj               0
prm_pc_use               0
sgr_dk_rav               0
slope_degrees            0
slt_pc_uav               0
snd_pc_uav               0
wet_pc_u01               0
wet_pc_u02               0
wet_pc_u03               0
wet_pc_u04               0
wet_pc_u05               0
wet_pc_u06               0
wet_pc_u07               0
wet_pc_u08               0
wet_pc_u09               0
wind_uncertainty(%)      0
dtype: int64

find those columns which have at least one NaN value

[226]:
# Show only the columns that contain at least one NaN. A bare expression nested
# inside an if-branch is not rendered as cell output, so print it explicitly.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    print(nan_columns)
else:
    print('No NaN values')
[227]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[227]:
EVP_uncertainty(%)      78
LRAD_uncertainty(%)     66
pet_uncertainty(%)     121
dtype: int64
[228]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra
[229]:
# _, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

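Print the total number of NaNs for each dynamic feature. This requires `dyn_ds` from the commented-out `dataset.fetch` call in the previous cell:

```python
for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
):
    print(feat, nans)
```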

[230]:
#     print(feat, nans)

LamaHCE

[231]:
dataset = RainfallRunoff('LamaHCE', data_type='total_upstrm', path=DATA_PATH, verbosity=0)
print(dataset)
LamaHCE with 859 stations, 22 dynamic and 84 static features
[232]:
# Build a sorted copy of the static feature names: sorted() leaves whatever
# list `dataset.static_features` returns untouched, so the cell is idempotent.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
agr_fra, area_gov, area_km2, area_ratio, arid_1, arid_2, bare_fra, bedrk_dep, clay_fra, country,
degimpact, diur_art, diur_glac, elev, elev_mean, elev_med, elev_ran, elev_std, elon_ratio, et0_mean,
eta_mean, fedstate, forest_fra, frac_snow, gaps_post, gaps_pre, gc_dom, gc_ig_fra, gc_mt_fra,
gc_pa_fra, gc_pb_fra, gc_pi_fra, gc_py_fra, gc_sc_fra, gc_sm_fra, gc_ss_fra, gc_su_fra, gc_va_fra,
gc_vb_fra, gc_wb_fra, geol_perme, geol_poros, glac_fra, govnr, grav_fra, gvf_diff, gvf_max,
hi_prec_du, hi_prec_fr, hi_prec_ti, lai_diff, lai_max, lake_fra, lat, lc_dom, lo_prec_du,
lo_prec_fr, lo_prec_ti, long, mvert_ang, mvert_dist, name, ndvi_max, ndvi_min, nrs_euhyd, nrs_rivat,
obsbeg_day, obsbeg_hr, obsend, oc_fra, p_mean, p_season, region, river, root_dep, sand_fra,
silt_fra, slope_mkm-1, soil_condu, soil_poros, soil_tawc, strm_dens, typimpact, urban_fra
[233]:
df = dataset.fetch_static_features()
print(df.shape)
(859, 84)
[234]:
print(df.isna().sum().sum())
df.isna().sum()
65
[234]:
agr_fra       0
area_gov      0
area_km2      0
area_ratio    0
arid_1        0
             ..
soil_poros    0
soil_tawc     0
strm_dens     0
typimpact     0
urban_fra     0
Length: 84, dtype: int64

find those columns which have at least one NaN value

[235]:
df.loc[:, (df.isna().sum()>0)]
[235]:
geol_perme hi_prec_ti lo_prec_ti nrs_rivat
ID
87 -12.8 jja djf 20445242.0
233 -12.0 jja djf 20424951.0
18 -11.8 jja djf NaN
577 -14.1 jja djf 20458414.0
839 -13.5 jja djf NaN
... ... ... ... ...
458 -14.4 jja djf 20448210.0
545 -14.1 son djf 20455515.0
558 -12.7 son djf 20462204.0
33 -11.8 NaN djf 20449909.0
223 -13.2 jja djf 20428188.0

859 rows × 4 columns

[236]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[236]:
geol_perme     1
hi_prec_ti    42
lo_prec_ti     3
nrs_rivat     19
dtype: int64
[237]:
# Build a sorted copy of the dynamic feature names: sorted() leaves whatever
# list `dataset.dynamic_features` returns untouched, so the cell is idempotent.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpa, airtemp_C_max, airtemp_C_mean, airtemp_C_min, dptemp_C_max_2m, dptemp_C_mean_2m,
dptemp_C_min_2m, fcst_alb, lai_high_veg, lai_low_veg, pcp_mm, q_cms_obs, solrad_wm2, solrad_wm2_max,
swe_mm, thermrad_wm2, thermrad_wm2_max, total_et, volsw_123, volsw_4, windspeedu_mps, windspeedv_mps
[238]:
# fetch() returns a pair; only the second element (the xarray Dataset holding
# the dynamic features of every station) is used here.
_, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[238]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 14244, dynamic_features: 22)
Coordinates:
  * time              (time) datetime64[ns] 114kB 1981-01-01 ... 2019-12-31
  * dynamic_features  (dynamic_features) <U16 1kB 'airpres_hpa' ... 'windspee...
Data variables: (12/859)
    87                (time, dynamic_features) float32 1MB 808.5 -5.8 ... 0.2
    233               (time, dynamic_features) float32 1MB 917.4 0.8 ... 1.1 0.7
    18                (time, dynamic_features) float32 1MB 934.8 0.0 ... 1.2
    577               (time, dynamic_features) float32 1MB 841.1 -3.2 ... -1.2
    839               (time, dynamic_features) float32 1MB 956.4 3.9 ... -0.9
    526               (time, dynamic_features) float32 1MB 806.0 -6.4 ... -1.4
    ...                ...
    258               (time, dynamic_features) float32 1MB 888.3 -0.8 ... -0.3
    458               (time, dynamic_features) float32 1MB 938.8 3.9 ... -1.0
    545               (time, dynamic_features) float32 1MB 800.5 -6.1 ... -0.8
    558               (time, dynamic_features) float32 1MB 822.0 -6.0 ... -1.0
    33                (time, dynamic_features) float32 1MB 823.8 -5.8 ... 1.2
    223               (time, dynamic_features) float32 1MB 940.4 1.9 ... 1.0 0.7
[239]:
# Print the total number of NaNs for each dynamic feature.
# Left commented out; uncomment to run the check on the `dyn_ds` fetched above.
# NOTE(review): the reduction has to run over "variable" (the dimension that
# xarray's Dataset.to_array() creates from the per-station data variables) and
# "time", so that one count per dynamic feature remains. Reducing over
# ["time", "dynamic_features"] would instead leave one count per station.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

LamaHIce

[240]:
# LamaHIce lives in its own sub-directory of DATA_PATH; the 'total_upstrm'
# data type is requested.
dataset = RainfallRunoff(
    'LamaHIce',
    data_type='total_upstrm',
    path=os.path.join(DATA_PATH, 'LamaHIce_daily'),
    verbosity=0,
)
print(dataset)
LamaHIce with 111 stations, 36 dynamic and 154 static features
[241]:
# List the static feature names alphabetically, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # sorts the list in place
joined_names = ", ".join(static_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
ET_ERA5L_all_basin, ET_ERA5L_unfiltered_basin, ET_rav_all_basin, ET_rav_unfiltered_basin,
PET_ERA5L_all_basin, PET_ERA5L_unfiltered_basin, PET_rav_all_basin, PET_rav_unfiltered_basin,
P_ERA5L_all_basin, P_ERA5L_unfiltered_basin, P_rav_all_basin, P_rav_unfiltered_basin, Q5_basin,
Q5_gauge, Q95_basin, Q95_gauge, Q_all_basin, Q_unfiltered_basin, VHM_no_gauge, V_no_gauge,
agr_fra_basin, area_km2, aridity_ERA5L_basin, aridity_basin, asp_mean_basin, bare_fra_basin,
baseflow_index_ladson_basin, baseflow_index_ladson_gauge, bedrk_dep_basin, clay_fra_basin,
degimpact_basin, degimpact_gauge, elev_mean_basin, elev_med_basin, elev_ran_basin, elev_std_basin,
elevation_gauge, elon_ratio_basin, forest_fra_basin, frac_snow_ERA5L_basin, frac_snow_basin,
g621_fra_basin, g701_fra_basin, g743_fra_basin, g746_fra_basin, g_area_basin, g_aspect_basin,
g_dom_NI_basin, g_frac_basin, g_lat_basin, g_lon_basin, g_max_el_basin, g_mean_el_basin,
g_min_el_basin, g_slope_basin, g_slopel20_basin, gaps_hourly_gauge, gbinn_fra_basin,
gbnew_fra_basin, gbold_fra_basin, gc_23_dom_basin, gc_23_pavr_basin, gc_23_pb_basin,
gc_23_vapy_basin, gc_23_vb_basin, gc_23_vbpy_basin, gc_23_vbsr_basin, gc_dom_basin, gc_pa_fra_basin,
gc_pb_fra_basin, gc_va_fra_basin, gc_vb_fra_basin, geometry_gauge, ggnew_fra_basin, ggold_fra_basin,
ghraun_fra_basin, glac_fra_basin, gmob_fra_basin, grav_fra_basin, gsgos_fra_basin, gsinn_fra_basin,
gsn_fra_basin, gsnew_fra_basin, gsold_fra_basin, gvf_diff_basin, gvf_max_basin, hfd_mean_basin,
hfd_mean_gauge, high_prec_du_ERA5L_basin, high_prec_du_basin, high_prec_fr_ERA5L_basin,
high_prec_fr_basin, high_prec_timing_ERA5L_basin, high_prec_timing_basin, high_q_dur_basin,
high_q_dur_gauge, high_q_freq_basin, high_q_freq_gauge, lai_diff_basin, lai_max_basin,
lake_fra_basin, lat, lc_dom_basin, lo_prec_fr_ERA5L_basin, lo_prec_fr_basin, long,
low_prec_du_ERA5L_basin, low_prec_du_basin, low_prec_timing_ERA5L_basin, low_prec_timing_basin,
low_q_dur_basin, low_q_dur_gauge, low_q_freq_basin, low_q_freq_gauge, mvert_ang_basin,
mvert_dist_basin, name_gauge, ndvi_max_basin, ndvi_min_basin, obsbeg_day_gauge, obsbeg_hr_gauge,
obsend_day_gauge, obsend_hr_gauge, oc_fra_basin, p_mean_ERA5L_basin, p_mean_basin,
p_season_ERA5L_basin, p_season_basin, pet_mean_ERA5L_basin, q_mean_basin, q_mean_gauge,
ref_et_mean_basin, river_gauge, root_dep_basin, runoff_ratio_basin, runoff_ratio_gauge,
sand_fra_basin, scrub_fra_basin, silt_fra_basin, slope_fdc_basin, slope_fdc_gauge, slope_mkm-1,
soil_poros_basin, soil_tawc_basin, stream_elas_basin, stream_elas_gauge, strm_dens_basin,
typimpact_basin, typimpact_gauge, urban_fra_basin, water_year_all_basin,
water_year_unfiltered_basin, wetl_fra_basin, zero_q_freq_gauge
[242]:
# Load the static attribute table and report its (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(111, 154)
[243]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2013
[243]:
ET_ERA5L_all_basin             37
ET_ERA5L_unfiltered_basin      14
ET_rav_all_basin               37
ET_rav_unfiltered_basin        14
PET_ERA5L_all_basin            37
                               ..
urban_fra_basin                 0
water_year_all_basin           37
water_year_unfiltered_basin    14
wetl_fra_basin                  0
zero_q_freq_gauge              37
Length: 154, dtype: int64

Find the columns which have at least one NaN value.

[244]:
# Keep only the columns in which at least one value is NaN.
df.loc[:, df.isna().any()]
[244]:
ET_ERA5L_all_basin ET_ERA5L_unfiltered_basin ET_rav_all_basin ET_rav_unfiltered_basin PET_ERA5L_all_basin PET_ERA5L_unfiltered_basin PET_rav_all_basin PET_rav_unfiltered_basin P_ERA5L_all_basin P_ERA5L_unfiltered_basin ... q_mean_gauge runoff_ratio_basin runoff_ratio_gauge slope_fdc_basin slope_fdc_gauge stream_elas_basin stream_elas_gauge water_year_all_basin water_year_unfiltered_basin zero_q_freq_gauge
id
87 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
18 0.432785 0.388629 0.444658 0.432533 0.653543 0.573865 0.586086 0.596251 2.914548 3.044353 ... 2.675433 0.930 0.929598 0.812 0.811730 0.260 0.260336 2000.666667 2003.116037 0.0
73 0.383039 0.391195 0.379611 0.374843 0.602475 0.601518 0.486505 0.461007 3.228554 3.265193 ... 3.378130 0.860 0.859913 0.912 0.912329 0.875 0.875205 1999.404710 2007.859574 0.0
10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
83 0.463385 0.439377 0.500961 0.484539 0.674688 0.632416 0.656113 0.642609 3.177704 3.108829 ... 2.671721 1.063 1.063444 0.243 0.242773 0.416 0.415684 2007.917599 2001.000664 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
79 0.640192 0.636492 0.634929 0.633861 1.295533 1.293016 0.547516 0.557275 4.648427 4.631319 ... 9.389204 1.786 1.785626 0.404 0.404174 0.507 0.506978 2000.272524 1999.000666 0.0
60 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
69 NaN 0.491489 NaN 0.444122 NaN 0.780529 NaN 0.633352 NaN 2.756657 ... NaN NaN NaN NaN NaN NaN NaN NaN 2002.963139 NaN
38 0.338613 0.340275 0.352270 0.353195 0.567987 0.569779 0.307618 0.316879 6.729521 6.706454 ... 13.174330 1.259 1.258967 0.917 0.916913 0.407 0.406681 2000.143556 2000.499638 0.0
33 NaN 0.376920 NaN 0.500398 NaN 0.571928 NaN 0.600782 NaN 3.321827 ... NaN NaN NaN NaN NaN NaN NaN NaN 2001.251614 NaN

111 rows × 53 columns

[245]:
(
    df.loc[:, df.isna().any()]  # keep only the columns containing a NaN
    .isna()
    .sum()                      # NaN count for each remaining column
)
[245]:
ET_ERA5L_all_basin             37
ET_ERA5L_unfiltered_basin      14
ET_rav_all_basin               37
ET_rav_unfiltered_basin        14
PET_ERA5L_all_basin            37
PET_ERA5L_unfiltered_basin     14
PET_rav_all_basin              37
PET_rav_unfiltered_basin       14
P_ERA5L_all_basin              37
P_ERA5L_unfiltered_basin       14
P_rav_all_basin                37
P_rav_unfiltered_basin         14
Q5_basin                       37
Q5_gauge                       37
Q95_basin                      37
Q95_gauge                      37
Q_all_basin                    37
Q_unfiltered_basin             14
baseflow_index_ladson_basin    37
baseflow_index_ladson_gauge    37
g_aspect_basin                 47
g_lat_basin                    47
g_lon_basin                    47
g_max_el_basin                 47
g_mean_el_basin                47
g_min_el_basin                 47
g_slope_basin                  47
g_slopel20_basin               47
gaps_hourly_gauge              35
hfd_mean_basin                 42
hfd_mean_gauge                 42
high_prec_timing_basin          4
high_q_dur_basin               67
high_q_dur_gauge               67
high_q_freq_basin              67
high_q_freq_gauge              67
low_prec_timing_ERA5L_basin     2
low_prec_timing_basin           1
low_q_dur_basin                70
low_q_dur_gauge                70
low_q_freq_basin               70
low_q_freq_gauge               70
q_mean_basin                   37
q_mean_gauge                   37
runoff_ratio_basin             37
runoff_ratio_gauge             37
slope_fdc_basin                37
slope_fdc_gauge                37
stream_elas_basin              37
stream_elas_gauge              37
water_year_all_basin           37
water_year_unfiltered_basin    14
zero_q_freq_gauge              37
dtype: int64
[246]:
# List the dynamic feature names alphabetically, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the list in place
joined_names = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
10m_wind_u, 10m_wind_u_rav, 10m_wind_v, 10m_wind_v_rav, 2m_dp_temp_max, 2m_dp_temp_mean,
2m_dp_temp_min, 2m_qv_rav, 2m_temp_rav, airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_mean_2m,
fcst_alb, grdflx_rav, lai_high_veg, lai_low_veg, pcp_mm, pet_mm, prec_carra, prec_rav, q_cms_obs,
ref_et_mm, surf_dwn_solar_rad_rav, surf_dwn_therm_rad_rav, surf_net_solar_rad_max,
surf_net_solar_rad_mean, surf_net_therm_rad_max, surf_net_therm_rad_mean, surf_outg_therm_rad_rav,
surf_press, surf_press_rav, swe, total_et, total_et_rav, volsw_123, volsw_4

Print the total number of NaNs for each dynamic feature. This first requires fetching the data: `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[247]:
# Left commented out: prints the total number of NaNs for each dynamic feature.
# NOTE: `dyn_ds` is not fetched for this dataset in the preceding code cells;
# run dataset.fetch("all", dynamic_features=dataset.dynamic_features) first,
# otherwise a `dyn_ds` left over from an earlier dataset would be analysed.
# NOTE(review): the reduction has to run over "variable" (the dimension that
# xarray's Dataset.to_array() creates from the per-station data variables) and
# "time", so that one count per dynamic feature remains. Reducing over
# ["time", "dynamic_features"] would instead leave one count per station.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

Poland

[248]:
# Open the Poland dataset from the shared download directory, without log output.
dataset = RainfallRunoff(
    'Poland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Poland with 1287 stations, 10 dynamic and 214 static features

The static features of Poland are the same as those of EStreams.

[249]:
# List the static feature names alphabetically, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # sorts the list in place
joined_names = ", ".join(static_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[250]:
# Load the static attribute table and report its (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(1287, 214)
[251]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
15798
[251]:
static_features
area_flag            0
area_km2             0
area_official        6
area_rel             6
aridity            270
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        270
Length: 214, dtype: int64

Find the columns which have at least one NaN value.

[252]:
# Show only the columns that contain at least one NaN value.
# The frame is passed to display() explicitly: Jupyter auto-renders only the
# last top-level expression of a cell, so a bare expression inside an `if`
# branch is evaluated and then silently discarded (the cell showed no output).
nan_column_mask = df.isna().any()
if nan_column_mask.any():
    display(df.loc[:, nan_column_mask])
else:
    print('No NaN values')
[253]:
(
    df.loc[:, df.isna().any()]  # keep only the columns containing a NaN
    .isna()
    .sum()                      # NaN count for each remaining column
)
[253]:
static_features
area_official                   6
area_rel                        6
aridity                       270
baseflow_index                270
dam_yr_first                 1099
dam_yr_last                  1099
duplicated_suspect           1276
elevation                    1287
end_date                      210
end_date_climatic             270
end_date_hydro                270
frac_snow                     270
hfd_mean                      277
hfd_std                       283
hp_dur                        270
hp_freq                       270
hp_time                       270
hq_dur                        485
hq_freq                       485
lp_dur                        270
lp_freq                       270
lp_time                       270
lq_dur                        507
lq_freq                       507
num_days_gaps                 210
num_years_climatic            270
num_years_hydro               270
p_mean                        270
p_seasonality                 270
pet_mean                      270
q_5                           270
q_95                          270
q_elas_Sankarasubramanian     270
q_mean                        270
q_runoff_ratio                270
res_tot_sto                  1101
slope_no_unit                 270
start_date                    210
start_date_climatic           270
start_date_hydro              270
zero_q_freq                   270
dtype: int64
[254]:
# List the dynamic feature names alphabetically, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the list in place
joined_names = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Portugal

[255]:
# Open the Portugal dataset from the shared download directory, without log output.
dataset = RainfallRunoff(
    'Portugal',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Portugal with 280 stations, 10 dynamic and 214 static features

The static features of Portugal are the same as those of EStreams.

[256]:
# List the static feature names alphabetically, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # sorts the list in place
joined_names = ", ".join(static_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[257]:
# Load the static attribute table and report its (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(280, 214)
[258]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2172
[258]:
static_features
area_flag           0
area_km2            0
area_official      25
area_rel           25
aridity            33
                   ..
steep_area_fra      0
strm_dens           0
tot_area            0
watershed_group     0
zero_q_freq        34
Length: 214, dtype: int64

Find the columns which have at least one NaN value.

[259]:
# Show only the columns that contain at least one NaN value.
# The frame is passed to display() explicitly: Jupyter auto-renders only the
# last top-level expression of a cell, so a bare expression inside an `if`
# branch is evaluated and then silently discarded (the cell showed no output).
nan_column_mask = df.isna().any()
if nan_column_mask.any():
    display(df.loc[:, nan_column_mask])
else:
    print('No NaN values')
[260]:
(
    df.loc[:, df.isna().any()]  # keep only the columns containing a NaN
    .isna()
    .sum()                      # NaN count for each remaining column
)
[260]:
static_features
area_official                 25
area_rel                      25
aridity                       33
baseflow_index                94
dam_yr_first                 221
dam_yr_last                  221
duplicated_suspect           280
end_date                       1
end_date_climatic             33
end_date_hydro                34
frac_snow                     33
hfd_mean                      38
hfd_std                       41
hp_dur                        33
hp_freq                       33
hp_time                       33
hq_dur                        36
hq_freq                       36
lp_dur                        33
lp_freq                       33
lp_time                       33
lq_dur                        36
lq_freq                       36
num_days_gaps                  1
num_years_climatic            33
num_years_hydro               33
p_mean                        33
p_seasonality                 33
pet_mean                      33
q_5                           34
q_95                          34
q_elas_Sankarasubramanian     34
q_mean                        34
q_runoff_ratio                34
res_tot_sto                  221
slope_no_unit                 92
start_date                     1
start_date_climatic           33
start_date_hydro              34
zero_q_freq                   34
dtype: int64
[261]:
# List the dynamic feature names alphabetically, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the list in place
joined_names = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Print the total number of NaNs for each dynamic feature. This first requires fetching the data: `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[262]:
# Left commented out: prints the total number of NaNs for each dynamic feature.
# NOTE: `dyn_ds` is not fetched for this dataset in the preceding code cells;
# run dataset.fetch("all", dynamic_features=dataset.dynamic_features) first,
# otherwise a `dyn_ds` left over from an earlier dataset would be analysed.
# NOTE(review): the reduction has to run over "variable" (the dimension that
# xarray's Dataset.to_array() creates from the per-station data variables) and
# "time", so that one count per dynamic feature remains. Reducing over
# ["time", "dynamic_features"] would instead leave one count per station.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

Simbi

[263]:
# Open the Simbi dataset from the shared download directory, without log output.
dataset = RainfallRunoff(
    'Simbi',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Simbi with 24 stations, 3 dynamic and 232 static features
/home/atr/AquaFetch/aqua_fetch/rr/_simbi.py:318: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[264]:
# List the static feature names alphabetically, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # sorts the list in place
joined_names = ", ".join(static_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
Alluvial aquifers with free water, Alluvial aquifers with partly confined water, Alluvium & detrital
materials_geol, Andesites & rhyodacites_geol, Aridity_mon_arid, BFI1_d, BFI2_d, BFI3_d, BFI_d,
Basalt_geol, Beaches & dunes_lc_98, Carb_Rocks_Perc, Carbonate aquifers with marl intercalation,
Closed Shrubland_lc_95, Continuous urban_lc_98, Cropland_lc_95, Crystalline formation,
Cumul_Freq_1%, Cumul_Freq_10%, Cumul_Freq_100%, Cumul_Freq_11%, Cumul_Freq_12%, Cumul_Freq_13%,
Cumul_Freq_14%, Cumul_Freq_15%, Cumul_Freq_16%, Cumul_Freq_17%, Cumul_Freq_18%, Cumul_Freq_19%,
Cumul_Freq_2%, Cumul_Freq_20%, Cumul_Freq_21%, Cumul_Freq_22%, Cumul_Freq_23%, Cumul_Freq_24%,
Cumul_Freq_25%, Cumul_Freq_26%, Cumul_Freq_27%, Cumul_Freq_28%, Cumul_Freq_29%, Cumul_Freq_3%,
Cumul_Freq_30%, Cumul_Freq_31%, Cumul_Freq_32%, Cumul_Freq_33%, Cumul_Freq_34%, Cumul_Freq_35%,
Cumul_Freq_36%, Cumul_Freq_37%, Cumul_Freq_38%, Cumul_Freq_39%, Cumul_Freq_4%, Cumul_Freq_40%,
Cumul_Freq_41%, Cumul_Freq_42%, Cumul_Freq_43%, Cumul_Freq_44%, Cumul_Freq_45%, Cumul_Freq_46%,
Cumul_Freq_47%, Cumul_Freq_48%, Cumul_Freq_49%, Cumul_Freq_5%, Cumul_Freq_50%, Cumul_Freq_51%,
Cumul_Freq_52%, Cumul_Freq_53%, Cumul_Freq_54%, Cumul_Freq_55%, Cumul_Freq_56%, Cumul_Freq_57%,
Cumul_Freq_58%, Cumul_Freq_59%, Cumul_Freq_6%, Cumul_Freq_60%, Cumul_Freq_61%, Cumul_Freq_62%,
Cumul_Freq_63%, Cumul_Freq_64%, Cumul_Freq_65%, Cumul_Freq_66%, Cumul_Freq_67%, Cumul_Freq_68%,
Cumul_Freq_69%, Cumul_Freq_7%, Cumul_Freq_70%, Cumul_Freq_71%, Cumul_Freq_72%, Cumul_Freq_73%,
Cumul_Freq_74%, Cumul_Freq_75%, Cumul_Freq_76%, Cumul_Freq_77%, Cumul_Freq_78%, Cumul_Freq_79%,
Cumul_Freq_8%, Cumul_Freq_80%, Cumul_Freq_81%, Cumul_Freq_82%, Cumul_Freq_83%, Cumul_Freq_84%,
Cumul_Freq_85%, Cumul_Freq_86%, Cumul_Freq_87%, Cumul_Freq_88%, Cumul_Freq_89%, Cumul_Freq_9%,
Cumul_Freq_90%, Cumul_Freq_91%, Cumul_Freq_92%, Cumul_Freq_93%, Cumul_Freq_94%, Cumul_Freq_95%,
Cumul_Freq_96%, Cumul_Freq_97%, Cumul_Freq_98%, Cumul_Freq_99%, Deciduous Broadleaf Forest_lc_95,
Deciduous Needleleaf Forest_lc_95, Dense agricultural crops_lc_98, Dense agroforestry systems_lc_98,
Diorite & tonalite_geol, Discontinuous urban_lc_98, Dominant pastures_lc_98, ETP_5_mon_q5,
ETP_95_mon_q95, ETP_mon_avg, Evergreen Broadleaf Forest_lc_95, Evergreen Needleleaf Forest_lc_95,
Fissured & partitioned carbonate aquifers, Flysch & sandstone & limestone_geol, Forest_lc_98,
Grassland_lc_95, Gravelius, Hard limestone_geol, Highly permeable fissured & porous carbonate
aquifers, Industrial areas_lc_98, Karst aquifer, Lat_Exu, Lon_Exu, Low permeability sedimentary
formation, Magma_Perc, Mangroves_lc_98, Marl & marly limestone_geol, Marl & sand_geol, Marly
limestone_geol, Max_Elv, Medium-density agricultural crops_lc_98, Min_Elv, Mixed Forest_lc_95, More
productive alluvial area, Open Shrubland_lc_95, P_5_mon_q5, P_95_mon_q95, P_max10_mon_QMXA10,
P_min5_mon_QMNA5, P_mon_avg, Pasture with other presence_lc_98, Ports & airports_lc_98, Q1_5_mon_q5,
Q1_95_mon_q95, Q1_max10_mon_QMXA10, Q1_min5_mon_QMNA5, Q1_mm_d_hq_dur, Q1_mm_d_hq_freq,
Q1_mm_d_lq_dur, Q1_mm_d_lq_freq, Q1_mm_d_mean, Q1_mm_d_q5, Q1_mm_d_q95, Q1_mon_avg, Q2_5_mon_q5,
Q2_95_mon_q95, Q2_max10_mon_QMXA10, Q2_min5_mon_QMNA5, Q2_mm_d_hq_dur, Q2_mm_d_hq_freq,
Q2_mm_d_lq_dur, Q2_mm_d_lq_freq, Q2_mm_d_mean, Q2_mm_d_q5, Q2_mm_d_q95, Q2_mon_avg, Q3_5_mon_q5,
Q3_95_mon_q95, Q3_max10_mon_QMXA10, Q3_min5_mon_QMNA5, Q3_mm_d_hq_dur, Q3_mm_d_hq_freq,
Q3_mm_d_lq_dur, Q3_mm_d_lq_freq, Q3_mm_d_mean, Q3_mm_d_q5, Q3_mm_d_q95, Q3_mon_avg, Q_5_mon_q5,
Q_95_mon_q95, Q_max10_mon_QMXA10, Q_min5_mon_QMNA5, Q_mm_d_hq_dur, Q_mm_d_hq_freq, Q_mm_d_lq_dur,
Q_mm_d_lq_freq, Q_mm_d_mean, Q_mm_d_q5, Q_mm_d_q95, Q_mon_avg, Quarry_lc_98, River beds & recent
alluvium_lc_98, Rock outcrops & bare soil_lc_98, Runoff_Ratio_mon_arid, Saline areas_lc_98,
Savannahs with other presence_lc_98, Savannahs_lc_98, Sd_Elv, Sedim_Perc, Stream_density,
Temp_5_mon_q5, Temp_95_mon_q95, Temp_mon_avg, Ultrabasic rocks_geol, Urban_lc_95, Volcano-
sedimentary rock_geol, Water plan_lc_98, Water_lc_95, Wetlands_lc_98, Wooded Grassland_lc_95,
Woodland_lc_95, area_km2, lat, long, slope_degrees
[265]:
# Load the static attribute table and report its (rows, columns) size.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(24, 232)
/home/atr/AquaFetch/aqua_fetch/rr/_simbi.py:318: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[266]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
96
[266]:
Alluvial aquifers with free water               0
Alluvial aquifers with partly confined water    0
Alluvium & detrital materials_geol              0
Andesites & rhyodacites_geol                    0
Aridity_mon_arid                                0
                                               ..
Woodland_lc_95                                  0
area_km2                                        0
lat                                             0
long                                            0
slope_degrees                                   0
Length: 232, dtype: int64

Find the columns which have at least one NaN value.

[267]:
# Keep only the columns in which at least one value is NaN.
df.loc[:, df.isna().any()]
[267]:
BFI1_d BFI2_d BFI3_d BFI_d Q1_mm_d_hq_dur Q1_mm_d_hq_freq Q1_mm_d_lq_dur Q1_mm_d_lq_freq Q1_mm_d_mean Q1_mm_d_q5 ... Q3_mm_d_mean Q3_mm_d_q5 Q3_mm_d_q95 Q_mm_d_hq_dur Q_mm_d_hq_freq Q_mm_d_lq_dur Q_mm_d_lq_freq Q_mm_d_mean Q_mm_d_q5 Q_mm_d_q95
001 0.46 0.68 0.55 0.49 2.12 2.43 27.62 34.19 1.16 0.2 ... 1.18 0.2 3.10 1.62 0.86 0.00 0.00 1.23 0.4 3.10
004 0.59 0.38 0.42 0.38 1.98 5.38 0.00 0.00 2.26 0.6 ... 2.23 0.3 8.15 2.00 5.00 0.00 0.00 1.97 0.6 4.80
006 0.47 0.66 0.61 0.50 1.90 5.52 0.00 0.00 1.50 0.5 ... 1.36 0.4 3.50 1.89 1.00 4.00 0.80 1.35 0.4 3.60
007 0.49 0.53 0.50 0.47 2.65 5.05 19.03 29.00 1.91 0.3 ... 2.04 0.5 5.60 2.33 2.50 5.51 16.86 2.08 0.5 5.50
008 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
010 0.29 0.21 0.31 0.32 6.08 20.86 23.73 117.52 2.10 0.2 ... 2.19 0.1 7.10 2.85 8.23 9.95 35.23 2.61 0.4 6.70
023 0.16 0.34 0.29 0.20 3.65 20.14 18.33 89.05 1.87 0.2 ... 1.81 0.2 6.70 1.91 7.79 5.34 24.79 1.98 0.3 5.58
024 0.38 0.42 0.39 0.42 3.38 15.14 0.00 0.00 1.37 0.3 ... 1.38 0.4 4.50 1.75 2.75 18.19 109.00 1.13 0.1 3.70
029 0.38 0.28 0.33 0.39 2.43 6.95 34.18 61.86 2.31 0.2 ... 2.13 0.1 6.30 1.56 2.71 0.00 0.00 2.30 0.7 4.50
036 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
037 0.24 0.47 0.34 0.41 9.02 36.10 83.00 158.10 0.61 0.0 ... 0.71 0.0 2.90 1.33 0.38 12.67 62.08 1.03 0.2 2.90
041 0.39 0.61 0.52 0.39 6.14 14.90 130.76 105.86 0.89 0.0 ... 0.93 0.0 3.20 0.00 0.00 14.83 57.07 1.05 0.2 3.00
044 0.32 0.28 0.36 0.25 7.22 40.24 70.33 160.76 1.05 0.0 ... 0.89 0.0 4.10 1.38 9.86 12.38 15.57 1.36 0.2 4.46
045 0.52 0.39 0.44 0.42 2.95 2.95 0.00 0.00 0.44 0.1 ... 0.43 0.1 1.10 1.60 0.17 5.64 13.17 0.36 0.1 1.00
051 0.23 0.21 0.13 0.18 3.62 13.10 25.48 110.43 1.06 0.1 ... 1.61 0.1 6.00 2.27 12.00 7.72 49.25 1.67 0.2 4.50
052 0.49 0.62 0.58 0.29 1.90 2.71 37.16 97.33 2.66 0.2 ... 2.88 0.8 8.70 3.11 33.50 28.00 160.33 2.39 0.0 9.96
053 0.31 0.10 0.11 0.12 2.30 12.62 11.45 44.71 2.66 0.4 ... 2.72 0.0 15.60 1.89 14.33 7.15 79.33 2.41 0.2 6.90
056 0.22 0.49 0.47 0.15 3.49 11.29 18.35 70.76 1.21 0.1 ... 1.28 0.2 4.30 1.78 10.55 8.69 81.64 1.20 0.1 4.30
057 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
058 0.29 0.27 0.06 0.32 3.71 38.29 81.74 163.48 1.83 0.1 ... 1.50 0.0 6.80 2.32 20.00 20.33 86.50 0.96 0.1 2.80
060 0.44 0.53 0.41 0.26 3.03 9.67 29.38 55.95 1.53 0.3 ... 1.48 0.4 4.50 2.17 9.67 7.76 32.00 1.48 0.2 4.80
061 0.33 0.49 0.41 0.30 2.41 12.71 16.44 54.81 2.69 0.4 ... 2.87 0.5 9.20 1.85 6.92 4.80 19.00 2.86 0.6 8.90
065 0.28 0.34 0.31 0.25 2.39 17.52 28.92 50.95 1.74 0.3 ... 1.72 0.3 6.20 2.30 11.75 13.11 73.31 1.74 0.2 6.00
068 0.55 0.55 0.55 0.52 2.67 3.05 13.68 42.33 2.55 0.4 ... 2.31 0.3 6.10 2.12 2.83 8.60 7.17 2.19 0.6 4.60

24 rows × 32 columns

[268]:
(
    df.loc[:, df.isna().any()]  # keep only the columns containing a NaN
    .isna()
    .sum()                      # NaN count for each remaining column
)
[268]:
BFI1_d             3
BFI2_d             3
BFI3_d             3
BFI_d              3
Q1_mm_d_hq_dur     3
Q1_mm_d_hq_freq    3
Q1_mm_d_lq_dur     3
Q1_mm_d_lq_freq    3
Q1_mm_d_mean       3
Q1_mm_d_q5         3
Q1_mm_d_q95        3
Q2_mm_d_hq_dur     3
Q2_mm_d_hq_freq    3
Q2_mm_d_lq_dur     3
Q2_mm_d_lq_freq    3
Q2_mm_d_mean       3
Q2_mm_d_q5         3
Q2_mm_d_q95        3
Q3_mm_d_hq_dur     3
Q3_mm_d_hq_freq    3
Q3_mm_d_lq_dur     3
Q3_mm_d_lq_freq    3
Q3_mm_d_mean       3
Q3_mm_d_q5         3
Q3_mm_d_q95        3
Q_mm_d_hq_dur      3
Q_mm_d_hq_freq     3
Q_mm_d_lq_dur      3
Q_mm_d_lq_freq     3
Q_mm_d_mean        3
Q_mm_d_q5          3
Q_mm_d_q95         3
dtype: int64
[269]:
# List the dynamic feature names alphabetically, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the list in place
joined_names = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
airtemp_C_mean, pcp_mm, q_cms_obs

Print the total number of NaNs for each dynamic feature. This first requires fetching the data: `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[270]:
# Left commented out: prints the total number of NaNs for each dynamic feature.
# NOTE: `dyn_ds` is not fetched for this dataset in the preceding code cells;
# run dataset.fetch("all", dynamic_features=dataset.dynamic_features) first,
# otherwise a `dyn_ds` left over from an earlier dataset would be analysed.
# NOTE(review): the reduction has to run over "variable" (the dimension that
# xarray's Dataset.to_array() creates from the per-station data variables) and
# "time", so that one count per dynamic feature remains. Reducing over
# ["time", "dynamic_features"] would instead leave one count per station.
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

Slovenia

[271]:
# Open the Slovenia dataset from the shared download directory, without log output.
dataset = RainfallRunoff(
    'Slovenia',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Slovenia with 117 stations, 10 dynamic and 214 static features

The static features of Slovenia are the same as those of EStreams.

[272]:
# List the static feature names alphabetically, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # sorts the list in place
joined_names = ", ".join(static_features)
print("\n".join(textwrap.wrap(joined_names, width=100)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[273]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(117, 214)
[274]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
1163
[274]:
static_features
area_flag           0
area_km2            0
area_official      17
area_rel           17
aridity            21
                   ..
steep_area_fra      0
strm_dens           0
tot_area            0
watershed_group     0
zero_q_freq        21
Length: 214, dtype: int64

Find the columns that have at least one NaN value.

[275]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
[276]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[276]:
static_features
area_official                 17
area_rel                      17
aridity                       21
baseflow_index                21
dam_yr_first                 113
dam_yr_last                  113
duplicated_suspect           117
elevation                      7
end_date                       1
end_date_climatic             21
end_date_hydro                21
frac_snow                     21
hfd_mean                      21
hfd_std                       21
hp_dur                        21
hp_freq                       21
hp_time                       21
hq_dur                        21
hq_freq                       21
lp_dur                        21
lp_freq                       21
lp_time                       21
lq_dur                        27
lq_freq                       27
num_days_gaps                  1
num_years_climatic            21
num_years_hydro               21
p_mean                        21
p_seasonality                 21
pet_mean                      21
q_5                           21
q_95                          21
q_elas_Sankarasubramanian     21
q_mean                        21
q_runoff_ratio                21
res_tot_sto                  113
slope_no_unit                 21
start_date                     1
start_date_climatic           21
start_date_hydro              21
zero_q_freq                   21
dtype: int64
[277]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Print the total number of NaNs for each dynamic feature (disabled; the cell below is commented out). The data would first be fetched with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[278]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Spain

[279]:
# Open the Spain dataset from the local data directory (DATA_PATH) and
# print its one-line summary (number of stations and features).
dataset = RainfallRunoff('Spain', path=DATA_PATH, verbosity=0)
print(dataset)
Spain with 889 stations, 27 dynamic and 35 static features

The static features of Spain are the same as those of GSHA.

[280]:
# List the static features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.static_features is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[281]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(889, 35)
[282]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
30
[282]:
EVP_uncertainty(%)     11
HYRIV_ID                0
LRAD_uncertainty(%)     6
P_uncertainty(%)        0
SRAD_uncertainty(%)     0
T_uncertainty(%)        0
agency                  0
area_km2                0
cly_pc_uav              0
ele_mt_uav              0
ero_kh_uav              0
gla_pc_use              0
glc_cl_cmj              0
gwt_cm_cav              0
inu_pc_ult              0
lat                     0
lit_cl_cmj              0
long                    0
pet_uncertainty(%)     13
pnv_cl_cmj              0
prm_pc_use              0
sgr_dk_rav              0
slope_degrees           0
slt_pc_uav              0
snd_pc_uav              0
wet_pc_u01              0
wet_pc_u02              0
wet_pc_u03              0
wet_pc_u04              0
wet_pc_u05              0
wet_pc_u06              0
wet_pc_u07              0
wet_pc_u08              0
wet_pc_u09              0
wind_uncertainty(%)     0
dtype: int64

Find the columns that have at least one NaN value.

[283]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
[284]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[284]:
EVP_uncertainty(%)     11
LRAD_uncertainty(%)     6
pet_uncertainty(%)     13
dtype: int64
[285]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature (disabled; the cell below is commented out). The data would first be fetched with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[286]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Thailand

[287]:
# Open the Thailand dataset from the local data directory (DATA_PATH) and
# print its one-line summary (number of stations and features).
dataset = RainfallRunoff('Thailand', path=DATA_PATH, verbosity=0)
print(dataset)
Thailand with 73 stations, 27 dynamic and 35 static features

The static features of Thailand are the same as those of GSHA.

[288]:
# List the static features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.static_features is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[289]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(73, 35)
[290]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[290]:
EVP_uncertainty(%)     0
HYRIV_ID               0
LRAD_uncertainty(%)    0
P_uncertainty(%)       0
SRAD_uncertainty(%)    0
T_uncertainty(%)       0
agency                 0
area_km2               0
cly_pc_uav             0
ele_mt_uav             0
ero_kh_uav             0
gla_pc_use             0
glc_cl_cmj             0
gwt_cm_cav             0
inu_pc_ult             0
lat                    0
lit_cl_cmj             0
long                   0
pet_uncertainty(%)     0
pnv_cl_cmj             0
prm_pc_use             0
sgr_dk_rav             0
slope_degrees          0
slt_pc_uav             0
snd_pc_uav             0
wet_pc_u01             0
wet_pc_u02             0
wet_pc_u03             0
wet_pc_u04             0
wet_pc_u05             0
wet_pc_u06             0
wet_pc_u07             0
wet_pc_u08             0
wet_pc_u09             0
wind_uncertainty(%)    0
dtype: int64

Find the columns that have at least one NaN value.

[291]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
No NaN values
[292]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[292]:
Series([], dtype: float64)
[293]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature (disabled; the cell below is commented out). The data would first be fetched with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[294]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

USGS

[295]:
# Open the USGS dataset from the local data directory (DATA_PATH) and
# print its one-line summary (number of stations and features).
dataset = RainfallRunoff('USGS', path=DATA_PATH, verbosity=0)
print(dataset)
USGS with 12004 stations, 20 dynamic and 29 static features

The static features of USGS are the same as those of HYSETS.

[296]:
# List the static features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.static_features is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Drainage_Area_GSIM_km2, Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries,
Flag_Land_Use_Extraction, Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction,
Gravelius, Hydrometric_station_latitude, Hydrometric_station_longitude, Land_Use_Crops_frac,
Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac, Land_Use_Snow_Ice_frac,
Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name, Perimeter,
Permeability_logk_m2, Porosity_frac, Source, area_km2, lat, long, slope_degrees
[297]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(12004, 29)
[298]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
16551
[298]:
Source                               0
Name                                 0
lat                                  0
long                                 0
area_km2                             0
Drainage_Area_GSIM_km2           11884
Flag_GSIM_boundaries                 0
Flag_Artificial_Boundaries           0
Elevation_m                          1
slope_degrees                        1
Gravelius                         1168
Perimeter                         1168
Flag_Shape_Extraction                0
Aspect_deg                           1
Flag_Terrain_Extraction              0
Land_Use_Forest_frac                 3
Land_Use_Grass_frac                  3
Land_Use_Wetland_frac                3
Land_Use_Water_frac                  3
Land_Use_Urban_frac                  3
Land_Use_Shrubs_frac                 3
Land_Use_Crops_frac                  3
Land_Use_Snow_Ice_frac               3
Flag_Land_Use_Extraction             0
Permeability_logk_m2              1152
Porosity_frac                     1152
Flag_Subsoil_Extraction              0
Hydrometric_station_latitude         0
Hydrometric_station_longitude        0
dtype: int64

Find the columns that have at least one NaN value.

[299]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
[300]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[300]:
Drainage_Area_GSIM_km2    11884
Elevation_m                   1
slope_degrees                 1
Gravelius                  1168
Perimeter                  1168
Aspect_deg                    1
Land_Use_Forest_frac          3
Land_Use_Grass_frac           3
Land_Use_Wetland_frac         3
Land_Use_Water_frac           3
Land_Use_Urban_frac           3
Land_Use_Shrubs_frac          3
Land_Use_Crops_frac           3
Land_Use_Snow_Ice_frac        3
Permeability_logk_m2       1152
Porosity_frac              1152
dtype: int64
[301]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpa, airtemp_C_2m_max, airtemp_C_2m_min, cloudcover, dptemp_C_mean_2m, evap_mm,
evap_mm_snow, lwdownrad_wm2, lwnetrad_wm2, pcp_mm, q_cms_obs, q_mm_obs, snowdensity_kgm3,
snowfall_mm, snowmelt_mm, solrad_wm2, solradnet_wm2, swe_mm, windspeedu_mps, windspeedv_mps
[302]:
# _, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

Print the total number of NaNs for each dynamic feature (disabled; the surrounding cells are commented out). The loop would be: `for feat, nans in zip(dyn_ds.dynamic_features.data.tolist(), dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()):`

[303]:
#     print(feat, nans)

WaterBenchIowa

[304]:
# Open the WaterBenchIowa dataset from the local data directory (DATA_PATH)
# and print its one-line summary (number of stations and features).
dataset = RainfallRunoff('WaterBenchIowa', path=DATA_PATH, verbosity=0)
print(dataset)
WaterBenchIowa with 125 stations, 3 dynamic and 7 static features
[305]:
# List the static features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.static_features is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_km2, loam, sandy_clay_loam, silt, silty_clay_loam, slope_perc, travel_time
[306]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(125, 7)
[307]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[307]:
travel_time        0
area_km2           0
slope_perc         0
loam               0
silt               0
sandy_clay_loam    0
silty_clay_loam    0
dtype: int64

Find the columns that have at least one NaN value.

[308]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
No NaN values
[309]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[309]:
Series([], dtype: float64)
[310]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
et, pcp_mm, q_mm_obs

Regional Datasets without observed streamflow

The following datasets do not have observed streamflow data. However, they behave similarly to the datasets with observed streamflow data.

GSHA

This dataset contains climate (dynamic) variables and static features for catchments around the world. These dynamic and static features are used for other dataset classes like Spain, Thailand and Japan.

[311]:
# Open the GSHA dataset from the local data directory (DATA_PATH) and
# print its one-line summary (number of stations and features).
dataset = RainfallRunoff('GSHA', path=DATA_PATH, verbosity=0)
print(dataset)
GSHA with 21568 stations, 26 dynamic and 35 static features
[312]:
# List the static features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.static_features is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[313]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(21568, 35)
[314]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
3442
[314]:
EVP_uncertainty(%)     1224
HYRIV_ID                  0
LRAD_uncertainty(%)     630
P_uncertainty(%)          0
SRAD_uncertainty(%)       0
T_uncertainty(%)          8
agency                    0
area_km2                  0
cly_pc_uav                0
ele_mt_uav                0
ero_kh_uav                0
gla_pc_use                0
glc_cl_cmj                0
gwt_cm_cav                0
inu_pc_ult                0
lat                       0
lit_cl_cmj                0
long                      0
pet_uncertainty(%)     1580
pnv_cl_cmj                0
prm_pc_use                0
sgr_dk_rav                0
slope_degrees             0
slt_pc_uav                0
snd_pc_uav                0
wet_pc_u01                0
wet_pc_u02                0
wet_pc_u03                0
wet_pc_u04                0
wet_pc_u05                0
wet_pc_u06                0
wet_pc_u07                0
wet_pc_u08                0
wet_pc_u09                0
wind_uncertainty(%)       0
dtype: int64

Find the columns that have at least one NaN value.

[315]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
[316]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[316]:
EVP_uncertainty(%)     1224
LRAD_uncertainty(%)     630
T_uncertainty(%)          8
pet_uncertainty(%)     1580
dtype: int64
[317]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2, swe_mm_era5,
windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

EStreams

The EStreams dataset does not contain observed streamflow data. However, it contains other climate (dynamic) variables and static features for European catchments. These dynamic and static features are used for other European dataset classes like Portugal, Slovenia, Finland, Italy, Ireland and Poland.

[318]:
# Open the EStreams dataset from the local data directory (DATA_PATH) and
# print its one-line summary (number of stations and features).
dataset = RainfallRunoff('EStreams', path=DATA_PATH, verbosity=0)
print(dataset)
EStreams with 17130 stations, 9 dynamic and 214 static features
[319]:
# List the static features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.static_features is not reordered in place.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[320]:
# Fetch the static features as a table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(17130, 214)
[321]:
# Per-column NaN counts: print the grand total, then show the breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
193553
[321]:
static_features
area_flag             0
area_km2              0
area_official      1355
area_rel           1356
aridity            3393
                   ...
steep_area_fra        0
strm_dens             0
tot_area              0
watershed_group       0
zero_q_freq        3614
Length: 214, dtype: int64

Find the columns that have at least one NaN value.

[322]:
# Show the columns that contain at least one NaN.
# The frame is passed to display() because an expression inside an `if`
# body is not the cell's last expression, so Jupyter would render nothing.
nan_counts = df.isna().sum()
if nan_counts.sum() > 0:
    display(df.loc[:, nan_counts > 0])
else:
    print('No NaN values')
[323]:
# NaN counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[323]:
static_features
area_official            1355
area_rel                 1356
aridity                  3393
baseflow_index           3728
bedrk_dep                   8
                         ...
stations_dens_t_max         1
stations_dens_t_mean        1
stations_dens_t_min         1
stations_dens_ws_mean       1
zero_q_freq              3614
Length: 156, dtype: int64
[324]:
# List the dynamic features alphabetically, wrapped at 100 characters.
# sorted() builds a new list, so the list returned by
# dataset.dynamic_features is not reordered in place.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, rh_%, solrad_wm2, sp_mean,
windspeed_mps