Summary of Daily Rainfall Runoff Datasets

This notebook summarizes all rainfall-runoff datasets available in the package that provide hydrometeorological data at a daily timestep. It also shows how to access these datasets through the unified interface of the RainfallRunoff class.

When this notebook was executed, all the datasets had already been downloaded. If you run it for the first time, the datasets have to be downloaded first, so it may take days to finish, or it may even fail before reaching the end because of internet connection issues.

[1]:
import os
import site
import textwrap

# Make the in-development copy of the package importable.
# A notebook defines no ``__file__``, so the literal string '__file__' is used:
# ``realpath`` resolves it against the current working directory, and the four
# ``dirname`` calls therefore yield the directory three levels above the one
# the notebook is run from.  Shallower alternatives are kept for reference.
wd_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath('__file__')))))
# wd_dir = os.path.dirname(os.path.dirname(os.path.realpath('__file__')))
# wd_dir = os.path.dirname(os.path.realpath('__file__'))
print(wd_dir)
site.addsitedir(wd_dir)

import matplotlib
nice_fonts = {
    #"text.usetex": True,
    "font.family": "sans-serif",
    #"font.serif" : "Times New Roman",
}
matplotlib.rcParams.update(nice_fonts)

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

from easy_mpl.utils import despine_axes

# imported after ``site.addsitedir(wd_dir)`` so that the copy under wd_dir can be found
from aqua_fetch.utils import print_info
from aqua_fetch import RainfallRunoff

# record library versions and machine details alongside the results
print_info()

# Path where the data will be downloaded or has previously been downloaded.
# The machine-specific default can be overridden without editing the notebook
# by setting the AQUA_FETCH_DATA_PATH environment variable.
DATA_PATH = os.environ.get(
    "AQUA_FETCH_DATA_PATH",
    '/mnt/datawaha/hyex/atr/gscad_database/raw',
)
/home/abbaa0a/AquaFetch
numpy 1.26.4
pandas 2.2.3
aqua_fetch 1.0.0rc3
python 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0]
os posix
matplotlib 3.8.4
shapefile 2.3.1
xarray 2024.7.0
netCDF4 1.6.2
scipy 1.13.0
fiona 1.10.1
shapely 2.0.6
Script Executed on:  28 July 2025 10:06:23
tot_cpus 112
avail_cpus 112
mem_gib 251.52816772460938
[2]:
# Dataset name -> directory that is passed as ``path`` to RainfallRunoff.
# The insertion order matters: the plotting code enumerates this dict, so a
# dataset's position selects its colour and the order in which it is scattered.
datasets = {
    "Arcticnet" : DATA_PATH,
    "Bull" : DATA_PATH,
    "CABra" : DATA_PATH,
    # GRDCCaravan markers hide those of the other datasets, so it is listed
    # early (presumably so that it is drawn before, i.e. beneath, most of them).
    # NOTE(review): it is the 4th entry, not the first -- confirm that is intended.
    "GRDCCaravan": DATA_PATH,
    # alternative location holding version 1 of CAMELS_AUS
    #"CAMELS_AUS" : os.path.join(DATA_PATH, 'CAMELS_AUS_V1'),
    "CAMELS_AUS": os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_GB" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_BR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_COL" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_US" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CL" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DK" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CH" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_FR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_FI" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_SE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_SK" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_LUX" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_IND" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_NZ" : os.path.join(DATA_PATH, 'CAMELS'),
    "Caravan_DK": DATA_PATH,
    "LamaHCE" : DATA_PATH,
    "LamaHIce" : os.path.join(DATA_PATH, 'LamaHIce_daily'),
    "HYSETS": DATA_PATH,
    "CCAM": DATA_PATH,
    "Japan": DATA_PATH,
    "Ireland": DATA_PATH,
    "Finland": DATA_PATH,
    "Italy": DATA_PATH,
    "Poland": DATA_PATH,
    "Portugal": DATA_PATH,
    "Slovenia": DATA_PATH,
    "Simbi": DATA_PATH,
    "Spain": DATA_PATH,
    "Thailand": DATA_PATH,
    "USGS": DATA_PATH,
}

# Colour table indexed by a dataset's position in `datasets`: the colours of
# the tab20 colormap followed by those of tab20b.
colors = plt.cm.tab20.colors + plt.cm.tab20b.colors

# Filled by the plotting loop: dataset name -> scatter handle (`rets`) and
# dataset name -> number of rows in its coordinates table (`items`).
rets = {}
items = {}

# Dataset names grouped by the legend box they appear in.  When the legends are
# built, block1 and block2 are sorted alphabetically, whereas block3 and block4
# keep the order given here.
block1 = ['HYSETS', 'Italy', 'CAMELS_COL', 'LamaHCE', 'LamaHIce', "CABra", "CAMELS_US",
          "CAMELS_CL", 'Ireland', 'Spain', 'Poland', 'CAMELS_SE', 'USGS', "Bull", "CAMELS_BR"]

block2 = ['CAMELS_DK', 'CAMELS_FR', 'CAMELS_DE', 'Portugal',
          "CAMELS_GB", "CAMELS_CH", "Caravan_DK"]

block3 = ['Thailand', 'CCAM', 'CAMELS_LUX', 'Finland',
          'CAMELS_IND', "Simbi", 'GRDCCaravan']

block4 = ['CAMELS_NZ', 'CAMELS_AUS', 'CAMELS_SK', "Japan", 'Arcticnet',
          'CAMELS_FI', 'Slovenia']

# Collect the station coordinates of every dataset, keyed by dataset name.
# Each table is later read through its 'long' and 'lat' columns when plotting.
coords_data = {}
for src, path in datasets.items():
    # the dataset object is only needed for this one call, so it is not kept
    coords_data[src] = RainfallRunoff(src, path=path, verbosity=0).stn_coords()

# draw the figure
_, ax = plt.subplots(figsize=(10, 12))

# Named ``basemap`` rather than ``map`` so that the builtin ``map`` is not
# shadowed for the rest of the notebook.
basemap = Basemap(ax=ax, resolution='l',
                  #llcrnrlon=-180, llcrnrlat=-60, urcrnrlon=180, urcrnrlat=90
                  )
basemap.drawcoastlines(linewidth=0.3, ax=ax, color="gray", zorder=0)

# One scatter layer per dataset; the colour is selected by the dataset's
# position in `datasets`.
for idx, src in enumerate(datasets.keys()):

    coords = coords_data[src]

    ret = basemap.scatter(coords['long'].values, coords['lat'].values,
                          marker=".",
                          s=2,
                          linewidths=0.0,
                          color=colors[idx],
                          alpha=1.0,
                          label=f"{src} (n={coords.shape[0]})")

    rets[src] = ret                 # handle reused as the legend entry
    items[src] = coords.shape[0]    # row count shown as "n=" in the legend

# The stations are split over four legend boxes.  Each spec holds the dataset
# names in display order, the anchor of the box's lower-left corner in axes
# coordinates, and the keyword arguments specific to that box.
LEGEND_FONTSIZE = 8
legend_specs = [
    (sorted(block1), (0.001, 0.001), {"framealpha": 0.6}),
    (sorted(block2), (0.34, 0.001), {}),
    (block3, (0.60, 0.001), {}),
    (block4, (0.79, 0.71), {"framealpha": 0.5}),
]

legends = []
for legend_sources, anchor, extra_kws in legend_specs:
    legends.append(ax.legend(
        [rets[src] for src in legend_sources],
        [f"{src} (n={items[src]})" for src in legend_sources],
        markerscale=12,
        fontsize=LEGEND_FONTSIZE,
        borderpad=0.2,
        labelspacing=0.5,
        title_fontproperties={'weight': 'bold', 'size': LEGEND_FONTSIZE + 2},
        bbox_to_anchor=anchor,
        loc="lower left",
        **extra_kws,
    ))

# names kept for any later cell that refers to the individual legends
leg1, leg2, leg3, leg4 = legends

# Every ``ax.legend`` call replaces the legend owned by the axes, so the
# earlier ones have to be re-attached as plain artists.  The last one is still
# owned by the axes; adding it again would draw it twice and darken its
# semi-transparent frame.
for legend in legends[:-1]:
    ax.add_artist(legend)

despine_axes(ax)
#plt.savefig("rr_stations.png", dpi=600, bbox_inches="tight")
plt.show()
Extracted attributes.7z
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2179: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2732: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(os.path.join(fpath),
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2743: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(fpath,
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:320: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:320: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:320: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
../_images/_notebooks_rr_summary_2_2.svg

Arcticnet

[3]:
# Open Arcticnet through the unified RainfallRunoff interface and print its summary line.
dataset = RainfallRunoff('Arcticnet', path=DATA_PATH, verbosity=0)
print(dataset)
Arcticnet with 106 stations, 27 dynamic and 35 static features

The static features of Arcticnet are the same as those of GSHA.

[4]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
static_features = sorted(dataset.static_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[5]:
# Static features as a table; the printed shape is (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(106, 35)
[6]:
# NaN count of every column, computed once: the grand total is printed and the
# per-column counts are echoed as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
22
[6]:
EVP_uncertainty(%)      9
HYRIV_ID                0
LRAD_uncertainty(%)     2
P_uncertainty(%)        0
SRAD_uncertainty(%)     0
T_uncertainty(%)        0
agency                  0
area_km2                0
cly_pc_uav              0
ele_mt_uav              0
ero_kh_uav              0
gla_pc_use              0
glc_cl_cmj              0
gwt_cm_cav              0
inu_pc_ult              0
lat                     0
lit_cl_cmj              0
long                    0
pet_uncertainty(%)     11
pnv_cl_cmj              0
prm_pc_use              0
sgr_dk_rav              0
slope_degrees           0
slt_pc_uav              0
snd_pc_uav              0
wet_pc_u01              0
wet_pc_u02              0
wet_pc_u03              0
wet_pc_u04              0
wet_pc_u05              0
wet_pc_u06              0
wet_pc_u07              0
wet_pc_u08              0
wet_pc_u09              0
wind_uncertainty(%)     0
dtype: int64

Find the columns that have at least one NaN value.

[7]:
# Only the last top-level expression of a cell is echoed; a bare expression
# inside an ``if`` block is silently discarded, so the frame is displayed
# explicitly (``display`` is provided by the IPython kernel).
nan_columns = df.loc[:, df.isna().sum() > 0]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
[8]:
# keep only the columns that contain NaNs, then count the NaNs in each of them
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[8]:
EVP_uncertainty(%)      9
LRAD_uncertainty(%)     2
pet_uncertainty(%)     11
dtype: int64
[9]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
dynamic_features = sorted(dataset.dynamic_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature of Arcticnet.

[10]:
# NOTE(review): this cell is disabled, so no NaN counts are printed for
# Arcticnet; the reason is not recorded here -- confirm before re-enabling.
# The reduction below sums over the stacked per-station dimension (xarray names
# it "variable" by default) and "time", leaving one count per dynamic feature.

# _, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

Bull

[11]:
# Open Bull through the unified RainfallRunoff interface and print its summary line.
dataset = RainfallRunoff('Bull', path=DATA_PATH, verbosity=0)
print(dataset)
Extracted attributes.7z
Bull with 484 stations, 55 dynamic and 214 static features
[12]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
static_features = sorted(dataset.static_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(static_features), width=100))
NSE, aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area_fraction_used_for_aggregation,
area_hydroatlas, area_km2, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj, cmi_ix_s01,
cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08, cmi_ix_s09,
cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn, dis_m3_pmx,
dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj, fmh_cl_smj,
for_pc_sse, frac_snow, gauge_name, gdp_ud_sav, gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01,
glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09,
glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17,
glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09,
hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse,
kar_pc_sse, lat, lit_cl_smj, lka_pc_sse, lkv_mc_usu, long, low_prec_dur, low_prec_freq,
moisture_index, nli_ix_sav, non-altered, p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02,
pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10,
pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04,
pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12,
pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03,
pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11,
pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu,
run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02,
snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10,
snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03,
swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11,
swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04,
tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12,
tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03,
wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[13]:
# Static features as a table; the printed shape is (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(484, 214)
[14]:
# NaN count of every column, computed once: the grand total is printed and the
# per-column counts are echoed as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[14]:
NSE           0
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 214, dtype: int64

Find the columns that have at least one NaN value.

[15]:
# Only the last top-level expression of a cell is echoed; a bare expression
# inside an ``if`` block is silently discarded, so the frame is displayed
# explicitly (``display`` is provided by the IPython kernel).
nan_columns = df.loc[:, df.isna().sum() > 0]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
No NaN values
[16]:
# keep only the columns that contain NaNs, then count the NaNs in each of them
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[16]:
Series([], dtype: float64)
[17]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
dynamic_features = sorted(dataset.dynamic_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_AEMET_max, airtemp_C_AEMET_min, airtemp_C_EMO1arc_max,
airtemp_C_EMO1arc_min, airtemp_C_ERA5Land_max, airtemp_C_ERA5Land_min, airtemp_C_mean_2m,
airtemp_C_mean_AEMET, airtemp_C_mean_EMO1arc, airtemp_C_mean_ERA5Land, dptemp_C_max, dptemp_C_mean,
dptemp_C_min, pcp_mm_AEMET, pcp_mm_BULL, pcp_mm_EMO1arc, pcp_mm_ERA5Land, pet_mm_AEMET,
pet_mm_EMO1arc, pet_mm_ERA5Land, pevap_mm, q_cms_obs, solrad_wm2, solrad_wm2_max, solrad_wm2_min,
streamflow, surface_pressure_max_BULL, surface_pressure_mean_BULL, surface_pressure_min_BULL,
swe_mm, swe_mm_max, swe_mm_min, thermrad_wm2, thermrad_wm2_max, thermrad_wm2_min,
volumetric_soil_water_layer_1_max_BULL, volumetric_soil_water_layer_1_mean_BULL,
volumetric_soil_water_layer_1_min_BULL, volumetric_soil_water_layer_2_max_BULL,
volumetric_soil_water_layer_2_mean_BULL, volumetric_soil_water_layer_2_min_BULL,
volumetric_soil_water_layer_3_max_BULL, volumetric_soil_water_layer_3_mean_BULL,
volumetric_soil_water_layer_3_min_BULL, volumetric_soil_water_layer_4_max_BULL,
volumetric_soil_water_layer_4_mean_BULL, volumetric_soil_water_layer_4_min_BULL,
windspeedu_mps_max_10m, windspeedu_mps_mean_10m, windspeedu_mps_min_10m, windspeedv_mps_max_10m,
windspeedv_mps_mean_10m, windspeedv_mps_min_10m

Print the total number of NaNs for each dynamic feature of Bull.

[18]:
# Fetch every dynamic feature; the first element of the returned pair is not
# needed here and is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# echo the Dataset summary as the cell result
dyn_ds
[18]:
<xarray.Dataset> Size: 6GB
Dimensions:           (time: 25932, dynamic_features: 55)
Coordinates:
  * time              (time) datetime64[ns] 207kB 1951-01-02 ... 2021-12-31
  * dynamic_features  (dynamic_features) object 440B 'airtemp_C_2m_max' ... '...
Data variables: (12/484)
    BULL_2161         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9049         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9122         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_1724         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_1765         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_1519         (time, dynamic_features) float64 11MB nan nan ... nan nan
    ...                ...
    BULL_1378         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_3244         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_3261         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_2005         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_9255         (time, dynamic_features) float64 11MB nan nan ... nan nan
    BULL_8083         (time, dynamic_features) float64 11MB nan nan ... nan nan
[19]:
# ``to_array`` stacks the per-station variables along a new leading dimension,
# named "station" here.  Summing over that dimension and "time" leaves exactly
# one NaN count per dynamic feature.  Reducing over ("time", "dynamic_features")
# instead leaves one count per *station*, which zip() would then pair with the
# feature names.
nan_counts = dyn_ds.to_array(dim="station").isnull().sum(dim=["station", "time"])

for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
airtemp_C_2m_max 585811
airtemp_C_2m_min 572499
airtemp_C_AEMET_max 570635
airtemp_C_AEMET_min 578900
airtemp_C_EMO1arc_max 569999
airtemp_C_EMO1arc_min 586794
airtemp_C_ERA5Land_max 577950
airtemp_C_ERA5Land_min 589089
airtemp_C_mean_2m 584973
airtemp_C_mean_AEMET 598617
airtemp_C_mean_EMO1arc 572544
airtemp_C_mean_ERA5Land 570020
dptemp_C_max 598018
dptemp_C_mean 571355
dptemp_C_min 581343
pcp_mm_AEMET 575104
pcp_mm_BULL 588146
pcp_mm_EMO1arc 579974
pcp_mm_ERA5Land 591836
pet_mm_AEMET 591136
pet_mm_EMO1arc 577725
pet_mm_ERA5Land 592994
pevap_mm 593155
q_cms_obs 570794
solrad_wm2 570828
solrad_wm2_max 596744
solrad_wm2_min 584098
streamflow 571009
surface_pressure_max_BULL 591149
surface_pressure_mean_BULL 579847
surface_pressure_min_BULL 577954
swe_mm 588528
swe_mm_max 578312
swe_mm_min 577366
thermrad_wm2 585104
thermrad_wm2_max 579395
thermrad_wm2_min 585983
volumetric_soil_water_layer_1_max_BULL 594719
volumetric_soil_water_layer_1_mean_BULL 595829
volumetric_soil_water_layer_1_min_BULL 578394
volumetric_soil_water_layer_2_max_BULL 582855
volumetric_soil_water_layer_2_mean_BULL 594826
volumetric_soil_water_layer_2_min_BULL 588055
volumetric_soil_water_layer_3_max_BULL 580137
volumetric_soil_water_layer_3_mean_BULL 576932
volumetric_soil_water_layer_3_min_BULL 569987
volumetric_soil_water_layer_4_max_BULL 596923
volumetric_soil_water_layer_4_mean_BULL 570587
volumetric_soil_water_layer_4_min_BULL 575241
windspeedu_mps_max_10m 572876
windspeedu_mps_mean_10m 582761
windspeedu_mps_min_10m 582280
windspeedv_mps_max_10m 575214
windspeedv_mps_mean_10m 570233
windspeedv_mps_min_10m 588150

CABra

[20]:
# Open CABra through the unified RainfallRunoff interface and print its summary line.
dataset = RainfallRunoff('CABra', path=DATA_PATH, verbosity=0)
print(dataset)
CABra with 735 stations, 13 dynamic and 87 static features
[21]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
static_features = sorted(dataset.static_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(static_features), width=100))
ANA_ID, aquif_name, aquif_type, area_km2, aridity_index, baseflow_index, catch_hand, catch_lith,
catch_order, catch_wtd, clim_et, clim_p, clim_pet, clim_quality, clim_rh, clim_srad, clim_tmax,
clim_tmin, clim_wind, cover_bare, cover_crops, cover_forest, cover_grass, cover_main, cover_moss,
cover_shrub, cover_snow, cover_urban, cover_waterp, cover_waters, dist_urban, elev_gauge, elev_max,
elev_mean, elev_min, fdc_slope, gauge_biome, gauge_hreg, gauge_state, hand_class, hdisturb_index,
lat, long, missing_data, ndvi_djf, ndvi_jja, ndvi_mam, ndvi_son, p_seasonality, q_1, q_5, q_95,
q_99, q_cv, q_elasticity, q_hcv, q_hd, q_hf, q_hfd, q_lcv, q_ld, q_lf, q_mean, q_zero,
quality_index, res_area, res_number, res_regulation, res_volume, runoff_coef, series_length,
slope_perc, soil_bulk, soil_carbon, soil_clay, soil_depth, soil_sand, soil_silt, soil_textclass,
soil_type, sub_hconduc, sub_permeability, sub_porosity, water_demand, well_dynamic, well_number,
well_static
[22]:
# Static features as a table; the printed shape is (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(735, 87)
[23]:
# NaN count of every column, computed once: the grand total is printed and the
# per-column counts are echoed as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[23]:
ANA_ID           0
aquif_name       0
aquif_type       0
area_km2         0
aridity_index    0
                ..
sub_porosity     0
water_demand     0
well_dynamic     0
well_number      0
well_static      0
Length: 87, dtype: int64

Find the columns that have at least one NaN value.

[24]:
# Only the last top-level expression of a cell is echoed; a bare expression
# inside an ``if`` block is silently discarded, so the frame is displayed
# explicitly (``display`` is provided by the IPython kernel).
nan_columns = df.loc[:, df.isna().sum() > 0]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
No NaN values
[25]:
# keep only the columns that contain NaNs, then count the NaNs in each of them
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[25]:
Series([], dtype: float64)
[26]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
dynamic_features = sorted(dataset.dynamic_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(dynamic_features), width=100))
Quality, aet_mm_ens, airtemp_C_ens_max, airtemp_C_ens_min, airtemp_C_mean_ens, pcp_mm_ens,
pet_mm_hg, pet_mm_pm, pet_mm_pt, q_cms_obs, rh_%_ens, solrad_wm2_ens, windspeed_mps_ens

Print the total number of NaNs for each dynamic feature of CABra.

[27]:
# Fetch every dynamic feature; the first element of the returned pair is not
# needed here and is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# echo the Dataset summary as the cell result
dyn_ds
[27]:
<xarray.Dataset> Size: 838MB
Dimensions:           (time: 10957, dynamic_features: 13)
Coordinates:
  * time              (time) datetime64[ns] 88kB 1980-10-01 ... 2010-09-30
  * dynamic_features  (dynamic_features) <U18 936B 'Quality' ... 'windspeed_m...
Data variables: (12/735)
    1                 (time, dynamic_features) float64 1MB ...
    2                 (time, dynamic_features) float64 1MB ...
    3                 (time, dynamic_features) float64 1MB ...
    4                 (time, dynamic_features) float64 1MB ...
    5                 (time, dynamic_features) float64 1MB ...
    6                 (time, dynamic_features) float64 1MB ...
    ...                ...
    730               (time, dynamic_features) float64 1MB ...
    731               (time, dynamic_features) float64 1MB ...
    732               (time, dynamic_features) float64 1MB ...
    733               (time, dynamic_features) float64 1MB ...
    734               (time, dynamic_features) float64 1MB ...
    735               (time, dynamic_features) float64 1MB ...
[28]:
# ``to_array`` stacks the per-station variables along a new leading dimension,
# named "station" here.  Summing over that dimension and "time" leaves exactly
# one NaN count per dynamic feature.  Reducing over ("time", "dynamic_features")
# instead leaves one count per *station*, which zip() would then pair with the
# feature names.
nan_counts = dyn_ds.to_array(dim="station").isnull().sum(dim=["station", "time"])

for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
Quality 0
aet_mm_ens 0
airtemp_C_ens_max 0
airtemp_C_ens_min 0
airtemp_C_mean_ens 0
pcp_mm_ens 0
pet_mm_hg 0
pet_mm_pm 0
pet_mm_pt 0
q_cms_obs 122
rh_%_ens 92
solrad_wm2_ens 173
windspeed_mps_ens 151

CAMELS_AUS

[29]:
# Version 1 of CAMELS_AUS, read from its own 'CAMELS_AUS_V1' directory under DATA_PATH.
dataset = RainfallRunoff('CAMELS_AUS', path=os.path.join(DATA_PATH, 'CAMELS_AUS_V1'), version=1, verbosity=0)
print(dataset)
CAMELS_AUS with 222 stations, 28 dynamic and 166 static features
[30]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
static_features = sorted(dataset.static_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, anngro_mega, anngro_meso, anngro_micro, area_km2, aridity, baseflow_index, carbnatesed,
catchment_di, claya, clayb, confinement, daystart, daystart_P, daystart_Q, distupdamw,
drainage_division, elev_max, elev_mean, elev_min, elev_range, elongratio, end_date, erosivity,
extract_ind_fac, flow_div_fac, flow_regime_di, frac_snow, geol_prim, geol_prim_prop, geol_sec,
geol_sec_prop, gromega_seas, gromeso_seas, gromicro_seas, hdf_mean, high_prec_dur, high_prec_freq,
high_prec_timing, high_q_dur, high_q_freq, igneous, impound_fac, infrastruc_fac, ksat, landuse_fac,
lat, lat_centroid, lc01_extracti, lc03_waterbo, lc04_saltlak, lc05_irrcrop, lc06_irrpast,
lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar, lc11_wetlands, lc14_tussclo, lc15_alpineg,
lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden, lc25_shrbope, lc31_forclos, lc32_foropen,
lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac, long, long_centroid, low_prec_dur,
low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, map_zone, mean_slope_pct, metamorph,
mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4, mrvbf_prop_5, mrvbf_prop_6,
mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status, next_station_ds, notes, npp_1, npp_10,
npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7, npp_8, npp_9, npp_ann, num_nested_within,
nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n, nvis_grasses_e, nvis_grasses_n,
nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n, nvis_woodlands_e, nvis_woodlands_n,
oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1, pop_gt_10, pop_max, pop_mean,
prop_forested, prop_missing_data, q_mean, q_uncert_n, q_uncert_num_curves, q_uncert_q10,
q_uncert_q10_lower, q_uncert_q10_upper, q_uncert_q50, q_uncert_q50_lower, q_uncert_q50_upper,
q_uncert_q90, q_uncert_q90_lower, q_uncert_q90_upper, relief, reliefratio, river_di, river_region,
runoff_ratio, sanda, sedvolc, settlement_fac, silicsed, slope_fdc, solpawhc, solum_thickness,
start_date, state_alt, state_outlet, station_name, strahler, strdensity, stream_elas, unconsoldted,
upsdist, zero_q_freq
[31]:
# Static features as a table; the printed shape is (rows, columns).
df = dataset.fetch_static_features()
print(df.shape)
(222, 166)
[32]:
# NaN count of every column, computed once: the grand total is printed and the
# per-column counts are echoed as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
1175
[32]:
station_name         0
drainage_division    0
river_region         0
notes                0
lat                  0
                    ..
npp_8                0
npp_9                0
npp_10               0
npp_11               0
npp_12               0
Length: 166, dtype: int64

Find the columns that have at least one NaN value.

[33]:
# show only the columns that contain at least one NaN
has_nan = df.isna().sum() > 0
df.loc[:, has_nan]
[33]:
state_alt next_station_ds q_uncert_num_curves q_uncert_n q_uncert_q10 q_uncert_q10_upper q_uncert_q10_lower q_uncert_q50 q_uncert_q50_upper q_uncert_q50_lower q_uncert_q90 q_uncert_q90_upper q_uncert_q90_lower
station_id
912101A NT NaN 3.0 15226.0 0.015122 25.07% -21.06% 0.027200 20.06% -17.82% 0.121670 18.46% -15.13%
912105A NT 912101A 1.0 15232.0 0.016572 196.84% -93.24% 0.031969 129.72% -77.38% 0.161384 49.79% -40.02%
915011A NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
917107A NaN NaN 2.0 15772.0 0.001552 143.47% -66.93% 0.036077 51.70% -37.00% 0.371124 26.85% -22.30%
919003A NaN NaN 1.0 14933.0 0.004731 21.65% -18.16% 0.053229 15.45% -13.59% 1.273285 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
312061 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
314207 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
314213 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
315450 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
318076 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

222 rows × 13 columns

[34]:
# keep only the columns that contain NaNs, then count the NaNs in each of them
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[34]:
state_alt              212
next_station_ds        192
q_uncert_num_curves     56
q_uncert_n              56
q_uncert_q10            56
q_uncert_q10_upper     118
q_uncert_q10_lower     118
q_uncert_q50            56
q_uncert_q50_upper      66
q_uncert_q50_lower      67
q_uncert_q90            56
q_uncert_q90_upper      61
q_uncert_q90_lower      61
dtype: int64
[35]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
dynamic_features = sorted(dataset.dynamic_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
airtemp_C_awap_max, airtemp_C_awap_min, airtemp_C_mean_awap, airtemp_C_mean_silo,
airtemp_C_silo_max, airtemp_C_silo_min, et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO,
evap_syn_SILO, mslp_SILO, pcp_mm_awap, pcp_mm_silo, precipitation_var_AWAP, q_cms_obs, q_mmd_obs,
rh_%_silo_tmax, rh_%_silo_tmin, solrad_wm2_awap, solrad_wm2_silo, streamflow_MLd_inclInfilled,
vp_deficit_SILO, vp_hpa_awap, vp_hpa_silo

Print the total number of NaNs for each dynamic feature of CAMELS_AUS.

[36]:
# Fetch every dynamic feature; the first element of the returned pair is not
# needed here and is discarded.
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

# echo the Dataset summary as the cell result
dyn_ds
[36]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 23376, dynamic_features: 28)
Coordinates:
  * time              (time) datetime64[ns] 187kB 1951-01-01 ... 2014-12-31
  * dynamic_features  (dynamic_features) <U27 3kB 'q_cms_obs' ... 'airtemp_C_...
Data variables: (12/222)
    912101A           (time, dynamic_features) float64 5MB ...
    912105A           (time, dynamic_features) float64 5MB ...
    915011A           (time, dynamic_features) float64 5MB ...
    917107A           (time, dynamic_features) float64 5MB ...
    919003A           (time, dynamic_features) float64 5MB ...
    919201A           (time, dynamic_features) float64 5MB ...
    ...                ...
    308799            (time, dynamic_features) float64 5MB ...
    312061            (time, dynamic_features) float64 5MB ...
    314207            (time, dynamic_features) float64 5MB ...
    314213            (time, dynamic_features) float64 5MB ...
    315450            (time, dynamic_features) float64 5MB ...
    318076            (time, dynamic_features) float64 5MB ...
[37]:
# ``to_array`` stacks the per-station variables along a new leading dimension,
# named "station" here.  Summing over that dimension and "time" leaves exactly
# one NaN count per dynamic feature.  Reducing over ("time", "dynamic_features")
# instead leaves one count per *station*, which zip() would then pair with the
# feature names.
nan_counts = dyn_ds.to_array(dim="station").isnull().sum(dim=["station", "time"])

for feat, nans in zip(
    dyn_ds.dynamic_features.data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
q_cms_obs 44364
streamflow_MLd_inclInfilled 45447
q_mmd_obs 47407
aet_mm_silo_morton 43179
aet_mm_silo_morton_point 43447
et_morton_wet_SILO 44551
aet_mm_silo_short_crop 44842
aet_mm_silo_tall_crop 43675
evap_morton_lake_SILO 33747
evap_pan_SILO 46760
evap_syn_SILO 53777
pcp_mm_awap 45586
pcp_mm_silo 42297
precipitation_var_AWAP 41227
solrad_wm2_awap 47647
airtemp_C_awap_max 45007
airtemp_C_awap_min 46809
vp_hpa_awap 48264
mslp_SILO 23733
solrad_wm2_silo 42692
rh_%_silo_tmax 48848
rh_%_silo_tmin 27130
airtemp_C_silo_max 39627
airtemp_C_silo_min 25826
vp_deficit_SILO 26976
vp_hpa_silo 23379
airtemp_C_mean_silo 45916
airtemp_C_mean_awap 41402
[38]:
# Version 2 of CAMELS_AUS, read from the shared 'CAMELS' directory under DATA_PATH.
dataset = RainfallRunoff('CAMELS_AUS', path=os.path.join(DATA_PATH, 'CAMELS'), version=2, verbosity=0)
print(dataset)
CAMELS_AUS with 561 stations, 28 dynamic and 187 static features
[39]:
# sorted() builds a new list, so the list handed out by `dataset` is never
# reordered in place (an in-place .sort() would alter it if the attribute
# returns the object's own list rather than a copy).
static_features = sorted(dataset.static_features)
# comma-separated names, wrapped at 100 characters
print(textwrap.fill(", ".join(static_features), width=100))
anngro_mega, anngro_meso, anngro_micro, area_km2, aridity, carbnatesed, catchment_di, claya, clayb,
confinement, daystart, daystart_P, daystart_Q, distupdamw, drainage_division, elev_max, elev_mean,
elev_min, elev_range, elongratio, end_date, erosivity, extract_ind_fac, flow_div_fac,
flow_regime_di, frac_snow, geol_prim, geol_prim_prop, geol_sec, geol_sec_prop, gromega_seas,
gromeso_seas, gromicro_seas, high_prec_dur, high_prec_freq, high_prec_timing, igneous, impound_fac,
infrastruc_fac, ksat, landuse_fac, lat, lat_centroid, lc01_extracti, lc03_waterbo, lc04_saltlak,
lc05_irrcrop, lc06_irrpast, lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar, lc11_wetlands,
lc14_tussclo, lc15_alpineg, lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden, lc25_shrbope,
lc31_forclos, lc32_foropen, lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac, long,
long_centroid, low_prec_dur, low_prec_freq, low_prec_timing, map_zone, mean_slope_pct, metamorph,
mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4, mrvbf_prop_5, mrvbf_prop_6,
mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status, next_station_ds, notes, npp_1, npp_10,
npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7, npp_8, npp_9, npp_ann, num_nested_within,
nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n, nvis_grasses_e, nvis_grasses_n,
nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n, nvis_woodlands_e, nvis_woodlands_n,
oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1, pop_gt_10, pop_max, pop_mean,
prop_forested, prop_missing_data, q_uncert_Q_above, q_uncert_days_above, q_uncert_rmse_all,
q_uncert_rmse_lower, q_uncert_rmse_upper, q_uncert_unique_curves, relief, reliefratio, river_di,
river_region, sanda, sedvolc, settlement_fac, sig_dur_RespTime, sig_dur_high_Q_dur,
sig_dur_low_Q_dur, sig_dur_zero_Q_dur, sig_freq_high_Q_freq, sig_freq_low_Q_freq,
sig_freq_zero_Q_freq, sig_mag_BFI, sig_mag_BaseMag, sig_mag_Q5, sig_mag_Q95, sig_mag_Q_7_day_max,
sig_mag_Q_7_day_min, sig_mag_Q_CoV, sig_mag_Q_mean, sig_mag_Q_skew, sig_mag_Q_var, sig_mag_VarIdx,
sig_other_EventRR, sig_other_PeakDistribution, sig_other_PeakDistribution_low,
sig_other_QP_elasticity, sig_other_RR_seasonality, sig_other_SnowDayRatio, sig_other_SnowStorage,
sig_other_Spearmans_rho, sig_other_StorageFromBase, sig_other_TotalRR,
sig_other_ratio_Event_TotalRR, sig_roc_AC1, sig_roc_AC1_low, sig_roc_BaseRecesK, sig_roc_FDC_slope,
sig_roc_FlashIdx, sig_roc_RLD, sig_roc_RecesK_early, sig_roc_RecesVarSeasonality,
sig_timing_HFD_mean, sig_timing_HFI_mean, silicsed, solpawhc, solum_thickness, start_date,
state_alt, state_outlet, station_name, strahler, strdensity, unconsoldted, upsdist
[40]:
# Load the static features of every station into a single table.
df = dataset.fetch_static_features()
# Rows are stations and columns are static features.
n_stations, n_static_features = df.shape
print((n_stations, n_static_features))
(561, 187)
[41]:
# Count missing values per column once, print the grand total,
# then display the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
1643
[41]:
station_name         0
drainage_division    0
river_region         0
notes                0
lat                  0
                    ..
npp_8                0
npp_9                0
npp_10               0
npp_11               0
npp_12               0
Length: 187, dtype: int64

find those columns which have at least one NaN value

[42]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[42]:
state_alt next_station_ds q_uncert_unique_curves q_uncert_rmse_all q_uncert_rmse_lower q_uncert_rmse_upper q_uncert_days_above q_uncert_Q_above sig_mag_VarIdx sig_roc_FDC_slope sig_other_PeakDistribution_low
station_id
912101A NT NaN NaN NaN NaN NaN NaN NaN 0.292867 -1.916733 -2.180623
912105A NT 912101A NaN NaN NaN NaN NaN NaN 0.304694 -1.795139 -1.254491
915011A NaN NaN NaN NaN NaN NaN NaN NaN 1.083646 NaN -6.090788
915206A NaN NaN 25.0 25.172244 6.506520 20.955888 0.078362 19.011459 1.009843 NaN -8.491230
917107A NaN NaN 16.0 53.380009 1168.007627 21.192680 0.132802 12.283859 0.641856 -3.957062 -3.631162
... ... ... ... ... ... ... ... ... ... ... ...
318150 NaN 318181 8.0 13.679565 13.569136 10.168856 0.000000 0.000000 0.459891 -3.489307 -5.351701
318181 NaN NaN 24.0 8.209045 23.363542 5.920785 0.004200 0.450893 0.507649 -3.661257 -5.290249
318191 NaN 318150 11.0 8.226708 12.538870 6.093167 0.000000 0.000000 0.514683 -3.525028 -8.555535
318311 NaN 318150 10.0 19.588965 34.652832 14.517310 0.121428 11.333069 0.678704 -4.723863 -7.717046
319204 NaN NaN 5.0 6.379150 20.465465 4.794664 0.005493 0.536084 0.683732 -5.213989 -6.477004

561 rows × 11 columns

[43]:
# Number of missing values, restricted to the columns that have any.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[43]:
state_alt                         544
next_station_ds                   391
q_uncert_unique_curves            102
q_uncert_rmse_all                 102
q_uncert_rmse_lower               102
q_uncert_rmse_upper               102
q_uncert_days_above               102
q_uncert_Q_above                  102
sig_mag_VarIdx                      2
sig_roc_FDC_slope                  91
sig_other_PeakDistribution_low      3
dtype: int64
[44]:
# Print the names of all dynamic features in sorted order, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(dynamic_features), width=100)
print(wrapped_names)
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
airtemp_C_agcd_max, airtemp_C_agcd_min, airtemp_C_mean_agcd, airtemp_C_mean_silo,
airtemp_C_silo_max, airtemp_C_silo_min, et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO,
evap_syn_SILO, mslp_SILO, pcp_mm_agcd, pcp_mm_silo, precipitation_var_AGCD, q_cms_obs, q_mmd_obs,
rh_%_silo_tmax, rh_%_silo_tmin, solrad_wm2_silo, streamflow_MLd_inclInfilled, vp_deficit_SILO,
vp_hpa_agcd_h09, vp_hpa_agcd_h15, vp_hpa_silo

Print the total number of NaNs for each dynamic feature of CAMELS_AUS version 2.

[45]:
# Fetch every dynamic feature for all stations. Only the second element of the
# returned pair is kept; the first one is not needed here.
all_dynamic_features = dataset.dynamic_features
_, dyn_ds = dataset.fetch("all", dynamic_features=all_dynamic_features)

dyn_ds
[45]:
<xarray.Dataset> Size: 3GB
Dimensions:           (time: 26388, dynamic_features: 28)
Coordinates:
  * time              (time) datetime64[ns] 211kB 1950-01-01 ... 2022-03-31
  * dynamic_features  (dynamic_features) <U27 3kB 'q_cms_obs' ... 'airtemp_C_...
Data variables: (12/561)
    912101A           (time, dynamic_features) float64 6MB ...
    912105A           (time, dynamic_features) float64 6MB ...
    915011A           (time, dynamic_features) float64 6MB ...
    915206A           (time, dynamic_features) float64 6MB ...
    917107A           (time, dynamic_features) float64 6MB ...
    919003A           (time, dynamic_features) float64 6MB ...
    ...                ...
    318076            (time, dynamic_features) float64 6MB ...
    318150            (time, dynamic_features) float64 6MB ...
    318181            (time, dynamic_features) float64 6MB ...
    318191            (time, dynamic_features) float64 6MB ...
    318311            (time, dynamic_features) float64 6MB ...
    319204            (time, dynamic_features) float64 6MB ...
[46]:
# ``to_array()`` stacks the per-station variables along a new "variable"
# dimension. Summing over "time" and "variable" therefore leaves exactly one
# NaN count per dynamic feature. (Summing over "dynamic_features" instead would
# leave one count per *station*, which zip() would then silently pair with
# feature names.)
nan_counts = dyn_ds.to_array().isnull().sum(dim=["time", "variable"])

# Take the labels from the result's own coordinate so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
q_cms_obs 55225
streamflow_MLd_inclInfilled 56308
q_mmd_obs 58268
aet_mm_silo_morton 49250
aet_mm_silo_morton_point 47412
et_morton_wet_SILO 46342
aet_mm_silo_short_crop 46691
aet_mm_silo_tall_crop 47024
evap_morton_lake_SILO 45811
evap_pan_SILO 54536
evap_syn_SILO 36902
pcp_mm_agcd 48884
pcp_mm_silo 51410
precipitation_var_AGCD 64638
airtemp_C_agcd_max 56447
airtemp_C_agcd_min 46595
vp_hpa_agcd_h09 46673
vp_hpa_agcd_h15 44586
mslp_SILO 50694
solrad_wm2_silo 55868
rh_%_silo_tmax 50138
rh_%_silo_tmin 59125
airtemp_C_silo_max 25797
airtemp_C_silo_min 45619
vp_deficit_SILO 50491
vp_hpa_silo 51691
airtemp_C_mean_silo 63567
airtemp_C_mean_agcd 62430

CAMELS_GB

[47]:
# Open CAMELS_GB; its files are looked up under the 'CAMELS' folder of DATA_PATH.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_GB', path=camels_root, verbosity=0)
# The printed summary reports the number of stations and of dynamic/static features.
print(dataset)
CAMELS_GB with 671 stations, 10 dynamic and 145 static features
[48]:
# Print the names of all static features in sorted order, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(static_features), width=100)
print(wrapped_names)
Q5, Q95, abs_agriculture_perc, abs_amenities_perc, abs_energy_perc, abs_environmental_perc,
abs_industry_perc, abs_watersupply_perc, area_km2, aridity, bankfull_flow, bares_perc,
baseflow_index, baseflow_index_ceh, benchmark_catch, bulkdens, bulkdens_5, bulkdens_50, bulkdens_95,
bulkdens_missing, clay_perc, clay_perc_missing, conductivity_cosby, conductivity_cosby_5,
conductivity_cosby_50, conductivity_cosby_95, conductivity_cosby_missing, conductivity_hypres,
conductivity_hypres_5, conductivity_hypres_50, conductivity_hypres_95, conductivity_hypres_missing,
crop_perc, discharges, dom_land_cover, dpsbar, dwood_perc, elev_10, elev_50, elev_90, elev_max,
elev_mean, elev_min, ewood_perc, flow_perc_complete, flow_period_end, flow_period_start,
frac_high_perc, frac_low_perc, frac_mod_perc, frac_snow, gauge_easting, gauge_elev, gauge_name,
gauge_northing, grass_perc, groundwater_abs, hfd_mean, high_prec_dur, high_prec_freq,
high_prec_timing, high_q_dur, high_q_freq, inter_high_perc, inter_low_perc, inter_mod_perc,
inwater_perc, lat, long, low_nsig_perc, low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur,
low_q_freq, no_gw_perc, nsig_low_perc, num_reservoir, organic_perc, organic_perc_missing, p_mean,
p_seasonality, pet_mean, porosity_cosby, porosity_cosby_5, porosity_cosby_50, porosity_cosby_95,
porosity_cosby_missing, porosity_hypres, porosity_hypres_5, porosity_hypres_50, porosity_hypres_95,
porosity_hypres_missing, q25_uncert_lower, q25_uncert_upper, q50_uncert_lower, q50_uncert_upper,
q5_uncert_lower, q5_uncert_upper, q75_uncert_lower, q75_uncert_upper, q95_uncert_lower,
q95_uncert_upper, q99_uncert_lower, q99_uncert_upper, q_mean, quncert_meta, reservoir_cap,
reservoir_drain, reservoir_env, reservoir_fs, reservoir_he, reservoir_nav, reservoir_nousedata,
reservoir_wr, reservoir_year_first, reservoir_year_last, root_depth, root_depth_5, root_depth_50,
root_depth_95, root_depth_missing, runoff_ratio, sand_perc, sand_perc_missing, shrub_perc,
silt_perc, silt_perc_missing, slope_, soil_depth_pelletier, soil_depth_pelletier_5,
soil_depth_pelletier_50, soil_depth_pelletier_95, soil_depth_pelletier_missing, station_type,
stream_elas, structurefull_flow, surfacewater_abs, tawc, tawc_5, tawc_50, tawc_95, tawc_missing,
urban_perc, zero_q_freq
[49]:
# Load the static features of every station into a single table.
df = dataset.fetch_static_features()
# Rows are stations and columns are static features.
n_stations, n_static_features = df.shape
print((n_stations, n_static_features))
(671, 145)
[50]:
# Count missing values per column once, print the grand total,
# then display the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
10316
[50]:
Q5                        0
Q95                       0
abs_agriculture_perc    313
abs_amenities_perc      313
abs_energy_perc         313
                       ...
tawc_50                   0
tawc_95                   0
tawc_missing              0
urban_perc                0
zero_q_freq               0
Length: 145, dtype: int64

find those columns which have at least one NaN value

[51]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[51]:
abs_agriculture_perc abs_amenities_perc abs_energy_perc abs_environmental_perc abs_industry_perc abs_watersupply_perc bankfull_flow discharges dpsbar elev_mean ... reservoir_he reservoir_nav reservoir_nousedata reservoir_wr reservoir_year_first reservoir_year_last slope_ station_type structurefull_flow surfacewater_abs
gauge_id
38017 28.43 0.00 0.00 0.00 0.00 71.57 0.26 0.005 48.3 138.0 ... NaN NaN NaN NaN NaN NaN 1.50 C 0.5 0.000
42001 2.79 0.00 0.00 0.00 0.29 96.92 10.00 0.003 46.9 69.0 ... NaN NaN NaN NaN NaN NaN 3.80 FV 9.2 0.004
55014 47.16 0.00 0.00 0.00 0.00 52.84 46.00 0.000 158.5 299.0 ... NaN NaN NaN NaN NaN NaN 2.78 FV 8.6 0.001
27041 58.32 0.12 14.58 0.00 2.23 24.74 NaN 0.014 76.8 128.0 ... NaN NaN NaN NaN NaN NaN 2.04 C US 74.8 0.047
39078 0.65 0.00 0.00 0.00 12.04 87.31 NaN 0.049 55.3 153.0 ... NaN NaN NaN NaN NaN NaN 2.02 MIS NaN 0.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66006 NaN NaN NaN NaN NaN NaN NaN NaN 129.0 284.0 ... 0.0 0.0 0.0 100.0 1934.0 1934.0 3.62 VA NaN NaN
39014 0.94 0.00 0.00 0.03 0.00 99.03 NaN 0.019 38.5 136.0 ... NaN NaN NaN NaN NaN NaN 1.72 CC 9.8 0.000
42010 66.65 0.00 0.46 9.92 0.02 22.95 NaN 0.046 54.0 111.0 ... NaN NaN NaN NaN NaN NaN 1.06 C+TP NaN 0.714
42011 0.69 0.00 0.00 0.00 0.08 99.23 5.50 0.070 52.2 77.0 ... NaN NaN NaN NaN NaN NaN 2.06 C 4.7 0.000
43009 54.02 1.73 36.56 0.00 0.21 7.48 NaN 0.026 49.9 100.0 ... NaN NaN NaN NaN NaN NaN 3.63 CC 60.0 0.050

671 rows × 38 columns

[52]:
# Number of missing values, restricted to the columns that have any.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[52]:
abs_agriculture_perc      313
abs_amenities_perc        313
abs_energy_perc           313
abs_environmental_perc    313
abs_industry_perc         313
abs_watersupply_perc      313
bankfull_flow             310
discharges                231
dpsbar                      2
elev_mean                   2
groundwater_abs           229
high_prec_timing           15
low_prec_timing             3
q25_uncert_lower          173
q25_uncert_upper          173
q50_uncert_lower          168
q50_uncert_upper          168
q5_uncert_lower           235
q5_uncert_upper           235
q75_uncert_lower          170
q75_uncert_upper          170
q95_uncert_lower          195
q95_uncert_upper          195
q99_uncert_lower          250
q99_uncert_upper          250
reservoir_drain           509
reservoir_env             509
reservoir_fs              509
reservoir_he              509
reservoir_nav             509
reservoir_nousedata       509
reservoir_wr              509
reservoir_year_first      530
reservoir_year_last       530
slope_                      3
station_type                1
structurefull_flow        408
surfacewater_abs          229
dtype: int64
[53]:
# Print the names of all dynamic features in sorted order, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(dynamic_features), width=100)
print(wrapped_names)
airtemp_C_mean, lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_intercep, q_cms_obs, q_mmd_obs, rh_%,
solrad_wm2, windspeed_mps

Print the total number of NaNs for each dynamic feature of CAMELS-GB.

[54]:
# Fetch every dynamic feature for all stations. Only the second element of the
# returned pair is kept; the first one is not needed here.
all_dynamic_features = dataset.dynamic_features
_, dyn_ds = dataset.fetch("all", dynamic_features=all_dynamic_features)

dyn_ds
[54]:
<xarray.Dataset> Size: 882MB
Dimensions:           (time: 16436, dynamic_features: 10)
Coordinates:
  * time              (time) datetime64[ns] 131kB 1970-10-01 ... 2015-09-30
  * dynamic_features  (dynamic_features) <U15 600B 'pcp_mm' ... 'windspeed_mps'
Data variables: (12/671)
    38017             (time, dynamic_features) float64 1MB ...
    42001             (time, dynamic_features) float64 1MB ...
    55014             (time, dynamic_features) float64 1MB ...
    27041             (time, dynamic_features) float64 1MB ...
    39078             (time, dynamic_features) float64 1MB ...
    68005             (time, dynamic_features) float64 1MB ...
    ...                ...
    28033             (time, dynamic_features) float64 1MB ...
    66006             (time, dynamic_features) float64 1MB ...
    39014             (time, dynamic_features) float64 1MB ...
    42010             (time, dynamic_features) float64 1MB ...
    42011             (time, dynamic_features) float64 1MB ...
    43009             (time, dynamic_features) float64 1MB ...
[55]:
# ``to_array()`` stacks the per-station variables along a new "variable"
# dimension. Summing over "time" and "variable" therefore leaves exactly one
# NaN count per dynamic feature. (Summing over "dynamic_features" instead would
# leave one count per *station*, which zip() would then silently pair with
# feature names.)
nan_counts = dyn_ds.to_array().isnull().sum(dim=["time", "variable"])

# Take the labels from the result's own coordinate so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
pcp_mm 0
pet_mm 424
airtemp_C_mean 0
q_mmd_obs 2136
q_cms_obs 5516
pet_mm_intercep 854
rh_% 5906
solrad_wm2 0
lwdownrad_wm2 1044
windspeed_mps 368

CAMELS_BR

[56]:
# Open CAMELS_BR; its files are looked up under the 'CAMELS' folder of DATA_PATH.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_BR', path=camels_root, verbosity=0)
# The printed summary reports the number of stations and of dynamic/static features.
print(dataset)
CAMELS_BR with 897 stations, 11 dynamic and 67 static features
[57]:
# Print the names of all static features in sorted order, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(static_features), width=100)
print(wrapped_names)
Q5, Q95, area_ana, area_gsim, area_gsim_quality, area_km2, aridity, asynchronicity, barren_perc,
baseflow_index, bedrock_depth, carb_rocks_perc, clay_perc, consumptive_use, consumptive_use_perc,
crop_mosaic_perc, crop_perc, dom_land_cover, dom_land_cover_perc, elev_gauge, elev_mean, et_mean,
forest_perc, frac_snow, gauge_name, gauge_region, geol_class_1st, geol_class_1st_perc,
geol_class_2nd, geol_class_2nd_perc, geol_permeability, geol_porosity, grass_perc, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, imperv_perc, lat, long,
low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, org_carbon_content, p_mean,
p_seasonality, pet_mean, q_mean, q_quality_control_perc, q_stream_stage_perc, regulation_degree,
reservoirs_vol, runoff_ratio, sand_perc, shrub_perc, silt_perc, slope_degrees, slope_fdc, snow_perc,
stream_elas, water_table_depth, wet_perc, zero_q_freq
[58]:
# Load the static features of every station into a single table.
df = dataset.fetch_static_features()
# Rows are stations and columns are static features.
n_stations, n_static_features = df.shape
print((n_stations, n_static_features))
(897, 67)
[59]:
# Count missing values per column once, print the grand total,
# then display the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
133
[59]:
elev_gauge            0
elev_mean             0
slope_degrees         0
area_km2              0
q_mean                0
                     ..
silt_perc             0
clay_perc             0
org_carbon_content    0
bedrock_depth         0
water_table_depth     0
Length: 67, dtype: int64

find those columns which have at least one NaN value

[60]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[60]:
slope_fdc baseflow_index geol_class_2nd area_ana frac_snow high_prec_timing
gauge_id
58030000 1.08954 0.79986 acid_plutonic_rocks 796.0 0.0 djf
57170000 1.35609 0.76520 acid_plutonic_rocks 980.0 0.0 djf
39580000 1.68712 0.66356 siliciclastic_sedimentary_rocks 756.0 0.0 mam
41818000 1.98782 0.64151 metamorphics 16600.0 0.0 djf
58870000 1.39838 0.71359 metamorphics 1120.0 0.0 djf
... ... ... ... ... ... ...
26720000 6.72326 0.57567 siliciclastic_sedimentary_rocks 6610.0 0.0 djf
65925000 2.24256 0.54385 NaN 1660.0 0.0 son
39560000 2.14107 0.65966 metamorphics 4910.0 0.0 mam
71550000 2.51894 0.58372 siliciclastic_sedimentary_rocks NaN 0.0 son
41539998 1.74270 0.69456 metamorphics NaN 0.0 djf

897 rows × 6 columns

[61]:
# Number of missing values, restricted to the columns that have any.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[61]:
slope_fdc           16
baseflow_index      18
geol_class_2nd      47
area_ana            43
frac_snow            5
high_prec_timing     4
dtype: int64
[62]:
# Print the names of all dynamic features in sorted order, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(dynamic_features), width=100)
print(wrapped_names)
aet_mm_gleam, aet_mm_mgb, airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cpc,
pcp_mm_mswep, pet_mm_gleam, q_cms_obs, q_mmd_obs

Print the total number of NaNs for each dynamic feature of CAMELS-BR.

[63]:
# Fetch every dynamic feature for all stations. Only the second element of the
# returned pair is kept; the first one is not needed here.
all_dynamic_features = dataset.dynamic_features
_, dyn_ds = dataset.fetch("all", dynamic_features=all_dynamic_features)

dyn_ds
[63]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 14245, dynamic_features: 11)
Coordinates:
  * time              (time) datetime64[ns] 114kB 1980-01-01 ... 2018-12-31
  * dynamic_features  (dynamic_features) <U14 616B 'q_mmd_obs' ... 'q_cms_obs'
Data variables: (12/897)
    58030000          (time, dynamic_features) float64 1MB ...
    57170000          (time, dynamic_features) float64 1MB ...
    39580000          (time, dynamic_features) float64 1MB ...
    41818000          (time, dynamic_features) float64 1MB ...
    58870000          (time, dynamic_features) float64 1MB ...
    42546000          (time, dynamic_features) float64 1MB ...
    ...                ...
    53880000          (time, dynamic_features) float64 1MB ...
    26720000          (time, dynamic_features) float64 1MB ...
    65925000          (time, dynamic_features) float64 1MB ...
    39560000          (time, dynamic_features) float64 1MB ...
    71550000          (time, dynamic_features) float64 1MB ...
    41539998          (time, dynamic_features) float64 1MB ...
[64]:
# ``to_array()`` stacks the per-station variables along a new "variable"
# dimension. Summing over "time" and "variable" therefore leaves exactly one
# NaN count per dynamic feature. (Summing over "dynamic_features" instead would
# leave one count per *station*, which zip() would then silently pair with
# feature names.)
nan_counts = dyn_ds.to_array().isnull().sum(dim=["time", "variable"])

# Take the labels from the result's own coordinate so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
q_mmd_obs 2696
pcp_mm_cpc 2634
pcp_mm_mswep 9258
pcp_mm_chirps 3014
aet_mm_gleam 2874
aet_mm_mgb 3220
pet_mm_gleam 2708
airtemp_C_min 3004
airtemp_C_mean 9208
airtemp_C_max 2808
q_cms_obs 2946

CAMELS_COL

[65]:
# Open CAMELS_COL; its files are looked up under the 'CAMELS' folder of DATA_PATH.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_COL', path=camels_root, verbosity=0)
# The printed summary reports the number of stations and of dynamic/static features.
print(dataset)
CAMELS_COL with 347 stations, 6 dynamic and 255 static features
[66]:
# Print the names of all static features in sorted order, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(static_features), width=100)
print(wrapped_names)
Aquaculture, Beach, dune and sand spot, C-Pi, C-Sctm, C2P-Sm, D2D3-Sctm, DC-Sctm, DC1-Mmg, E1-Mlg,
E1-Pi, E1-Pm, E1-Sc, E1-St, E1E2-Hi, E1E2-Pi, E1E2-Pm, E1E2-VCm, E2-Pi, E2-Pm, E3-Pi, E3-Sc, E3-Sm,
E3-St, E3N1-Sct, E3N1-Stm, Flooded forest, Forest, Forest plantation, Glacier, Grasslands /
herbaceous, Infrastructure, J-Hf, J-Mlg, J-Pi, J-VCc, J-Vf, J-Vi, J1-Sct, J1J2-VCc, J1J2-VCct,
J1J2-Vf, J2J3-Sm, J3-Sc, J3K1?-Mlg, J?-Mhg, K1-Mlg, K1-Mmg, K1-Pf, K1-Pi, K1-Pm, K1-Pu, K1-Sct,
K1-Sctm, K1-Sm, K1-VCm, K2-Mhg, K2-Mlg, K2-Pf, K2-Pi, K2-Pm, K2-Pu, K2-Vf, K2-Vm, K2-Vu, MP-Mvlg,
MP-Pf, MP3NP1-Mhg, Mangrove, Mining, Mosaic of agriculture and pasture, N1-Sc, N1-Sct, N1-Sm, N1-St,
N1-VCc, N2-Sc, N2-Sm, N2-VCc, N2-Vi, N2-py, N2Q1-Sc, N2Q1-VCc, N2Q1-Vi, NP-Mmg, NP-Pm, NP-VCc,
NP3-Pf, NP3ε-Pm, O-Pf, O-Sm, O-Vf, O1-Pf, OS1-Mlg, OS1-Mmg, OS1-Pf, Other non forest formation,
Other non-vegetated area, P-Pf, P-Pi, P-Sctm, PP3PP4-Mmg, PP4-Pf, PZ-Sm, Palm oil, Q-Vi, Q-Vm, Q-af,
Q-al, Q-d, Q-gl, Q-py, Q-t, Q-vc, Q1-Hi, Q1-Sm, Q1-p, Q2-p, Q2-sw, Q2-vc, Q5, Q95, River, lake or
ocean, Rocky outcrop, S4D1-Mlg, T-Mhg, T-Mlg, T-Mm, T-Mmg, T-Mvlg, T-Pf, T-Pi, T-Pm, T-Pu, T2J1-VCm,
T3-Sm, T3-VCct, T3J-Pi, T?-Sc, Wetland, Wooded sand vegetation, alfisols_perc, andisols_perc,
area_km2, aridisols_perc, aridity, b1-Sct, b1-Sctm, b1?b4-Sct, b1b2-Sctm, b1b2-Stm, b1b5-Stm,
b1k1-Sm, b2b5-Sctm, b2b6-Sm, b2b6-Stm, b2k1-Sm, b2k5-Pm, b4?b6-Stm, b4b5-Mhp, b4b6-Sm, b4k1-Sm,
b5?k6-Sctm, b5b6-Sctm, b5k1-Sm, b5k4-VCm, b5k6-Sm, b6-Vf, b6?k1-Sm, b6k1-Stm, b6k1?-Sctm, b6k2-Mvlg,
b6k5-Sm, b6k6-Stm, baseflow_index, cn_catchment, coal_mine_pit_perc, e3e4-Sm, e5e6-Stm, e6e7-Stm,
e6e7-VCm, e6e8-Sc, e6e9-Sc, e6e9-Sct, e8n2-Sm, e8n2-St, e8n3-Sc, elev_gauge_m, entisols_perc,
equi_slope, eroded_misce_perc, factor_form, gauge_department, gravelius_index , high_prec_dur,
high_prec_freq, high_q_dur, high_q_freq, histosols_perc, inceptisols_perc, k1?k5-Sm, k1k4-Sm,
k1k6-Stm, k2k6-Sm, k5E1-Stm, k6E1-Sm, k6E1-Stm, k6e2-Mhp, lat, long, low_prec_dur, low_prec_freq,
low_q_dur, low_q_freq, mollisols_perc, n1?n5?-VCc, n1n2-Pi, n1n2-Sc, n1n2-St, n2n3-Hi, n3n4-Sm,
n3n5-Sc, n3n5-Sm, n3n5-St, n4n5-Pi, n4n5-Vm, n4n6-Hi, n4n6-Sc, n5n6-Sm, n5n7-Sct, n5n7-VCc, n6n7-Sc,
n6n7-Sm, n6n7-St, non_identi_land_perc, oxisols_perc, perimeter_km, perpetual_snow_perc, q_mean,
rocky_misce_perc, runoff_ratio, slope_fdc, spodosols_perc, stream_elas, streng_chanel ,
tc_Engi_Corps, tc_Johnstone, tc_chow, tc_kirpich, ultisols_perc, urban_perc, vertisols_perc,
water_bodies_perc, ¿K?-Sm, εO-Mlg, εO-Sm
[67]:
# Load the static features of every station into a single table.
df = dataset.fetch_static_features()
# Rows are stations and columns are static features.
n_stations, n_static_features = df.shape
print((n_stations, n_static_features))
(347, 255)
[68]:
# Count missing values per column once, print the grand total,
# then display the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
68620
[68]:
gauge_department      0
lat                   0
long                  0
elev_gauge_m          0
aridity               0
                   ...
n1?n5?-VCc          338
N1-VCc              332
n5n7-VCc            333
N2-VCc              317
N2Q1-VCc            330
Length: 255, dtype: int64

find those columns which have at least one NaN value

[69]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[69]:
urban_perc alfisols_perc andisols_perc aridisols_perc water_bodies_perc entisols_perc spodosols_perc coal_mine_pit_perc histosols_perc inceptisols_perc ... J-VCc K1-VCm b5k4-VCm E1E2-VCm e6e7-VCm n1?n5?-VCc N1-VCc n5n7-VCc N2-VCc N2Q1-VCc
26247030 0.9 1.8 43.8 NaN 0.6 7.2 NaN NaN 0.4 3.8 ... NaN NaN 4.815502 0.209814 NaN 2.313818 NaN 3.771071 0.005785 NaN
36027050 0.1 NaN NaN NaN 1.7 7.5 NaN NaN NaN 79.8 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
35097080 NaN NaN 61.2 NaN 0.1 4.4 NaN NaN NaN 34.3 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
16047010 NaN NaN 37.6 NaN 0.1 21.6 NaN NaN NaN 26.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
32157060 NaN NaN 0.1 NaN 1.2 23.5 NaN NaN 0.4 34.3 ... NaN NaN NaN NaN NaN NaN 34.020233 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52037010 0.3 NaN 38.7 NaN NaN 14.7 NaN NaN NaN 46.3 ... NaN NaN 26.072141 NaN NaN NaN NaN NaN NaN NaN
13047040 NaN 6.8 8.7 NaN 1.6 29.5 NaN NaN NaN 53.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
24027050 NaN 1.0 37.4 NaN 0.3 12.0 NaN NaN NaN 48.3 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
21047010 0.3 4.5 33.7 NaN 0.3 13.8 NaN NaN NaN 42.0 ... 20.530669 NaN NaN NaN NaN NaN NaN NaN 15.412016 3.13022
35027190 0.1 2.4 22.8 NaN 0.7 2.1 NaN NaN NaN 71.7 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

347 rows × 222 columns

[70]:
# Number of missing values, restricted to the columns that have any.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[70]:
urban_perc           152
alfisols_perc        221
andisols_perc         56
aridisols_perc       342
water_bodies_perc     85
                    ...
n1?n5?-VCc           338
N1-VCc               332
n5n7-VCc             333
N2-VCc               317
N2Q1-VCc             330
Length: 222, dtype: int64
[71]:
# Print the names of all dynamic features in sorted order, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(dynamic_features), width=100)
print(wrapped_names)
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs

Print the total number of NaNs for each dynamic feature of CAMELS-COL.

[72]:
# Fetch every dynamic feature for all stations. Only the second element of the
# returned pair is kept; the first one is not needed here.
all_dynamic_features = dataset.dynamic_features
_, dyn_ds = dataset.fetch("all", dynamic_features=all_dynamic_features)

dyn_ds
[72]:
<xarray.Dataset> Size: 256MB
Dimensions:           (time: 15340, dynamic_features: 6)
Coordinates:
  * time              (time) datetime64[ns] 123kB 1981-01-01 ... 2022-12-31
  * dynamic_features  (dynamic_features) <U14 336B 'pcp_mm' ... 'q_cms_obs'
Data variables: (12/347)
    26247030          (time, dynamic_features) float64 736kB ...
    36027050          (time, dynamic_features) float64 736kB ...
    35097080          (time, dynamic_features) float64 736kB ...
    16047010          (time, dynamic_features) float64 736kB ...
    32157060          (time, dynamic_features) float64 736kB ...
    35067040          (time, dynamic_features) float64 736kB ...
    ...                ...
    24037580          (time, dynamic_features) float64 736kB ...
    52037010          (time, dynamic_features) float64 736kB ...
    13047040          (time, dynamic_features) float64 736kB ...
    24027050          (time, dynamic_features) float64 736kB ...
    21047010          (time, dynamic_features) float64 736kB ...
    35027190          (time, dynamic_features) float64 736kB ...
[73]:
# ``to_array()`` stacks the per-station variables along a new "variable"
# dimension. Summing over "time" and "variable" therefore leaves exactly one
# NaN count per dynamic feature. (Summing over "dynamic_features" instead would
# leave one count per *station*, which zip() would then silently pair with
# feature names.)
nan_counts = dyn_ds.to_array().isnull().sum(dim=["time", "variable"])

# Take the labels from the result's own coordinate so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist()
    ):

    print(feat, nans)
pcp_mm 8214
pet_mm 32604
airtemp_C_max 18888
airtemp_C_min 240
airtemp_C_mean 12612
q_cms_obs 67134

CAMELS_CL

[74]:
# Open CAMELS_CL; its files are looked up under the 'CAMELS' folder of DATA_PATH.
camels_root = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CL', path=camels_root, verbosity=0)
# The printed summary reports the number of stations and of dynamic/static features.
print(dataset)
CAMELS_CL with 516 stations, 12 dynamic and 104 static features
[75]:
# Print the names of all static features in sorted order, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(static_features), width=100)
print(wrapped_names)
Q5, Q95, area_km2, aridity_chirps, aridity_cr2met, aridity_mswep, aridity_tmpa, baseflow_index,
big_dam, carb_rocks_frac, crop_frac, dom_land_cover, dom_land_cover_frac, elev_gauge, elev_max,
elev_mean, elev_med, elev_min, forest_frac, fp_frac, fp_nf_index, frac_snow_chirps,
frac_snow_cr2met, frac_snow_mswep, frac_snow_tmpa, gauge_name, geol_class_1st, geol_class_1st_frac,
geol_class_2nd, geol_class_2nd_frac, grass_frac, gw_rights_flow, gw_rights_n, hfd_mean,
high_prec_dur_chirps, high_prec_dur_cr2met, high_prec_dur_mswep, high_prec_dur_tmpa,
high_prec_freq_chirps, high_prec_freq_cr2met, high_prec_freq_mswep, high_prec_freq_tmpa,
high_prec_timing_chirps, high_prec_timing_cr2met, high_prec_timing_mswep, high_prec_timing_tmpa,
high_q_dur, high_q_freq, imp_frac, interv_degree, land_cover_missing, lat, lc_barren, lc_glacier,
location_type, long, low_prec_dur_chirps, low_prec_dur_cr2met, low_prec_dur_mswep,
low_prec_dur_tmpa, low_prec_freq_chirps, low_prec_freq_cr2met, low_prec_freq_mswep,
low_prec_freq_tmpa, low_prec_timing_chirps, low_prec_timing_cr2met, low_prec_timing_mswep,
low_prec_timing_tmpa, low_q_dur, low_q_freq, n_obs, nested_inner, nested_outer, nf_frac,
p_mean_chirps, p_mean_cr2met, p_mean_mswep, p_mean_spread, p_mean_tmpa, p_seasonality_chirps,
p_seasonality_cr2met, p_seasonality_mswep, p_seasonality_tmpa, pet_mean, q_mean, record_period_end,
record_period_start, runoff_ratio_chirps, runoff_ratio_cr2met, runoff_ratio_mswep,
runoff_ratio_tmpa, shrub_frac, slope_fdc, slope_mkm-1, snow_frac, stream_elas_chirps,
stream_elas_cr2met, stream_elas_mswep, stream_elas_tmpa, sur_rights_flow, sur_rights_n, swe_ratio,
wet_frac, zero_q_freq
[76]:
# Load the static features of every station into a single table.
df = dataset.fetch_static_features()
# Rows are stations and columns are static features.
n_stations, n_static_features = df.shape
print((n_stations, n_static_features))
(516, 104)
[77]:
# Count missing values per column once, print the grand total,
# then display the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
12185
[77]:
gauge_id
Q5                 278
Q95                278
area_km2             0
aridity_chirps      43
aridity_cr2met       0
                  ...
sur_rights_flow      0
sur_rights_n         0
swe_ratio          397
wet_frac             0
zero_q_freq        278
Length: 104, dtype: int64

find those columns which have at least one NaN value

[78]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[78]:
gauge_id Q5 Q95 aridity_chirps aridity_tmpa baseflow_index frac_snow_chirps frac_snow_tmpa geol_class_2nd hfd_mean high_prec_dur_chirps ... runoff_ratio_cr2met runoff_ratio_mswep runoff_ratio_tmpa slope_fdc stream_elas_chirps stream_elas_cr2met stream_elas_mswep stream_elas_tmpa swe_ratio zero_q_freq
3820003 0.01720068787 0.186048257 24.1396815 NaN 0.8442551 0.0039066811482 NaN Acid volcanic rocks 171.2500 1.330218 ... 0.374945095 0.239262023 NaN 1.0346765 3.102340653 0.726999547 0.66141617 NaN 0.946533851825 0.0000000000
12660001 NaN NaN NaN NaN NaN NaN NaN Unconsolidated sediments NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12284006 0.11884663804 2.578205293 NaN NaN 0.5687716 NaN NaN Mixed sedimentary rocks 210.0000 NaN ... 0.720861996 0.675319119 NaN 2.7683250 NaN 0.930985370 1.25019564 NaN NaN 0.0000000000
2110002 0.00737807572 0.014444286 11.4299221 NaN 0.9021013 0.0021539836182 NaN Siliciclastic sedimentary rocks 190.0625 1.861472 ... 0.071789082 0.063901111 NaN 0.4707589 0.454089183 0.138673001 0.29473920 NaN NaN 0.0000000000
5406002 NaN NaN 1.6429815 NaN NaN 0.3975444172892 NaN Acid volcanic rocks NaN 1.405714 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9437002 2.27004961549 18.450676257 0.4774513 NaN 0.7641388 0.0024017022788 NaN Basic volcanic rocks 144.5263 1.317935 ... 1.214302836 1.057415272 NaN 2.3324960 1.171523318 1.187299182 0.99004072 NaN NaN 0.0001448016
1044001 NaN NaN 3.9739998 NaN NaN 0.0329222988148 NaN Siliciclastic sedimentary rocks NaN 1.587209 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5410005 0.01352921264 2.377129689 2.1608262 NaN 0.6605756 0.1797926057066 NaN Pyroclastics 226.8824 1.407080 ... 0.469287733 0.229201243 NaN 3.8793813 2.613255185 1.673265007 1.45521068 NaN 0.721177721334 0.0000000000
7104001 NaN NaN 1.0899089 NaN NaN 0.0058230888819 NaN Acid plutonic rocks NaN 1.552381 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7341001 0.00920686838 3.506830403 1.2665541 NaN 0.3915067 0.0000000000000 NaN Unconsolidated sediments 103.3529 1.429412 ... 0.420797549 0.328001261 NaN 4.9899950 1.843974630 1.456028242 1.62750248 NaN NaN 0.0000000000

516 rows × 42 columns

[79]:
# Number of missing values, restricted to the columns that have any.
nan_per_column = df.isna().sum()
nan_per_column[nan_per_column > 0]
[79]:
gauge_id
Q5                         278
Q95                        278
aridity_chirps              43
aridity_tmpa               516
baseflow_index             278
frac_snow_chirps            43
frac_snow_tmpa             516
geol_class_2nd              16
hfd_mean                   278
high_prec_dur_chirps        43
high_prec_dur_tmpa         516
high_prec_freq_chirps       43
high_prec_freq_tmpa        516
high_prec_timing_chirps     43
high_prec_timing_tmpa      516
high_q_dur                 278
high_q_freq                278
location_type              386
low_prec_dur_chirps         43
low_prec_dur_tmpa          516
low_prec_freq_chirps        43
low_prec_freq_tmpa         516
low_prec_timing_chirps      43
low_prec_timing_tmpa       516
low_q_dur                  278
low_q_freq                 278
p_mean_chirps               43
p_mean_tmpa                516
p_seasonality_chirps        43
p_seasonality_tmpa         516
q_mean                     278
runoff_ratio_chirps        297
runoff_ratio_cr2met        278
runoff_ratio_mswep         278
runoff_ratio_tmpa          516
slope_fdc                  278
stream_elas_chirps         297
stream_elas_cr2met         278
stream_elas_mswep          278
stream_elas_tmpa           516
swe_ratio                  397
zero_q_freq                278
dtype: int64
[80]:
# Print the names of all dynamic features in sorted order, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the list handed back by the dataset
wrapped_names = textwrap.fill(", ".join(dynamic_features), width=100)
print(wrapped_names)
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cr2met, pcp_mm_mswep,
pcp_mm_tmpa, pet_mm_hargreaves, pet_mm_modis, q_cms_obs, q_mmd_obs, swe

Print the total number of NaNs for each dynamic feature of CAMELS-CL.

[81]:
# Fetch every dynamic feature for all stations. Only the second element of the
# returned pair is kept; the first one is not needed here.
all_dynamic_features = dataset.dynamic_features
_, dyn_ds = dataset.fetch("all", dynamic_features=all_dynamic_features)

dyn_ds
[81]:
<xarray.Dataset> Size: 2GB
Dimensions:           (time: 38374, dynamic_features: 12)
Coordinates:
  * time              (time) datetime64[ns] 307kB 1913-02-15 ... 2018-03-09
  * dynamic_features  (dynamic_features) <U17 816B 'q_cms_obs' ... 'swe'
Data variables: (12/516)
    3820003           (time, dynamic_features) float64 4MB ...
    12660001          (time, dynamic_features) float64 4MB ...
    12284006          (time, dynamic_features) float64 4MB ...
    2110002           (time, dynamic_features) float64 4MB ...
    5406002           (time, dynamic_features) float64 4MB ...
    4711001           (time, dynamic_features) float64 4MB ...
    ...                ...
    9107002           (time, dynamic_features) float64 4MB ...
    9437002           (time, dynamic_features) float64 4MB ...
    1044001           (time, dynamic_features) float64 4MB ...
    5410005           (time, dynamic_features) float64 4MB ...
    7104001           (time, dynamic_features) float64 4MB ...
    7341001           (time, dynamic_features) float64 4MB ...
[82]:
# Count missing values per dynamic feature across all stations and time steps.
# Dataset.to_array() stacks the per-station variables along a new dimension
# (named "station" here). Reducing over that dimension and "time" leaves one
# count per entry of "dynamic_features". Reducing over "dynamic_features"
# instead would leave one count per station, which does not line up with the
# feature names.
nans_per_feature = (
    dyn_ds.to_array(dim="station")
    .isnull()
    .sum(dim=["station", "time"])
)

for feat, nans in zip(
    nans_per_feature["dynamic_features"].data.tolist(),
    nans_per_feature.data.tolist(),
):
    print(feat, nans)
q_cms_obs 307366
q_mmd_obs 369495
pcp_mm_cr2met 350819
pcp_mm_chirps 329458
pcp_mm_mswep 329388
pcp_mm_tmpa 304126
airtemp_C_min 309816
airtemp_C_max 307350
airtemp_C_mean 346392
pet_mm_modis 349518
pet_mm_hargreaves 336330
swe 336472

CAMELS_DK

[83]:
# CAMELS_DK lives in the 'CAMELS' sub-folder of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DK', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_DK with 304 stations, 13 dynamic and 119 static features
[84]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
FC, HCC, KS, MRC, THS, WP, area_km2, aridity, bulk_density, catch_accum_number, catch_flow_dir,
chalk_d, dem_max, dem_mean, dem_median, dem_min, frac_snow_daily, gauge_record_pct, gauged_type,
high_prec_dur, high_prec_freq, high_prec_timing, lat, long, low_prec_dur, low_prec_freq,
low_prec_timing, p_mean, p_seasonality, pct_aeolain_sand, pct_agriculture_corine_1990,
pct_agriculture_corine_2000, pct_agriculture_corine_2006, pct_agriculture_corine_2012,
pct_agriculture_corine_2018, pct_agriculture_levin_2011, pct_agriculture_levin_2016,
pct_agriculture_levin_2018, pct_agriculture_levin_2021, pct_beach, pct_clay, pct_claynor_100,
pct_claynor_200, pct_claynor_30, pct_claynor_60, pct_down_sand, pct_flat_area,
pct_forest_corine_1990, pct_forest_corine_2000, pct_forest_corine_2006, pct_forest_corine_2012,
pct_forest_corine_2018, pct_forest_levin_2011, pct_forest_levin_2016, pct_forest_levin_2018,
pct_forest_levin_2021, pct_fsandno_100, pct_fsandno_200, pct_fsandno_30, pct_fsandno_60,
pct_glaf_sand, pct_glal_clay, pct_glam_clay, pct_gravel, pct_gsandno_100, pct_gsandno_200,
pct_gsandno_30, pct_gsandno_60, pct_marine_sand, pct_marsh, pct_naturedry_levin_2011,
pct_naturedry_levin_2016, pct_naturedry_levin_2018, pct_naturedry_levin_2021,
pct_naturewet_levin_2011, pct_naturewet_levin_2016, pct_naturewet_levin_2018,
pct_naturewet_levin_2021, pct_organic, pct_sand, pct_sandy_till, pct_silt, pct_till,
pct_urban_corine_1990, pct_urban_corine_2000, pct_urban_corine_2006, pct_urban_corine_2012,
pct_urban_corine_2018, pct_urban_levin_2011, pct_urban_levin_2016, pct_urban_levin_2018,
pct_urban_levin_2021, pct_water_corine_1990, pct_water_corine_2000, pct_water_corine_2006,
pct_water_corine_2012, pct_water_corine_2018, pct_water_deposit, pct_water_levin_2011,
pct_water_levin_2016, pct_water_levin_2018, pct_water_levin_2021, pct_wetlands_corine_1990,
pct_wetlands_corine_2000, pct_wetlands_corine_2006, pct_wetlands_corine_2012,
pct_wetlands_corine_2018, pet_mean, root_depth, slope_max, slope_median, slope_min, slope_mkm-1,
t_mean, tawc, uaquifer_d, uaquifer_t, uclay_t, usand_t
[85]:
# Table of static features for this dataset; print its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(304, 119)
[86]:
# NaN count per column, computed once: print the grand total, then display
# the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
23
[86]:
FC            0
HCC           0
KS            0
MRC           0
THS           0
             ..
tawc          0
uaquifer_d    3
uaquifer_t    3
uclay_t       3
usand_t       3
Length: 119, dtype: int64

Find the columns that have at least one NaN value.

[87]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[87]:
chalk_d gauge_record_pct uaquifer_d uaquifer_t uclay_t usand_t
16200607 348.941440 100.000000 6.166402 16.930742 5.468584 5.671323
37470466 451.491863 100.000000 0.630618 46.024319 0.550030 31.309439
67221267 82.682739 100.000000 30.316439 10.650221 28.674266 0.580311
35321353 425.690145 100.000000 7.186167 39.533252 6.985663 0.198207
53411137 287.510620 100.000000 15.445959 8.634338 15.021151 0.210060
... ... ... ... ... ... ...
32211121 68.721796 54.840134 7.125480 8.509604 6.253781 1.478004
42320708 155.292211 100.000000 24.663791 9.827164 21.955691 1.744912
71270476 10.931980 100.000000 9.366122 45.329497 9.176942 0.364699
32240800 19.045147 100.000000 9.834996 28.712421 9.284840 0.198684
42600042 124.778317 90.673026 12.625913 17.442115 12.289986 2.165011

304 rows × 6 columns

[88]:
# NaN count for every column that has at least one missing value.
nan_counts = df.isna().sum()
nan_counts[nan_counts > 0]
[88]:
chalk_d             3
gauge_record_pct    8
uaquifer_d          3
uaquifer_t          3
uclay_t             3
usand_t             3
dtype: int64
[89]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
Abstraction, DKM_dtp, DKM_gwh, DKM_irr, DKM_sdr, DKM_sre, DKM_wcr, Qdkm, aet_mm, airtemp_C_mean,
pcp_mm, pet_mm, q_cms_obs

Print the total number of NaNs for each dynamic feature.

[90]:
# Fetch with the "all" selector and the complete list of dynamic features.
# Only the second element of the returned pair is used below.
fetch_kwargs = {"dynamic_features": dataset.dynamic_features}
_, dyn_ds = dataset.fetch("all", **fetch_kwargs)
dyn_ds
[90]:
<xarray.Dataset> Size: 202MB
Dimensions:           (time: 12782, dynamic_features: 13)
Coordinates:
  * time              (time) datetime64[ns] 102kB 1989-01-02 ... 2023-12-31
  * dynamic_features  (dynamic_features) <U14 728B 'Abstraction' ... 'q_cms_obs'
Data variables: (12/304)
    16200607          (time, dynamic_features) float32 665kB ...
    37470466          (time, dynamic_features) float32 665kB ...
    67221267          (time, dynamic_features) float32 665kB ...
    35321353          (time, dynamic_features) float32 665kB ...
    53411137          (time, dynamic_features) float32 665kB ...
    62231076          (time, dynamic_features) float32 665kB ...
    ...                ...
    71288759          (time, dynamic_features) float32 665kB ...
    32211121          (time, dynamic_features) float32 665kB ...
    42320708          (time, dynamic_features) float32 665kB ...
    71270476          (time, dynamic_features) float32 665kB ...
    32240800          (time, dynamic_features) float32 665kB ...
    42600042          (time, dynamic_features) float32 665kB ...
[91]:
# Count missing values per dynamic feature across all stations and time steps.
# Dataset.to_array() stacks the per-station variables along a new dimension
# (named "station" here). Reducing over that dimension and "time" leaves one
# count per entry of "dynamic_features". Reducing over "dynamic_features"
# instead would leave one count per station, which does not line up with the
# feature names.
nans_per_feature = (
    dyn_ds.to_array(dim="station")
    .isnull()
    .sum(dim=["station", "time"])
)

for feat, nans in zip(
    nans_per_feature["dynamic_features"].data.tolist(),
    nans_per_feature.data.tolist(),
):
    print(feat, nans)
Abstraction 4748
DKM_dtp 4755
DKM_gwh 29582
DKM_irr 4755
DKM_sdr 4748
DKM_sre 38767
DKM_wcr 10598
Qdkm 5964
aet_mm 29582
airtemp_C_mean 4755
pcp_mm 59530
pet_mm 29582
q_cms_obs 29582

CAMELS_CH

[92]:
# CAMELS_CH lives in the 'CAMELS' sub-folder of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CH', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_CH with 331 stations, 9 dynamic and 209 static features
[93]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, aap, acid_plutonic, acid_volcanic, amk, api, area_km2, aridity, baseflow_index_landson,
basic_plutonic, basic_volcanic, bulk_dens, bulk_dens_25, bulk_dens_5, bulk_dens_50, bulk_dens_75,
bulk_dens_90, bulk_dens_missing, bulk_dens_skewness, carbonate_sedimentary, clay_perc, clay_perc_25,
clay_perc_5, clay_perc_50, clay_perc_75, clay_perc_90, clay_perc_missing, clay_perc_skewness,
coarse_fragm_perc, coarse_fragm_perc_25, coarse_fragm_perc_5, coarse_fragm_perc_50,
coarse_fragm_perc_75, coarse_fragm_perc_90, coarse_fragm_perc_missing, coarse_fragm_perc_skewness,
conductivity, conductivity_25, conductivity_5, conductivity_50, conductivity_75, conductivity_90,
conductivity_missing, conductivity_skewness, country, crop_perc, dens_inhabitants, dom_land_cover,
dup, dwood_perc, elev_max, elev_mean, elev_min, elev_percentile10, elev_percentile25,
elev_percentile50, elev_percentile75, elev_percentile90, ewood_perc, ext_area_perc, fju,
flat_area_perc, frac_snow, gauge_easting, gauge_elevation, gauge_name, gauge_northing,
geo_log10_permeability, geo_porosity, glac_area, glac_area_neighbours, glac_mass, glac_vol,
grass_perc, hardrock_imperm_perc, hardrock_perc, hes, hfd_mean, high_prec_dur, high_prec_freq,
high_prec_timing, high_q_dur, high_q_freq, hp_count, hp_inst_turb, hp_max_power, hp_qturb, ice_geo,
ice_perc, id6, ind_end_date, ind_number_of_years, ind_start_date, intermediate_plutonic,
inwater_perc, karst_perc, lat, long, loose_rock_perc, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, metamorphics, mixed_sedimentary, mixed_wood_perc, mpk, mps, n_inhabitants,
null_perc, num_reservoir, omm, ood, oos, ops, organic_perc, organic_perc_25, organic_perc_5,
organic_perc_50, organic_perc_75, organic_perc_90, organic_perc_missing, organic_perc_skewness, osm,
p_mean, p_seasonality, pet_mean, porosity, porosity_25, porosity_5, porosity_50, porosity_75,
porosity_90, porosity_missing, porosity_skewness, pyroclastic, q_mean, qua, reservoir_cap,
reservoir_fs, reservoir_he, reservoir_irr, reservoir_nousedata, reservoir_year_first,
reservoir_year_last, rock_perc, root_depth, root_depth_25, root_depth_5, root_depth_50,
root_depth_75, root_depth_90, root_depth_missing, root_depth_skewness, runoff_ratio, sal, sand_perc,
sand_perc_25, sand_perc_5, sand_perc_50, sand_perc_75, sand_perc_90, sand_perc_missing,
sand_perc_skewness, scrub_perc, sign_end_date, sign_number_of_years, sign_start_date,
siliciclastic_sedimentary, silt_perc, silt_perc_25, silt_perc_5, silt_perc_50, silt_perc_75,
silt_perc_90, silt_perc_missing, silt_perc_skewness, slope_degrees, slope_fdc, steep_area_perc,
stream_elas, sus, tie, tot_avail_water, tot_avail_water_25, tot_avail_water_5, tot_avail_water_50,
tot_avail_water_75, tot_avail_water_90, tot_avail_water_missing, tot_avail_water_skewness, ukd,
unconsol_coarse_perc, unconsol_fine_perc, unconsol_imperm_perc, unconsol_medium_perc,
unconsol_sediments, uod, ups, urban_perc, usm, water_body_name, water_body_type, water_geo,
water_perc, wetlands_perc, zero_q_freq
[94]:
# Table of static features for this dataset; print its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(331, 209)
[95]:
# NaN count per column, computed once: print the grand total, then display
# the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2097
[95]:
ind_start_date         0
ind_end_date           0
ind_number_of_years    0
p_mean                 0
pet_mean               0
                      ..
elev_percentile90      0
elev_max               0
slope_degrees          0
flat_area_perc         0
steep_area_perc        0
Length: 209, dtype: int64

Find the columns that have at least one NaN value.

[96]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[96]:
p_seasonality frac_snow high_prec_timing low_prec_timing reservoir_he reservoir_fs reservoir_irr reservoir_nousedata reservoir_year_first reservoir_year_last ... baseflow_index_landson hfd_mean Q5 Q95 high_q_freq high_q_dur low_q_freq low_q_dur zero_q_freq silt_perc_skewness
gauge_id
2004 0.159 0.039 jja son NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN -0.252
2007 -0.118 0.170 djf son NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.635
2009 0.078 0.436 jja son 0.999 0.0 0.001 0.0 1914.0 1989.0 ... 0.787 243.282 1.279 6.207 0.000 0.000 0.051 2.000 0.0 0.285
2011 0.106 0.474 son son 0.998 0.0 0.002 0.0 1927.0 1989.0 ... 0.751 263.667 0.821 6.681 0.051 1.000 0.436 1.000 0.0 0.267
2014 0.279 0.223 jja son 1.000 0.0 0.000 0.0 1910.0 2015.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.421
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6007 0.228 0.379 son djf 1.000 0.0 0.000 0.0 2010.0 2010.0 ... 0.715 211.333 1.298 8.789 2.005 1.714 2.451 7.333 0.0 0.393
6008 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.602 188.875 0.632 10.751 4.385 2.593 29.315 8.069 0.0 -0.603
6009 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.318 191.714 0.127 15.376 26.897 2.667 155.228 12.056 0.0 0.310
6010 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.494 198.400 1.002 12.617 12.195 2.103 4.998 3.571 0.0 -0.744
6011 0.272 0.110 NaN djf 1.000 0.0 0.000 0.0 1918.0 2010.0 ... 0.697 204.250 1.165 10.371 0.000 0.000 0.000 0.000 0.0 0.272

331 rows × 26 columns

[97]:
# NaN count for every column that has at least one missing value.
nan_counts = df.isna().sum()
nan_counts[nan_counts > 0]
[97]:
p_seasonality              54
frac_snow                  54
high_prec_timing           13
low_prec_timing             5
reservoir_he              223
reservoir_fs              223
reservoir_irr             223
reservoir_nousedata       223
reservoir_year_first      223
reservoir_year_last       223
sign_start_date            42
sign_end_date              42
q_mean                     42
runoff_ratio               42
stream_elas                44
slope_fdc                  42
baseflow_index_landson     42
hfd_mean                   42
Q5                         42
Q95                        42
high_q_freq                42
high_q_dur                 42
low_q_freq                 42
low_q_dur                  42
zero_q_freq                42
silt_perc_skewness          1
dtype: int64
[98]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, q_cms_obs, q_mmd_obs, rel_sun_dur(%), swe_mm,
waterlevel(m)

Print the total number of NaNs for each dynamic feature.

[99]:
# Fetch with the "all" selector and the complete list of dynamic features.
# Only the second element of the returned pair is used below.
fetch_kwargs = {"dynamic_features": dataset.dynamic_features}
_, dyn_ds = dataset.fetch("all", **fetch_kwargs)
dyn_ds
[99]:
<xarray.Dataset> Size: 174MB
Dimensions:           (time: 14610, dynamic_features: 9)
Coordinates:
  * time              (time) datetime64[ns] 117kB 1981-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U14 504B 'airtemp_C_max' ... 'water...
Data variables: (12/331)
    2004              (time, dynamic_features) float32 526kB ...
    2007              (time, dynamic_features) float32 526kB ...
    2009              (time, dynamic_features) float32 526kB ...
    2011              (time, dynamic_features) float32 526kB ...
    2014              (time, dynamic_features) float32 526kB ...
    2016              (time, dynamic_features) float32 526kB ...
    ...                ...
    6006              (time, dynamic_features) float32 526kB ...
    6007              (time, dynamic_features) float32 526kB ...
    6008              (time, dynamic_features) float32 526kB ...
    6009              (time, dynamic_features) float32 526kB ...
    6010              (time, dynamic_features) float32 526kB ...
    6011              (time, dynamic_features) float32 526kB ...
[100]:
# Count missing values per dynamic feature across all stations and time steps.
# Dataset.to_array() stacks the per-station variables along a new dimension
# (named "station" here). Reducing over that dimension and "time" leaves one
# count per entry of "dynamic_features". Reducing over "dynamic_features"
# instead would leave one count per station, which does not line up with the
# feature names.
nans_per_feature = (
    dyn_ds.to_array(dim="station")
    .isnull()
    .sum(dim=["station", "time"])
)

for feat, nans in zip(
    nans_per_feature["dynamic_features"].data.tolist(),
    nans_per_feature.data.tolist(),
):
    print(feat, nans)
airtemp_C_max 35673
airtemp_C_mean 36038
airtemp_C_min 6453
pcp_mm 6453
q_cms_obs 45534
q_mmd_obs 6453
rel_sun_dur(%) 35673
swe_mm 6453
waterlevel(m) 6453

CAMELS_DE

[101]:
# CAMELS_DE lives in the 'CAMELS' sub-folder of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DE', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_DE with 1555 stations, 21 dynamic and 111 static features
[102]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
NSE_conceptual, NSE_lstm, Q5, Q95, agricultural_areas_perc, aquifer_aquitard_mixed_perc,
aquifer_perc, aquitard_perc, area_km2, area_metadata, artificial_surfaces_perc,
bulk_density_0_30cm_mean, bulk_density_100_200cm_mean, bulk_density_30_100cm_mean,
cavity_fissure_karst_perc, cavity_fissure_perc, cavity_fissure_pores_perc, cavity_pores_perc,
clay_0_30cm_mean, clay_100_200cm_mean, clay_30_100cm_mean, coarse_fragments_0_30cm_mean,
coarse_fragments_100_200cm_mean, coarse_fragments_30_100cm_mean, consolidation_solid_rock_perc,
consolidation_unconsolidated_rock_perc, dams_names, dams_num, dams_purposes, dams_river_names,
dams_total_lake_area, dams_total_lake_volume, dams_year_first, dams_year_last, elev_5, elev_50,
elev_95, elev_max, elev_mean, elev_min, federal_state, flow_perc_complete, flow_period_end,
flow_period_start, forests_and_seminatural_areas_perc, frac_snow, gauge_easting, gauge_elev,
gauge_elev_metadata, gauge_name, gauge_northing,
geochemical_rocktype_anthropogenically_modified_through_filling_perc,
geochemical_rocktype_carbonatic_perc, geochemical_rocktype_halitic_perc,
geochemical_rocktype_silicate_carbonatic_perc,
geochemical_rocktype_silicate_organic_components_perc, geochemical_rocktype_silicate_perc,
geochemical_rocktype_sulfatic_halitic_perc, geochemical_rocktype_sulfatic_perc, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, kf_extremely_low_perc,
kf_high_perc, kf_highly_variable_perc, kf_low_perc, kf_low_to_extremely_low_perc, kf_medium_perc,
kf_medium_to_moderate_perc, kf_moderate_perc, kf_moderate_to_low_perc, kf_very_high_perc,
kf_very_high_to_high_perc, kf_very_low_perc, lat, long, low_prec_dur, low_prec_freq,
low_prec_timing, low_q_dur, low_q_freq, no_data_perc, p_mean, p_seasonality, provider_id, q_mean,
rocktype_magmatite_perc, rocktype_metamorphite_perc, rocktype_sediment_perc, runoff_ratio,
sand_0_30cm_mean, sand_100_200cm_mean, sand_30_100cm_mean, silt_0_30cm_mean, silt_100_200cm_mean,
silt_30_100cm_mean, slope_, soil_organic_carbon_0_30cm_mean, soil_organic_carbon_100_200cm_mean,
soil_organic_carbon_30_100cm_mean, testing_perc_complete, training_perc_complete,
validation_perc_complete, water_bodies_perc, water_body_name, waterbody_perc, wetlands_perc,
zero_q_freq
[103]:
# Table of static features for this dataset; print its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(1555, 111)
[104]:
# NaN count per column, computed once: print the grand total, then display
# the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
6862
[104]:
p_mean            0
p_seasonality     0
frac_snow         0
high_prec_freq    0
high_prec_dur     0
                 ..
elev_min          0
elev_5            0
elev_50           0
elev_95           0
elev_max          0
Length: 111, dtype: int64

Find the columns that have at least one NaN value.

[105]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[105]:
high_prec_timing low_prec_timing dams_names dams_river_names dams_year_first dams_year_last dams_total_lake_area dams_total_lake_volume dams_purposes NSE_lstm NSE_conceptual gauge_elev_metadata
gauge_id
DEA11180 jja mam Aabachtalsperre|Borchen Hochwasserrückhaltebec... Aabookach (Afte)|Afte bzw. Wiele|Altenau (Alme... 1930.0 1996.0 10.28 71.90 Water supply|Recreational use|Flood control 0.929 0.854 30.86
DEE10940 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.094 0.140 69.77
DE911160 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.841 0.844 NaN
DE212640 jja son NaN NaN NaN NaN 0.00 0.00 NaN 0.727 0.620 326.68
DE112130 mam mam NaN NaN NaN NaN 0.00 0.00 NaN 0.688 0.605 105.48
... ... ... ... ... ... ... ... ... ... ... ... ...
DEF13210 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.917 NaN NaN
DEF10460 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.574 0.355 NaN
DE912320 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.770 0.589 NaN
DEA11090 jja mam Aabachtalsperre|Borchen Hochwasserrückhaltebec... Afte bzw. Wiele|Aabookach (Afte)|Altenau (Alme... 1974.0 1996.0 5.37 39.41 Water supply|Flood control 0.928 0.884 63.64
DE213310 jja son NaN NaN NaN NaN 0.00 0.00 NaN 0.637 0.512 542.11

1555 rows × 12 columns

[106]:
# NaN count for every column that has at least one missing value.
nan_counts = df.isna().sum()
nan_counts[nan_counts > 0]
[106]:
high_prec_timing             8
low_prec_timing              3
dams_names                1240
dams_river_names          1240
dams_year_first           1251
dams_year_last            1251
dams_total_lake_area        41
dams_total_lake_volume       2
dams_purposes             1241
NSE_lstm                    43
NSE_conceptual             157
gauge_elev_metadata        385
dtype: int64
[107]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_max, pcp_mm_mean, pcp_mm_median, pcp_mm_min,
pcp_mm_std, q_cms_obs, q_mmd_obs, rh_%, rh_%_max, rh_%_med, rh_%_min, rh_%_std, solrad_wm2_max,
solrad_wm2_mean, solrad_wm2_med, solrad_wm2_min, solrad_wm2_std, water_level

Print the total number of NaNs for each dynamic feature.

[108]:
# Fetch with the "all" selector and the complete list of dynamic features.
# Only the second element of the returned pair is used below.
fetch_kwargs = {"dynamic_features": dataset.dynamic_features}
_, dyn_ds = dataset.fetch("all", **fetch_kwargs)
dyn_ds
[108]:
<xarray.Dataset> Size: 7GB
Dimensions:           (time: 25568, dynamic_features: 21)
Coordinates:
  * time              (time) datetime64[ns] 205kB 1951-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U15 1kB 'q_cms_obs' ... 'airtemp_C_...
Data variables: (12/1555)
    DEA11180          (time, dynamic_features) float64 4MB ...
    DEE10940          (time, dynamic_features) float64 4MB ...
    DE911160          (time, dynamic_features) float64 4MB ...
    DE212640          (time, dynamic_features) float64 4MB ...
    DE112130          (time, dynamic_features) float64 4MB ...
    DE212760          (time, dynamic_features) float64 4MB ...
    ...                ...
    DE212300          (time, dynamic_features) float64 4MB ...
    DEF13210          (time, dynamic_features) float64 4MB ...
    DEF10460          (time, dynamic_features) float64 4MB ...
    DE912320          (time, dynamic_features) float64 4MB ...
    DEA11090          (time, dynamic_features) float64 4MB ...
    DE213310          (time, dynamic_features) float64 4MB ...
[109]:
# Count missing values per dynamic feature across all stations and time steps.
# Dataset.to_array() stacks the per-station variables along a new dimension
# (named "station" here). Reducing over that dimension and "time" leaves one
# count per entry of "dynamic_features". Reducing over "dynamic_features"
# instead would leave one count per station, which does not line up with the
# feature names.
nans_per_feature = (
    dyn_ds.to_array(dim="station")
    .isnull()
    .sum(dim=["station", "time"])
)

for feat, nans in zip(
    nans_per_feature["dynamic_features"].data.tolist(),
    nans_per_feature.data.tolist(),
):
    print(feat, nans)
q_cms_obs 3
q_mmd_obs 42186
water_level 25020
pcp_mm_mean 14793
pcp_mm_min 61664
pcp_mm_median 2861
pcp_mm_max 29501
pcp_mm_std 5959
rh_% 33327
rh_%_min 63155
rh_%_med 35334
rh_%_max 20733
rh_%_std 42187
solrad_wm2_mean 15888
solrad_wm2_min 186
solrad_wm2_med 18662
solrad_wm2_max 31058
solrad_wm2_std 46024
airtemp_C_mean 55727
airtemp_C_min 56635
airtemp_C_max 18180

CAMELS_FI

[110]:
# CAMELS_FI is located directly under DATA_PATH (no sub-folder).
dataset = RainfallRunoff(
    'CAMELS_FI',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CAMELS_FI with 320 stations, 16 dynamic and 106 static features
[111]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area_km2, aridity, bares_perc_2000, bares_perc_2006, bares_perc_2012, bares_perc_2018,
baseflow_index_ladson, baseflow_index_lfstat, basin_id, basin_name, bedrock_perc, clay_perc,
coarse_perc, crop_frac_2000, crop_frac_2006, crop_frac_2012, crop_frac_2018, cross_border_perc,
dwood_perc_2000, dwood_perc_2006, dwood_perc_2012, dwood_perc_2018, elev_10, elev_90,
elev_catch_max_m, elev_gauge_m, elev_max, elev_mean, elev_min, elev_range, ewood_perc_2000,
ewood_perc_2006, ewood_perc_2012, ewood_perc_2018, frac_snow, gauge_easting, gauge_name,
gauge_northing, grass_frac_2000, grass_frac_2006, grass_frac_2012, grass_frac_2018, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, ice_correction,
inwater_perc_2000, inwater_perc_2006, inwater_perc_2012, inwater_perc_2018, lat, long, low_prec_dur,
low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, nestedness, num_dam, num_inhabitants,
num_regulation_other, num_reservoir, owner_id, owner_name, p_mean, p_seasonality, peat_perc,
pet_mean, pop_density_km2, q_mean, reference_gauge, regulation_level, reservoir_cap, runoff_ratio,
shrub_perc_2000, shrub_perc_2006, shrub_perc_2012, shrub_perc_2018, sign_end_date,
sign_number_of_obs, sign_number_of_years, sign_start_date, silt_perc, slope_fdc, slope_percent,
soil_depth_m, stream_elas, temperature_mean, till_perc, timeseries_number_of_years, urban_frac_2000,
urban_frac_2006, urban_frac_2012, urban_frac_2018, water_region_code, water_region_name,
wetland_perc_2000, wetland_perc_2006, wetland_perc_2012, wetland_perc_2018, zero_q_freq
[112]:
# Table of static features for this dataset; print its (rows, columns) shape.
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(320, 106)
[113]:
# NaN count per column, computed once: print the grand total, then display
# the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
22
[113]:
timeseries_number_of_years    0
sign_start_date               0
sign_end_date                 0
sign_number_of_years          0
sign_number_of_obs            0
                             ..
num_dam                       0
num_reservoir                 0
reservoir_cap                 0
num_regulation_other          0
regulation_level              0
Length: 106, dtype: int64

Find the columns that have at least one NaN value.

[114]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[114]:
slope_fdc baseflow_index_ladson baseflow_index_lfstat high_prec_timing low_prec_timing
gauge_id
1116 4.12 0.35 0.31 jja mam
1427 1.03 0.91 0.99 jja mam
1125 2.32 0.80 0.90 jja mam
1351 NaN 0.64 0.66 jja mam
943 2.85 0.47 0.18 jja mam
... ... ... ... ... ...
931 2.31 0.61 0.61 NaN mam
1303 2.08 0.66 0.48 jja mam
3907 4.25 0.51 0.46 jja mam
929 1.32 0.82 0.97 jja mam
1101 3.45 0.72 0.80 jja mam

320 rows × 5 columns

[115]:
# NaN count for every column that has at least one missing value.
nan_counts = df.isna().sum()
nan_counts[nan_counts > 0]
[115]:
slope_fdc                4
baseflow_index_ladson    2
baseflow_index_lfstat    2
high_prec_timing         5
low_prec_timing          9
dtype: int64
[116]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pe_era5_land, pet_fmi, pet_mm, q_cms_obs,
q_mmd_obs, radiation_global, rh_%, snow_evaporation, snowdepth_m, swe_mm_cci3-1, swe_mm_era5,
temperature_gmin

Print the total number of NaNs for each dynamic feature.

[117]:
# Fetch with the "all" selector and the complete list of dynamic features.
# Only the second element of the returned pair is used below.
fetch_kwargs = {"dynamic_features": dataset.dynamic_features}
_, dyn_ds = dataset.fetch("all", **fetch_kwargs)
dyn_ds
[117]:
<xarray.Dataset> Size: 943MB
Dimensions:           (time: 23010, dynamic_features: 16)
Coordinates:
  * time              (time) datetime64[ns] 184kB 1961-01-01 ... 2023-12-31
  * dynamic_features  (dynamic_features) <U16 1kB 'q_cms_obs' ... 'radiation_...
Data variables: (12/320)
    1116              (time, dynamic_features) float64 3MB ...
    1427              (time, dynamic_features) float64 3MB ...
    1125              (time, dynamic_features) float64 3MB ...
    1351              (time, dynamic_features) float64 3MB ...
    943               (time, dynamic_features) float64 3MB ...
    1379              (time, dynamic_features) float64 3MB ...
    ...                ...
    1200              (time, dynamic_features) float64 3MB ...
    931               (time, dynamic_features) float64 3MB ...
    1303              (time, dynamic_features) float64 3MB ...
    3907              (time, dynamic_features) float64 3MB ...
    929               (time, dynamic_features) float64 3MB ...
    1101              (time, dynamic_features) float64 3MB ...
[118]:
# Count missing values per dynamic feature across all stations and time steps.
# Dataset.to_array() stacks the per-station variables along a new dimension
# (named "station" here). Reducing over that dimension and "time" leaves one
# count per entry of "dynamic_features". Reducing over "dynamic_features"
# instead would leave one count per station, which does not line up with the
# feature names.
nans_per_feature = (
    dyn_ds.to_array(dim="station")
    .isnull()
    .sum(dim=["station", "time"])
)

for feat, nans in zip(
    nans_per_feature["dynamic_features"].data.tolist(),
    nans_per_feature.data.tolist(),
):
    print(feat, nans)
q_cms_obs 36296
q_mmd_obs 43432
pcp_mm 39244
pet_mm 43494
pe_era5_land 57866
pet_fmi 38644
snow_evaporation 44078
swe_mm_era5 35950
swe_mm_cci3-1 43256
snowdepth_m 36222
temperature_gmin 65724
airtemp_C_min 40026
airtemp_C_mean 64076
airtemp_C_max 46940
rh_% 59680
radiation_global 74102

CAMELS_FR

[119]:
# CAMELS_FR is located directly under DATA_PATH (no sub-folder).
dataset = RainfallRunoff(
    'CAMELS_FR',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CAMELS_FR with 654 stations, 22 dynamic and 344 static features
[120]:
# sorted() builds a new list, so the list held by `dataset` keeps its own
# order (list.sort() would reorder it in place if the property returns the
# dataset's internal list, silently affecting later cells).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_km2, clc_1990_lvl1_1, clc_1990_lvl1_2, clc_1990_lvl1_3, clc_1990_lvl1_4, clc_1990_lvl1_5,
clc_1990_lvl1_dom_class, clc_1990_lvl1_na, clc_1990_lvl2_11, clc_1990_lvl2_12, clc_1990_lvl2_13,
clc_1990_lvl2_14, clc_1990_lvl2_21, clc_1990_lvl2_22, clc_1990_lvl2_23, clc_1990_lvl2_24,
clc_1990_lvl2_31, clc_1990_lvl2_32, clc_1990_lvl2_33, clc_1990_lvl2_41, clc_1990_lvl2_42,
clc_1990_lvl2_51, clc_1990_lvl2_52, clc_1990_lvl2_dom_class, clc_1990_lvl2_na, clc_1990_lvl3_111,
clc_1990_lvl3_112, clc_1990_lvl3_121, clc_1990_lvl3_122, clc_1990_lvl3_123, clc_1990_lvl3_124,
clc_1990_lvl3_131, clc_1990_lvl3_132, clc_1990_lvl3_133, clc_1990_lvl3_141, clc_1990_lvl3_142,
clc_1990_lvl3_211, clc_1990_lvl3_212, clc_1990_lvl3_213, clc_1990_lvl3_221, clc_1990_lvl3_222,
clc_1990_lvl3_223, clc_1990_lvl3_231, clc_1990_lvl3_241, clc_1990_lvl3_242, clc_1990_lvl3_243,
clc_1990_lvl3_244, clc_1990_lvl3_311, clc_1990_lvl3_312, clc_1990_lvl3_313, clc_1990_lvl3_321,
clc_1990_lvl3_322, clc_1990_lvl3_323, clc_1990_lvl3_324, clc_1990_lvl3_331, clc_1990_lvl3_332,
clc_1990_lvl3_333, clc_1990_lvl3_334, clc_1990_lvl3_335, clc_1990_lvl3_411, clc_1990_lvl3_412,
clc_1990_lvl3_421, clc_1990_lvl3_422, clc_1990_lvl3_423, clc_1990_lvl3_511, clc_1990_lvl3_512,
clc_1990_lvl3_521, clc_1990_lvl3_522, clc_1990_lvl3_523, clc_1990_lvl3_dom_class, clc_1990_lvl3_na,
clc_2018_lvl1_1, clc_2018_lvl1_2, clc_2018_lvl1_3, clc_2018_lvl1_4, clc_2018_lvl1_5,
clc_2018_lvl1_dom_class, clc_2018_lvl1_na, clc_2018_lvl2_11, clc_2018_lvl2_12, clc_2018_lvl2_13,
clc_2018_lvl2_14, clc_2018_lvl2_21, clc_2018_lvl2_22, clc_2018_lvl2_23, clc_2018_lvl2_24,
clc_2018_lvl2_31, clc_2018_lvl2_32, clc_2018_lvl2_33, clc_2018_lvl2_41, clc_2018_lvl2_42,
clc_2018_lvl2_51, clc_2018_lvl2_52, clc_2018_lvl2_dom_class, clc_2018_lvl2_na, clc_2018_lvl3_111,
clc_2018_lvl3_112, clc_2018_lvl3_121, clc_2018_lvl3_122, clc_2018_lvl3_123, clc_2018_lvl3_124,
clc_2018_lvl3_131, clc_2018_lvl3_132, clc_2018_lvl3_133, clc_2018_lvl3_141, clc_2018_lvl3_142,
clc_2018_lvl3_211, clc_2018_lvl3_212, clc_2018_lvl3_213, clc_2018_lvl3_221, clc_2018_lvl3_222,
clc_2018_lvl3_223, clc_2018_lvl3_231, clc_2018_lvl3_241, clc_2018_lvl3_242, clc_2018_lvl3_243,
clc_2018_lvl3_244, clc_2018_lvl3_311, clc_2018_lvl3_312, clc_2018_lvl3_313, clc_2018_lvl3_321,
clc_2018_lvl3_322, clc_2018_lvl3_323, clc_2018_lvl3_324, clc_2018_lvl3_331, clc_2018_lvl3_332,
clc_2018_lvl3_333, clc_2018_lvl3_334, clc_2018_lvl3_335, clc_2018_lvl3_411, clc_2018_lvl3_412,
clc_2018_lvl3_421, clc_2018_lvl3_422, clc_2018_lvl3_423, clc_2018_lvl3_511, clc_2018_lvl3_512,
clc_2018_lvl3_521, clc_2018_lvl3_522, clc_2018_lvl3_523, clc_2018_lvl3_dom_class, clc_2018_lvl3_na,
cli_aridity_ou, cli_aridity_pe, cli_aridity_pm, cli_assync_ou, cli_assync_pe, cli_assync_pm,
cli_pet_ou_mean, cli_pet_ou_yr, cli_pet_pe_mean, cli_pet_pe_yr, cli_pet_pm_mean, cli_pet_pm_yr,
cli_prec_date_max, cli_prec_dur_high, cli_prec_dur_low, cli_prec_freq_high, cli_prec_freq_low,
cli_prec_intensity, cli_prec_max, cli_prec_mean, cli_prec_mean_yr, cli_prec_season_pet_ou,
cli_prec_season_pet_pe, cli_prec_season_pet_pm, cli_prec_season_temp, cli_prec_timing_high,
cli_prec_timing_low, cli_psol_frac_berghuijs, cli_psol_frac_safran, cli_temp_mean, dam_influence,
dam_n, dam_volume, geo_dom_class, geo_ev, geo_ig, geo_mt, geo_nd, geo_pa, geo_pb, geo_pi, geo_py,
geo_sc, geo_sm, geo_ss, geo_su, geo_va, geo_vb, geo_vi, geo_wb, hgl_krs_karstic,
hgl_krs_not_karstic, hgl_krs_unknown, hgl_permeability, hgl_porosity, hgl_thm_alluvial,
hgl_thm_bedrock, hgl_thm_intense_folded, hgl_thm_sedimentary, hgl_thm_unknown, hgl_thm_volcanism,
hyc_jay_pet_ou, hyc_jay_pet_pe, hyc_jay_pet_pm, hyc_jay_prec_mean, hyc_jay_ratio_prec_pet_ou,
hyc_jay_ratio_prec_pet_pe, hyc_jay_ratio_prec_pet_pm, hyc_jay_ratio_q_prec, hyd_bfi_ladson,
hyd_bfi_lfstat, hyd_bfi_pelletier_pet_ou, hyd_hfd_mean, hyd_q_date_max, hyd_q_date_qmna,
hyd_q_dur_high, hyd_q_dur_low, hyd_q_freq_high, hyd_q_freq_low, hyd_q_freq_zero, hyd_q_max,
hyd_q_mean, hyd_q_mean_yr, hyd_q_qmna_min, hyd_stream_elas, hym_q_anomaly_inrae, hym_q_date_end,
hym_q_date_start, hym_q_low_uncertainty_inrae, hym_q_n_year, hym_q_na_period, hym_q_na_total,
hym_q_questionable, hym_q_unqualified, lat, long, sit_altitude, sit_altitude_datum, sit_area_hydro,
sit_city, sit_code_h3, sit_comment, sit_comment_impact_gene, sit_crs, sit_date_start,
sit_date_update, sit_entity, sit_flood_duration, sit_impact, sit_kp_down, sit_kp_up, sit_label,
sit_label_add, sit_label_usual, sit_latitude, sit_longitude, sit_mnemonic, sit_month1_low_water,
sit_month1_year, sit_publication_rights, sit_section, sit_section_vigilance, sit_status,
sit_test_site, sit_type, sit_type_add, sit_tz, sit_waterbody, sit_watercourse_acc, sit_zone_hydro,
slope_, sta_altitude_snap, sta_altitude_staff_gauge, sta_area_snap, sta_city, sta_code_child,
sta_code_h2, sta_code_parent, sta_comment, sta_comment_impact_local, sta_crs, sta_date_altitude_ref,
sta_date_end, sta_date_start, sta_date_update, sta_display_level, sta_dual_staff_gauge, sta_epsg,
sta_impact_local, sta_kp, sta_label, sta_label_add, sta_main_prod_code, sta_main_prod_name,
sta_main_prod_name_short, sta_monitor, sta_publication_right, sta_purpose, sta_qual_highflow,
sta_qual_lowflow, sta_qual_meanflow, sta_territory, sta_test_station, sta_time_data_gap,
sta_time_discontinuity, sta_type, sta_x_l2e, sta_x_l2e_snap, sta_x_l93, sta_x_l93_snap,
sta_x_w84_snap, sta_y_l2e, sta_y_l2e_snap, sta_y_l93, sta_y_l93_snap, sta_y_w84_snap,
top_altitude_mean, top_dist_outlet_mean, top_drainage_density, top_itopo_mean, top_mor_circ_ratio,
top_mor_compact_coef, top_mor_elong_ratio_catchment, top_mor_elong_ratio_circ,
top_mor_form_factor_horton, top_mor_form_factor_square, top_mor_relief_ratio, top_mor_shape_factor,
top_slo_flat, top_slo_gentle, top_slo_mean, top_slo_moderate, top_slo_ori_e, top_slo_ori_n,
top_slo_ori_ne, top_slo_ori_nw, top_slo_ori_s, top_slo_ori_se, top_slo_ori_sw, top_slo_ori_w,
top_slo_steep, top_slo_strong, top_slo_very_steep
[121]:
# Load the static-features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(654, 344)
[122]:
# Total number of missing values, followed by the per-column breakdown.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
12253
[122]:
area_km2              7
clc_1990_lvl1_1       0
clc_1990_lvl1_2       0
clc_1990_lvl1_3       0
clc_1990_lvl1_4       0
                     ..
top_slo_ori_sw        0
top_slo_ori_w         0
top_slo_steep         0
top_slo_strong        0
top_slo_very_steep    0
Length: 344, dtype: int64

Find the columns that have at least one NaN value.

[123]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[123]:
area_km2 clc_1990_lvl1_dom_class clc_1990_lvl2_dom_class clc_1990_lvl3_dom_class clc_2018_lvl1_dom_class clc_2018_lvl2_dom_class clc_2018_lvl3_dom_class cli_prec_timing_high cli_prec_timing_low hyd_bfi_ladson ... sta_code_h2 sta_code_parent sta_comment sta_comment_impact_local sta_date_altitude_ref sta_date_end sta_display_level sta_kp sta_label_add sta_purpose
A105003001 233.0 2.0 31.0 211.0 2.0 31.0 211.0 jja son 0.56723 ... A1050310 NaN Mise à l'heure TU le 05/11/2009. - Remplacemen... NaN 2022-02-24 08:21:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A107020001 70.0 2.0 21.0 211.0 2.0 21.0 211.0 son son 0.56320 ... A1072010 NaN Nivellement de juillet 2002, géomètre Faber-Sc... NaN 2020-12-14 11:19:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A112020001 129.0 2.0 31.0 211.0 2.0 31.0 211.0 jja son 0.44951 ... A1122010 NaN Arrêt des observations le 10/01/2008. - Nivell... NaN NaN 2008-01-10 11:20:00 NaN NaN NaN Low flow monitoring - Flood forecasting
A116003002 666.0 2.0 21.0 211.0 2.0 21.0 211.0 jja son 0.53010 ... A1080320 NaN Echelle et pont arrachés en mai 1983. Seuil re... NaN 2018-12-05 07:24:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A140202001 7.6 3.0 31.0 311.0 3.0 31.0 311.0 djf son 0.50286 ... A1402020 NaN Passage à l'heure TU le 29/10/2009. - Nivellé ... NaN 2020-12-14 11:20:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Y781000101 129.0 3.0 32.0 333.0 3.0 32.0 333.0 son jja 0.37525 ... Y7804010 NaN Station du réseau de base sur seuil naturel, é... Pompages Manso et Galeria 2021-04-15 08:18:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
Y862000101 331.0 3.0 31.0 311.0 3.0 31.0 311.0 djf jja NaN ... Y8624010 NaN Courbes de tarage à partir du 31/12/1979 revue... NaN NaN NaN NaN NaN NaN Low flow monitoring - Flood forecasting
Y881000102 130.0 3.0 31.0 311.0 3.0 31.0 311.0 djf jja 0.55820 ... Y8814020 NaN NaN NaN NaN 2012-04-30 12:00:00 NaN NaN Zoza ancien Flood forecasting - Streamflow monitoring
Y902000101 147.0 3.0 31.0 312.0 3.0 31.0 312.0 son jja 0.51000 ... Y9025010 NaN NaN Influence forte des barrages de baigneurs en é... NaN NaN NaN NaN Pont de Noceta Low flow monitoring - Flood forecasting - Stre...
Y960000102 99.7 3.0 32.0 323.0 3.0 31.0 313.0 djf jja 0.34639 ... Y9605230 NaN STATION EN REMPLACEMENT DE CELLE DE TAFONATO Y... Pompages amont ? 2017-09-13 09:38:00 NaN NaN NaN Canniciu Streamflow monitoring

654 rows × 57 columns

[124]:
# NaN count for every column that has at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[124]:
area_km2                      7
clc_1990_lvl1_dom_class       4
clc_1990_lvl2_dom_class       5
clc_1990_lvl3_dom_class       5
clc_2018_lvl1_dom_class       5
clc_2018_lvl2_dom_class       6
clc_2018_lvl3_dom_class       7
cli_prec_timing_high         15
cli_prec_timing_low           2
hyd_bfi_ladson               42
hyd_bfi_lfstat               42
hyd_bfi_pelletier_pet_ou     42
sit_altitude                 10
sit_altitude_datum           10
sit_area_hydro              641
sit_city                      2
sit_comment                 515
sit_comment_impact_gene     630
sit_crs                       2
sit_date_start              654
sit_date_update               2
sit_entity                    2
sit_flood_duration          654
sit_impact                    6
sit_kp_down                 590
sit_kp_up                   654
sit_label                     2
sit_label_add               464
sit_label_usual             326
sit_latitude                  2
sit_longitude                 2
sit_mnemonic                619
sit_month1_low_water          2
sit_month1_year               2
sit_publication_rights        2
sit_section                   2
sit_section_vigilance       127
sit_status                    2
sit_test_site                 2
sit_type                      2
sit_type_add                  2
sit_tz                        2
sit_waterbody               654
sit_watercourse_acc         632
sit_zone_hydro                2
sta_altitude_staff_gauge    120
sta_code_child              654
sta_code_h2                  13
sta_code_parent             654
sta_comment                 305
sta_comment_impact_local    624
sta_date_altitude_ref       120
sta_date_end                580
sta_display_level           654
sta_kp                      583
sta_label_add               527
sta_purpose                  17
dtype: int64
[125]:
# Alphabetical listing of the dynamic feature names, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
feature_listing = ", ".join(dynamic_features)
print(textwrap.fill(feature_listing, width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, lwdownrad_wm2, pcp_mm, pcp_mm_solfrac, pet_mm_ou,
pet_mm_pe, pet_mm_pm, q_cms_obs, q_mmd_obs, solrad_wm2, spechum_gkg, tsd_swe_isba, tsd_swi_gr,
tsd_swi_isba, tsd_val_c, tsd_val_i, tsd_val_m, tsd_val_q, tsd_val_s, windspeed_mps

Print the total number of NaNs for each dynamic feature.

[126]:
# Fetch every dynamic feature for all stations; only the second returned item is used below.
_unused, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[126]:
<xarray.Dataset> Size: 2GB
Dimensions:           (time: 18993, dynamic_features: 22)
Coordinates:
  * time              (time) datetime64[ns] 152kB 1970-01-01 ... 2021-12-31
  * dynamic_features  (dynamic_features) <U14 1kB 'airtemp_C_max' ... 'windsp...
Data variables: (12/654)
    A105003001        (time, dynamic_features) float64 3MB ...
    A107020001        (time, dynamic_features) float64 3MB ...
    A112020001        (time, dynamic_features) float64 3MB ...
    A116003002        (time, dynamic_features) float64 3MB ...
    A140202001        (time, dynamic_features) float64 3MB ...
    A202030001        (time, dynamic_features) float64 3MB ...
    ...                ...
    Y661401001        (time, dynamic_features) float64 3MB ...
    Y781000101        (time, dynamic_features) float64 3MB ...
    Y862000101        (time, dynamic_features) float64 3MB ...
    Y881000102        (time, dynamic_features) float64 3MB ...
    Y902000101        (time, dynamic_features) float64 3MB ...
    Y960000102        (time, dynamic_features) float64 3MB ...
[127]:
# Count NaNs per dynamic feature, summed over all stations and all time steps.
#
# Each station is a separate data variable with dims (time, dynamic_features).
# ``to_array()`` stacks those variables along a new leading "variable" dimension,
# so the reduction must run over "variable" and "time" to leave one value per
# dynamic feature. Reducing over ("time", "dynamic_features") instead yields one
# value per *station*, and zipping that with the feature names silently pairs the
# first few station totals with unrelated feature labels.
nan_counts = dyn_ds.to_array().isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
airtemp_C_max 0
airtemp_C_mean 27965
airtemp_C_min 35903
lwdownrad_wm2 12075
pcp_mm 27300
pcp_mm_solfrac 19320
pet_mm_ou 45521
pet_mm_pe 35973
pet_mm_pm 38605
q_cms_obs 35791
q_mmd_obs 987
solrad_wm2 10136
spechum_gkg 56196
tsd_swe_isba 469
tsd_swi_gr 2366
tsd_swi_isba 0
tsd_val_c 53627
tsd_val_i 15330
tsd_val_m 0
tsd_val_q 15337
tsd_val_s 15120
windspeed_mps 4970

CAMELS_IND

[128]:
# Open CAMELS_IND from the 'CAMELS' sub-directory of DATA_PATH and print its summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_IND', path=camels_dir, verbosity=0)
print(dataset)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2732: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(os.path.join(fpath),
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2743: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(fpath,
CAMELS_IND with 472 stations, 20 dynamic and 210 static features
[129]:
# Alphabetical listing of the static feature names, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
feature_listing = ", ".join(static_features)
print(textwrap.fill(feature_listing, width=100))
aet_gleam_mean, ai_mean, annual_max_1day, annual_max_30day, annual_max_3day, annual_max_7day,
annual_max_90day, annual_min_7day, annual_q, area_km2, aridity_p_pet, aridity_pet_aet,
asynchronicity, bare_frac, bfi, built_area_frac, bulkdens_sub_major, bulkdens_sub_mean,
bulkdens_top_major, bulkdense_top_mean, carb_rocks_frac, cen_time, clay_frac_sub, clay_frac_top,
crops_frac, crops_frac_1985, crops_frac_1995, crops_frac_2005, cv_apr_flow, cv_aug_flow,
cv_dec_flow, cv_feb_flow, cv_jan_flow, cv_jul_flow, cv_jun_flow, cv_mar_flow, cv_may_flow,
cv_nov_flow, cv_oct_flow, cv_sep_flow, cwc_river, cwc_site_name, dom_land_cover,
dom_land_cover_frac, doy_max_flow, doy_max_flow_7, doy_min_flow, doy_min_flow_7, drinking_frac,
dspbar, elev_max, elev_mean, elev_median, elev_min, evap_canopy_anum, evap_canopy_max,
evap_canopy_mean, evap_canopy_min, evap_surface_anum, evap_surface_max, evap_surface_mean,
evap_surface_min, fall_days, fall_rate_mean, fall_rate_median, first_dam_year, flood_frac,
flooded_veg_frac, flow_availability, freq_q_high, freq_q_low, gauge_elevation, geol_class_1st,
geol_class_1st_frac, geol_class_2nd, geol_class_2nd_frac, geol_permeability, geol_porosity,
ghi_area, ghi_group, ghi_lat, ghi_lon, ghi_stn_id, gini_flow, gravel_frac_sub, gravel_frac_top,
high_prec_dur, high_prec_freq, high_prec_timing, hsg_major, hydroelec_frac, irrigation_frac,
lai_diff, lai_max, lai_mean, lai_min, last_dam_year, lat, long, low_prec_dur, low_prec_freq,
low_prec_timing, max_high_prec_dur, max_low_prec_dur, mean_anum_flow, mean_apr_flow, mean_atmn_flow,
mean_aug_flow, mean_dec_flow, mean_feb_flow, mean_jan_flow, mean_jul_flow, mean_jun_flow,
mean_mar_flow, mean_may_flow, mean_nov_flow, mean_oct_flow, mean_sep_flow, mean_sumr_flow,
mean_swmn_flow, mean_wint_flow, month_1day_max, month_1day_min, n_dams, navigation_frac, num_dams,
num_hyd_alt, org_carb_sub_major, org_carb_sub_mean, org_carb_top_major, org_carb_top_mean,
organic_frac_sub, organic_frac_top, overflow_frac, p_annual_variability, p_max, p_mean, p_mean_anum,
p_monthly_variability, p_unif, pet_gleam_mean, pet_max, pet_mean, pet_mean_anum, pet_min,
pop_density_2000, pop_density_2005, pop_density_2010, pop_density_2015, pop_density_2020, q_10,
q_25, q_25_swmn, q_50, q_50_swmn, q_5_swmn, q_75, q_75_swmn, q_90, q_95_swmn, q_cv, q_high_days,
q_low_days, q_mean, q_mean_swmn, q_zero, range_frac, rel_hum_mean, res_store_sum, reservoir_index,
rise_days, rise_rate_mean, rise_rate_median, river_basin, runoff_ratio, sand_frac_sub,
sand_frac_top, silt_frac_sub, silt_frac_top, slope_degrees, slope_fdc, slope_max, slope_median,
slope_min, sm_lvl1_mean, sm_lvl2_mean, sm_lvl3_mean, sm_lvl4_mean, soil_awc_sub, soil_awc_top,
soil_awsc_major, soil_awsc_max, soil_awsc_min, soil_conductivity_sub, soil_conductivity_top,
soil_depth, srad_lw_mean, srad_sw_mean, streamflow_elas, tailing_frac, tmax_mean, tmin_mean,
total_storage, trees_frac, urban_frac_1985, urban_frac_1995, urban_frac_2005, water_frac, wind_mean,
wtd
[130]:
# Load the static-features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(472, 210)
[131]:
# Total number of missing values, followed by the per-column breakdown.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
20322
[131]:
aet_gleam_mean        0
ai_mean               0
annual_max_1day     300
annual_max_30day    300
annual_max_3day     300
                   ...
urban_frac_1995       0
urban_frac_2005       0
water_frac            0
wind_mean             0
wtd                   0
Length: 210, dtype: int64

Find the columns that have at least one NaN value.

[132]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[132]:
annual_max_1day annual_max_30day annual_max_3day annual_max_7day annual_max_90day annual_min_7day annual_q bfi bulkdens_sub_major bulkdens_sub_mean ... q_mean_swmn q_zero reservoir_index rise_days rise_rate_mean rise_rate_median runoff_ratio slope_fdc streamflow_elas tailing_frac
gauge_id
3001 NaN NaN NaN NaN NaN NaN NaN NaN 1.33 1.291356 ... 0.568 NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3002 756.807 136.155 485.599 284.301 90.494 0.000 828.584 0.372 1.45 1.450000 ... 4.587 86.667 0.000688 62.00 38.509 3.07 0.472 NaN 3.744 0.000000
3003 NaN NaN NaN NaN NaN NaN NaN NaN 1.21 1.210000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3004 NaN NaN NaN NaN NaN NaN NaN NaN 1.21 1.211649 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3005 14640.524 3780.572 11975.203 8184.747 2305.069 5.293 21163.952 0.385 1.21 1.218816 ... 3.028 0.000 0.507788 131.75 324.324 5.46 0.331 2.859 1.925 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17021 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.291081 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
17022 370.817 98.813 269.619 191.648 60.805 0.060 322.091 0.254 NaN 1.318424 ... 0.005 148.950 1.030649 78.20 12.834 0.43 0.034 NaN 2.049 0.117647
17023 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.211304 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
17024 609.535 154.046 460.329 312.861 81.274 0.000 356.873 0.169 NaN 1.320490 ... 0.001 306.800 0.942509 21.65 55.641 5.30 0.031 NaN 2.977 0.117647
17025 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.330000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

472 rows × 86 columns

[133]:
# NaN count for every column that has at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[133]:
annual_max_1day     300
annual_max_30day    300
annual_max_3day     300
annual_max_7day     300
annual_max_90day    300
                   ...
rise_rate_median    299
runoff_ratio        244
slope_fdc           331
streamflow_elas     271
tailing_frac         66
Length: 86, dtype: int64
[134]:
# Alphabetical listing of the dynamic feature names, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
feature_listing = ", ".join(dynamic_features)
print(textwrap.fill(feature_listing, width=100))
aet_mm_gleam, airtemp_C_max, airtemp_C_mean, airtemp_C_min, evap_canopy(kg/m2/s),
evap_surface(kg/m2/s), lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_gleam, q_cms_obs, rh_%, sm_lvl1(kg/m2),
sm_lvl2(kg/m2), sm_lvl3(kg/m2), sm_lvl4(kg/m2), solrad_wm2, windspeed_mps, windspeedu_mps,
windspeedv_mps

Print the total number of NaNs for each dynamic feature.

[135]:
# Fetch every dynamic feature for all stations; only the second returned item is used below.
_unused, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[135]:
<xarray.Dataset> Size: 566MB
Dimensions:           (time: 14976, dynamic_features: 20)
Coordinates:
  * time              (time) datetime64[ns] 120kB 1980-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U21 2kB 'aet_mm_gleam' ... 'windspe...
Data variables: (12/472)
    3001              (time, dynamic_features) float32 1MB ...
    3002              (time, dynamic_features) float32 1MB ...
    3003              (time, dynamic_features) float32 1MB ...
    3004              (time, dynamic_features) float32 1MB ...
    3005              (time, dynamic_features) float32 1MB ...
    3006              (time, dynamic_features) float32 1MB ...
    ...                ...
    17020             (time, dynamic_features) float32 1MB ...
    17021             (time, dynamic_features) float32 1MB ...
    17022             (time, dynamic_features) float32 1MB ...
    17023             (time, dynamic_features) float32 1MB ...
    17024             (time, dynamic_features) float32 1MB ...
    17025             (time, dynamic_features) float32 1MB ...
[136]:
# Count NaNs per dynamic feature, summed over all stations and all time steps.
#
# Each station is a separate data variable with dims (time, dynamic_features).
# ``to_array()`` stacks those variables along a new leading "variable" dimension,
# so the reduction must run over "variable" and "time" to leave one value per
# dynamic feature. Reducing over ("time", "dynamic_features") instead yields one
# value per *station*, and zipping that with the feature names silently pairs the
# first few station totals with unrelated feature labels.
nan_counts = dyn_ds.to_array().isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
aet_mm_gleam 15097
airtemp_C_max 7102
airtemp_C_mean 15342
airtemp_C_min 15342
evap_canopy(kg/m2/s) 1091
evap_surface(kg/m2/s) 15342
lwdownrad_wm2 15342
pcp_mm 1133
pet_mm 8248
pet_mm_gleam 14015
q_cms_obs 13789
rh_% 15341
sm_lvl1(kg/m2) 3580
sm_lvl2(kg/m2) 15342
sm_lvl3(kg/m2) 11540
sm_lvl4(kg/m2) 15342
solrad_wm2 3984
windspeed_mps 1853
windspeedu_mps 4429
windspeedv_mps 2245

CAMELS_LUX

[137]:
# Open CAMELS_LUX from the 'CAMELS' sub-directory of DATA_PATH and print its summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_LUX', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_LUX with 56 stations, 25 dynamic and 61 static features
[138]:
# Alphabetical listing of the static feature names, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
feature_listing = ", ".join(static_features)
print(textwrap.fill(feature_listing, width=100))
AI_Oudin, AI_PM, IDPR_MAX, IDPR_MEAN, IDPR_MIN, IDPR_RANGE, IDPR_STD, Kc_Gravelius, PET_Oudin_sum,
PET_PM_sum, Qspec_sum, SLOPE_MAX, SLOPE_MIN, SLOPE_RANGE, SLOPE_STD, Station, TWI_MAX, TWI_MEAN,
TWI_MIN, TWI_RANGE, TWI_STD, VRM_MAX, VRM_MEAN, VRM_MIN, VRM_RANGE, VRM_STD, XLuref, YLuref, Z_MAX,
Z_MIN, Z_RANGE, Z_STD, agency, area_km2, catchment, crop_frac, elev_catch_m, end,
forests_naturalareas, grass_frac, impermeable_formations, lat, limestone_dolomites, long,
marl_claystone, perimeter_km, permeable_formations, prad_sum, pstn_sum, runoffratio,
sandstone_conglomerates, schists_quartzites, slope_degree, start, stream, surface_deposits, t2m_max,
t2m_mean, t2m_min, urban_frac, watercourses_waterbodies_wetlands
[139]:
# Load the static-features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(56, 61)
[140]:
# Total number of missing values, followed by the per-column breakdown.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[140]:
watercourses_waterbodies_wetlands    0
forests_naturalareas                 0
grass_frac                           0
crop_frac                            0
urban_frac                           0
                                    ..
VRM_MIN                              0
VRM_MAX                              0
VRM_RANGE                            0
VRM_MEAN                             0
VRM_STD                              0
Length: 61, dtype: int64

Find the columns that have at least one NaN value.

[141]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[141]:
gauge_id
ID_01
ID_02
ID_03
ID_04
ID_05
ID_06
ID_07
ID_08
ID_09
ID_10
ID_11
ID_12
ID_13
ID_14
ID_15
ID_16
ID_17
ID_18
ID_19
ID_20
ID_21
ID_22
ID_23
ID_24
ID_25
ID_26
ID_27
ID_28
ID_29
ID_30
ID_31
ID_32
ID_33
ID_34
ID_35
ID_36
ID_37
ID_38
ID_39
ID_40
ID_41
ID_42
ID_43
ID_44
ID_45
ID_46
ID_47
ID_48
ID_49
ID_50
ID_51
ID_52
ID_53
ID_54
ID_55
ID_56
[142]:
# NaN count for every column that has at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[142]:
Series([], dtype: float64)
[143]:
# Alphabetical listing of the dynamic feature names, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
feature_listing = ", ".join(dynamic_features)
print(textwrap.fill(feature_listing, width=100))
Qflag, RR_flag_rad, RR_max_rad, RR_min_rad, airtemp_C_mean, cape, cin, dls, kx, lls, pcp_mm_era5,
pcp_mm_radar, pcp_mm_station, pet_mm_oudin, pet_mm_pm, q_cms_obs, q_mmd_obs, rh_%, sml1, sml2, sml3,
sml4, spechum_gkg, tcwv, windspeed_mps

Print the total number of NaNs for each dynamic feature.

[144]:
# Fetch every dynamic feature for all stations; only the second returned item is used below.
_unused, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[144]:
<xarray.Dataset> Size: 70MB
Dimensions:           (time: 6209, dynamic_features: 25)
Coordinates:
  * time              (time) datetime64[ns] 50kB 2004-11-01 ... 2021-10-31
  * dynamic_features  (dynamic_features) <U14 1kB 'q_cms_obs' ... 'sml4'
Data variables: (12/56)
    ID_01             (time, dynamic_features) float64 1MB ...
    ID_02             (time, dynamic_features) float64 1MB ...
    ID_03             (time, dynamic_features) float64 1MB ...
    ID_04             (time, dynamic_features) float64 1MB ...
    ID_05             (time, dynamic_features) float64 1MB ...
    ID_06             (time, dynamic_features) float64 1MB ...
    ...                ...
    ID_51             (time, dynamic_features) float64 1MB ...
    ID_52             (time, dynamic_features) float64 1MB ...
    ID_53             (time, dynamic_features) float64 1MB ...
    ID_54             (time, dynamic_features) float64 1MB ...
    ID_55             (time, dynamic_features) float64 1MB ...
    ID_56             (time, dynamic_features) float64 1MB ...
[145]:
# Count NaNs per dynamic feature, summed over all stations and all time steps.
#
# Each station is a separate data variable with dims (time, dynamic_features).
# ``to_array()`` stacks those variables along a new leading "variable" dimension,
# so the reduction must run over "variable" and "time" to leave one value per
# dynamic feature. Reducing over ("time", "dynamic_features") instead yields one
# value per *station*, and zipping that with the feature names silently pairs the
# first few station totals with unrelated feature labels.
nan_counts = dyn_ds.to_array().isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
q_cms_obs 0
q_mmd_obs 0
Qflag 0
pcp_mm_radar 0
RR_min_rad 0
RR_max_rad 0
RR_flag_rad 0
pcp_mm_station 0
pcp_mm_era5 0
airtemp_C_mean 0
pet_mm_oudin 0
pet_mm_pm 14800
cape 0
cin 0
kx 0
spechum_gkg 0
rh_% 0
tcwv 0
windspeed_mps 0
lls 17050
dls 0
sml1 0
sml2 0
sml3 0
sml4 0

CAMELS_SE

[146]:
# Open CAMELS_SE from the 'CAMELS' sub-directory of DATA_PATH and print its summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_SE', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_SE with 50 stations, 4 dynamic and 76 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2179: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(
[147]:
# Alphabetical listing of the static feature names, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
feature_listing = ", ".join(static_features)
print(textwrap.fill(feature_listing, width=100))
Agriculture_percentage, Bedrock_percentage_sc, Clayey_till_and_clay_till_percentage_sc, DOR,
Elevation_mabsl, Forest_percentage, Glacier_percentage_sc, Glaciers_percentage,
Glaciofluvial_sediment_percentage_sc, Name, Open_land_percentage, Peat_percentage_sc, Pmean_mm_year,
Postglacial_sand_and_gravel_percentage_sc, RegVol_m3, S01_Qmean_CNP_61_90, S01_Qmean_CNP_91_20,
S01_Qmean_hs, S02_Qcoeff_CNP_61_90, S02_Qcoeff_CNP_91_20, S02_Qcoeff_hs, S03_COM_CNP_61_90,
S03_COM_CNP_91_20, S03_COM_hs, S04_SPD_CNP_61_90, S04_SPD_CNP_91_20, S04_SPD_hs,
S05_Qmean_spring_CNP_61_90, S05_Qmean_spring_CNP_91_20, S05_Qmean_spring_hs,
S06_Qmean_summer_CNP_61_90, S06_Qmean_summer_CNP_91_20, S06_Qmean_summer_hs,
S07_Qmean_autumn_CNP_61_90, S07_Qmean_autumn_CNP_91_20, S07_Qmean_autumn_hs,
S08_Qmean_winter_CNP_61_90, S08_Qmean_winter_CNP_91_20, S08_Qmean_winter_hs, S09_LFfreq_CNP_61_90,
S09_LFfreq_CNP_91_20, S09_LFfreq_hs, S10_T_minQ_d30_CNP_61_90, S10_T_minQ_d30_CNP_91_20,
S10_T_minQ_d30_hs, S11_minQ_d7_CNP_61_90, S11_minQ_d7_CNP_91_20, S11_minQ_d7_hs,
S12_minQ_d30_CNP_61_90, S12_minQ_d30_CNP_91_20, S12_minQ_d30_hs, S13_HFfreq_CNP_61_90,
S13_HFfreq_CNP_91_20, S13_HFfreq_hs, S14_T_maxQ_d1_CNP_61_90, S14_T_maxQ_d1_CNP_91_20,
S14_T_maxQ_d1_hs, S15_maxQ_d30_CNP_61_90, S15_maxQ_d30_CNP_91_20, S15_maxQ_d30_hs,
S16_maxQ_d1_CNP_61_90, S16_maxQ_d1_CNP_91_20, S16_maxQ_d1_hs, Shrubs_and_grassland_percentage,
Silt_percentage_sc, Slope_mean_degree, Till_and_weathered_deposit_percentage_sc, Till_percentage_sc,
Tmean_C, Urban_percentage, Water_percentage, Water_percentage_sc, Wetlands_percentage, area_km2,
lat, long
[148]:
# Load the static-features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(50, 76)
[149]:
# Total number of missing values, followed by the per-column breakdown.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[149]:
Agriculture_percentage                     0
Bedrock_percentage_sc                      0
Clayey_till_and_clay_till_percentage_sc    0
DOR                                        0
Elevation_mabsl                            0
                                          ..
Water_percentage_sc                        0
Wetlands_percentage                        0
area_km2                                   0
lat                                        0
long                                       0
Length: 76, dtype: int64

Find the columns that have at least one NaN value.

[150]:
# Show the columns that contain NaNs, or say so when there are none.
#
# Jupyter only auto-renders a cell's final top-level expression; a bare
# expression nested inside an ``if`` branch is evaluated and discarded. The
# frame is therefore passed to ``display`` (provided by the notebook kernel)
# so that it is actually shown whenever NaN columns exist.
nan_columns = df.loc[:, df.isna().any()]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
No NaN values
[151]:
# NaN count for every column that has at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[151]:
Series([], dtype: float64)
[152]:
# Alphabetical listing of the dynamic feature names, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
feature_listing = ", ".join(dynamic_features)
print(textwrap.fill(feature_listing, width=100))
airtemp_C_mean, pcp_mm, q_cms_obs, q_mmd_obs

Print the total number of NaNs for each dynamic feature.

[153]:
# Fetch every dynamic feature for all stations; only the second returned item is used below.
_unused, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[153]:
<xarray.Dataset> Size: 18MB
Dimensions:           (time: 21915, dynamic_features: 4)
Coordinates:
  * time              (time) datetime64[ns] 175kB 1961-01-01 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U14 224B 'airtemp_C_mean' ... 'q_mm...
Data variables: (12/50)
    5                 (time, dynamic_features) float32 351kB ...
    20                (time, dynamic_features) float32 351kB ...
    37                (time, dynamic_features) float32 351kB ...
    97                (time, dynamic_features) float32 351kB ...
    138               (time, dynamic_features) float32 351kB ...
    186               (time, dynamic_features) float32 351kB ...
    ...                ...
    1740              (time, dynamic_features) float32 351kB ...
    1762              (time, dynamic_features) float32 351kB ...
    1780              (time, dynamic_features) float32 351kB ...
    2020              (time, dynamic_features) float32 351kB ...
    2203              (time, dynamic_features) float32 351kB ...
    20002             (time, dynamic_features) float32 351kB ...
[154]:
# Count NaNs per dynamic feature, summed over all stations and all time steps.
#
# Each station is a separate data variable with dims (time, dynamic_features).
# ``to_array()`` stacks those variables along a new leading "variable" dimension,
# so the reduction must run over "variable" and "time" to leave one value per
# dynamic feature. Reducing over ("time", "dynamic_features") instead yields one
# value per *station*, and zipping that with the feature names silently pairs the
# first few station totals with unrelated feature labels.
nan_counts = dyn_ds.to_array().isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
airtemp_C_mean 0
pcp_mm 0
q_cms_obs 0
q_mmd_obs 0

CAMELS_US

[155]:
# Open CAMELS_US from the 'CAMELS' sub-directory of DATA_PATH and print its summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_US', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_US with 671 stations, 8 dynamic and 59 static features
[156]:
# Alphabetical listing of the static feature names, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
feature_listing = ", ".join(static_features)
print(textwrap.fill(feature_listing, width=100))
area_geospa_fabric, area_km2, aridity, baseflow_index, carbonate_rocks_frac, clay_frac,
dom_land_cover, dom_land_cover_frac, elev_mean, frac_forest, frac_snow, gauge_name, geol_1st_class,
geol_2nd_class, geol_permeability, geol_porostiy, glim_1st_class_frac, glim_2nd_class_frac,
gvf_diff, gvf_max, hfd_mean, high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur,
high_q_freq, huc_02, lai_diff, lai_max, lat, long, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, max_water_content, organic_frac, other_frac, p_mean, p_seasonality, pet_mean,
q5, q95, q_mean, root_depth_50, root_depth_99, runoff_ratio, sand_frac, silt_frac, slope_fdc,
slope_mkm-1, soil_conductivity, soil_depth_pelletier, soil_depth_statsgo, soil_porosity,
stream_elas, water_frac, zero_q_freq
[157]:
# Alphabetical listing of the dynamic feature names, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
feature_listing = ", ".join(dynamic_features)
print(textwrap.fill(feature_listing, width=100))
airtemp_C_max, airtemp_C_min, dayl(s), pcp_mm, q_cms_obs, solrad_wm2, swe_mm, vp_hpa

Print the total number of NaNs for each dynamic feature.

[158]:
# Fetch every dynamic feature for all stations; only the second returned item is used below.
_unused, dyn_ds = dataset.fetch(
    "all",
    dynamic_features=dataset.dynamic_features,
)

dyn_ds
[158]:
<xarray.Dataset> Size: 549MB
Dimensions:           (time: 12784, dynamic_features: 8)
Coordinates:
  * time              (time) datetime64[ns] 102kB 1980-01-01 ... 2014-12-31
  * dynamic_features  (dynamic_features) <U13 416B 'dayl(s)' ... 'q_cms_obs'
Data variables: (12/671)
    12048000          (time, dynamic_features) float64 818kB ...
    14020000          (time, dynamic_features) float64 818kB ...
    12447390          (time, dynamic_features) float64 818kB ...
    14154500          (time, dynamic_features) float64 818kB ...
    12147500          (time, dynamic_features) float64 818kB ...
    14185900          (time, dynamic_features) float64 818kB ...
    ...                ...
    05408000          (time, dynamic_features) float64 818kB ...
    05488200          (time, dynamic_features) float64 818kB ...
    05592575          (time, dynamic_features) float64 818kB ...
    05393500          (time, dynamic_features) float64 818kB ...
    05591550          (time, dynamic_features) float64 818kB ...
    05595730          (time, dynamic_features) float64 818kB ...
[159]:
# Count NaNs per dynamic feature, summed over all stations and all time steps.
#
# Each station is a separate data variable with dims (time, dynamic_features).
# ``to_array()`` stacks those variables along a new leading "variable" dimension,
# so the reduction must run over "variable" and "time" to leave one value per
# dynamic feature. Reducing over ("time", "dynamic_features") instead yields one
# value per *station*, and zipping that with the feature names silently pairs the
# first few station totals with unrelated feature labels.
nan_counts = dyn_ds.to_array().isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
dayl(s) 0
pcp_mm 0
solrad_wm2 0
swe_mm 0
airtemp_C_max 0
airtemp_C_min 0
vp_hpa 0
q_cms_obs 0

Caravan_DK

[160]:
# Open the Caravan_DK dataset located directly under DATA_PATH and print its summary.
dataset = RainfallRunoff(
    'Caravan_DK',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Caravan_DK with 308 stations, 39 dynamic and 211 static features
[161]:
# Alphabetical listing of the static feature names, wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
feature_listing = ", ".join(static_features)
print(textwrap.fill(feature_listing, width=100))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area_fraction_used_for_aggregation,
area_km2, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj, cmi_ix_s01, cmi_ix_s02,
cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08, cmi_ix_s09, cmi_ix_s10,
cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn, dis_m3_pmx, dis_m3_pyr,
dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj, fmh_cl_smj, for_pc_sse,
frac_snow, gauge_name, gdp_ud_sav, gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02,
glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10,
glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18,
glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93,
high_prec_dur, high_prec_freq, inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lat,
lit_cl_smj, lka_pc_sse, lkv_mc_usu, long, low_prec_dur, low_prec_freq, moisture_index, nli_ix_sav,
p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02, pet_mm_s03, pet_mm_s04, pet_mm_s05,
pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10, pet_mm_s11, pet_mm_s12, pet_mm_syr,
pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04, pnv_pc_s05, pnv_pc_s06, pnv_pc_s07,
pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12, pnv_pc_s13, pnv_pc_s14, pnv_pc_s15,
pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03, pre_mm_s04, pre_mm_s05, pre_mm_s06,
pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11, pre_mm_s12, pre_mm_syr, prm_pc_sse,
pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu, run_mm_syr, seasonality, sgr_dk_sav,
slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02, snw_pc_s03, snw_pc_s04, snw_pc_s05,
snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10, snw_pc_s11, snw_pc_s12, snw_pc_smx,
snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03, swc_pc_s04, swc_pc_s05, swc_pc_s06,
swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11, swc_pc_s12, swc_pc_syr, tbi_cl_smj,
tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04, tmp_dc_s05, tmp_dc_s06, tmp_dc_s07,
tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12, tmp_dc_smn, tmp_dc_smx, tmp_dc_syr,
urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03, wet_pc_s04, wet_pc_s05, wet_pc_s06,
wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[162]:
# Load the static-features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(308, 211)
[163]:
# Total number of missing values, followed by the per-column breakdown.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[163]:
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
aet_mm_s05    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 211, dtype: int64

Find the columns that have at least one NaN value.

[164]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[164]:
240001
590006
340003
450043
100009
...
610013
180078
150046
490082
20006

308 rows × 0 columns

[165]:
# NaN count for every column that has at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[165]:
Series([], dtype: float64)
[166]:
# Alphabetical listing of the dynamic feature names, wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
feature_listing = ", ".join(dynamic_features)
print(textwrap.fill(feature_listing, width=100))
dewpoint_temperature_2m_max, dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min,
potential_evaporation_sum, q_cms_obs, snow_depth_water_equivalent_max,
snow_depth_water_equivalent_mean, snow_depth_water_equivalent_min, surface_net_solar_radiation_max,
surface_net_solar_radiation_mean, surface_net_solar_radiation_min,
surface_net_thermal_radiation_max, surface_net_thermal_radiation_mean,
surface_net_thermal_radiation_min, surface_pressure_max, surface_pressure_mean,
surface_pressure_min, temperature_2m_max, temperature_2m_mean, temperature_2m_min,
total_precipitation_sum, u_component_of_wind_10m_max, u_component_of_wind_10m_mean,
u_component_of_wind_10m_min, v_component_of_wind_10m_max, v_component_of_wind_10m_mean,
v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max, volumetric_soil_water_layer_1_mean,
volumetric_soil_water_layer_1_min, volumetric_soil_water_layer_2_max,
volumetric_soil_water_layer_2_mean, volumetric_soil_water_layer_2_min,
volumetric_soil_water_layer_3_max, volumetric_soil_water_layer_3_mean,
volumetric_soil_water_layer_3_min, volumetric_soil_water_layer_4_max,
volumetric_soil_water_layer_4_mean, volumetric_soil_water_layer_4_min

Print the total number of NaNs for each dynamic feature.

[167]:
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[167]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 14609, dynamic_features: 39)
Coordinates:
  * time              (time) datetime64[ns] 117kB 1981-01-02 ... 2020-12-31
  * dynamic_features  (dynamic_features) <U34 5kB 'dewpoint_temperature_2m_ma...
Data variables: (12/308)
    240001            (time, dynamic_features) float64 5MB ...
    590006            (time, dynamic_features) float64 5MB ...
    340003            (time, dynamic_features) float64 5MB ...
    450043            (time, dynamic_features) float64 5MB ...
    100009            (time, dynamic_features) float64 5MB ...
    410023            (time, dynamic_features) float64 5MB ...
    ...                ...
    510022            (time, dynamic_features) float64 5MB ...
    610013            (time, dynamic_features) float64 5MB ...
    180078            (time, dynamic_features) float64 5MB ...
    150046            (time, dynamic_features) float64 5MB ...
    490082            (time, dynamic_features) float64 5MB ...
    20006             (time, dynamic_features) float64 5MB ...
[168]:
# Count missing values per dynamic feature.
# `to_array` stacks the per-station data variables along a new leading
# "variable" axis, giving dims (variable, time, dynamic_features). Reducing over
# the station axis and time leaves exactly one total per dynamic feature.
# (Reducing over "time" and "dynamic_features" instead would yield one total
# per station, which does not line up with the feature names.)
nan_counts = dyn_ds.to_array(dim="variable").isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
dewpoint_temperature_2m_max 4382
dewpoint_temperature_2m_mean 3287
dewpoint_temperature_2m_min 3287
potential_evaporation_sum 3287
q_cms_obs 3287
snow_depth_water_equivalent_max 7271
snow_depth_water_equivalent_mean 12417
snow_depth_water_equivalent_min 3287
surface_net_solar_radiation_max 3287
surface_net_solar_radiation_mean 4681
surface_net_solar_radiation_min 8404
surface_net_thermal_radiation_max 8400
surface_net_thermal_radiation_mean 7305
surface_net_thermal_radiation_min 3287
surface_pressure_max 3287
surface_pressure_mean 5113
surface_pressure_min 4796
temperature_2m_max 3287
temperature_2m_mean 3287
temperature_2m_min 7305
total_precipitation_sum 3287
u_component_of_wind_10m_max 4680
u_component_of_wind_10m_mean 3848
u_component_of_wind_10m_min 3287
v_component_of_wind_10m_max 3287
v_component_of_wind_10m_mean 10592
v_component_of_wind_10m_min 3287
volumetric_soil_water_layer_1_max 4438
volumetric_soil_water_layer_1_mean 6936
volumetric_soil_water_layer_1_min 4748
volumetric_soil_water_layer_2_max 3287
volumetric_soil_water_layer_2_mean 6939
volumetric_soil_water_layer_2_min 8003
volumetric_soil_water_layer_3_max 3287
volumetric_soil_water_layer_3_mean 3287
volumetric_soil_water_layer_3_min 8018
volumetric_soil_water_layer_4_max 3287
volumetric_soil_water_layer_4_mean 3287
volumetric_soil_water_layer_4_min 4675

LamaHCE

[169]:
dataset = RainfallRunoff('LamaHCE', data_type='total_upstrm', path=DATA_PATH, verbosity=0)
print(dataset)
LamaHCE with 859 stations, 22 dynamic and 84 static features
[170]:
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
agr_fra, area_gov, area_km2, area_ratio, arid_1, arid_2, bare_fra, bedrk_dep, clay_fra, country,
degimpact, diur_art, diur_glac, elev, elev_mean, elev_med, elev_ran, elev_std, elon_ratio, et0_mean,
eta_mean, fedstate, forest_fra, frac_snow, gaps_post, gaps_pre, gc_dom, gc_ig_fra, gc_mt_fra,
gc_pa_fra, gc_pb_fra, gc_pi_fra, gc_py_fra, gc_sc_fra, gc_sm_fra, gc_ss_fra, gc_su_fra, gc_va_fra,
gc_vb_fra, gc_wb_fra, geol_perme, geol_poros, glac_fra, govnr, grav_fra, gvf_diff, gvf_max,
hi_prec_du, hi_prec_fr, hi_prec_ti, lai_diff, lai_max, lake_fra, lat, lc_dom, lo_prec_du,
lo_prec_fr, lo_prec_ti, long, mvert_ang, mvert_dist, name, ndvi_max, ndvi_min, nrs_euhyd, nrs_rivat,
obsbeg_day, obsbeg_hr, obsend, oc_fra, p_mean, p_season, region, river, root_dep, sand_fra,
silt_fra, slope_mkm-1, soil_condu, soil_poros, soil_tawc, strm_dens, typimpact, urban_fra
[171]:
df = dataset.fetch_static_features()
print(df.shape)
(859, 84)
[172]:
print(df.isna().sum().sum())
df.isna().sum()
65
[172]:
agr_fra       0
area_gov      0
area_km2      0
area_ratio    0
arid_1        0
             ..
soil_poros    0
soil_tawc     0
strm_dens     0
typimpact     0
urban_fra     0
Length: 84, dtype: int64

Find the columns that have at least one NaN value.

[173]:
df.loc[:, (df.isna().sum()>0)]
[173]:
geol_perme hi_prec_ti lo_prec_ti nrs_rivat
ID
826 -12.4 NaN son 20376803.0
819 -11.5 son djf 20464042.0
79 -13.3 jja djf 20454049.0
696 -12.2 jja djf 20424102.0
98 -12.0 jja djf 20440228.0
... ... ... ... ...
261 -12.1 jja djf 20428827.0
587 -12.9 jja djf 20461304.0
827 -12.6 jja son 20379436.0
250 -13.4 jja djf 20441631.0
72 -12.4 jja djf 20451775.0

859 rows × 4 columns

[174]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[174]:
geol_perme     1
hi_prec_ti    42
lo_prec_ti     3
nrs_rivat     19
dtype: int64
[175]:
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpa, airtemp_C_max, airtemp_C_mean, airtemp_C_min, dptemp_C_max_2m, dptemp_C_mean_2m,
dptemp_C_min_2m, fcst_alb, lai_high_veg, lai_low_veg, pcp_mm, q_cms_obs, solrad_wm2, solrad_wm2_max,
swe_mm, thermrad_wm2, thermrad_wm2_max, total_et, volsw_123, volsw_4, windspeedu_mps, windspeedv_mps

Print the total number of NaNs for each dynamic feature.

[176]:
_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)

dyn_ds
[176]:
<xarray.Dataset> Size: 1GB
Dimensions:           (time: 14244, dynamic_features: 22)
Coordinates:
  * time              (time) datetime64[ns] 114kB 1981-01-01 ... 2019-12-31
  * dynamic_features  (dynamic_features) <U16 1kB 'airpres_hpa' ... 'windspee...
Data variables: (12/859)
    826               (time, dynamic_features) float32 1MB 951.0 3.4 ... -1.3
    819               (time, dynamic_features) float32 1MB 776.8 -7.3 ... -0.4
    79                (time, dynamic_features) float32 1MB 763.6 -9.6 ... -0.5
    696               (time, dynamic_features) float32 1MB 943.6 2.9 ... 1.3 0.5
    98                (time, dynamic_features) float32 1MB 897.0 -1.2 ... 0.9
    197               (time, dynamic_features) float32 1MB 930.4 1.7 ... -0.3
    ...                ...
    512               (time, dynamic_features) float32 1MB 960.9 2.4 ... 1.3 0.3
    261               (time, dynamic_features) float32 1MB 933.1 2.3 ... 1.1 0.3
    587               (time, dynamic_features) float32 1MB 884.3 0.0 ... -1.0
    827               (time, dynamic_features) float32 1MB 944.6 2.8 ... -0.9
    250               (time, dynamic_features) float32 1MB 856.6 -2.7 ... -0.2
    72                (time, dynamic_features) float32 1MB 813.5 -5.3 ... 0.1
[177]:
# Count missing values per dynamic feature.
# `to_array` stacks the per-station data variables along a new leading
# "variable" axis, giving dims (variable, time, dynamic_features). Reducing over
# the station axis and time leaves exactly one total per dynamic feature.
# (Reducing over "time" and "dynamic_features" instead would yield one total
# per station, which does not line up with the feature names.)
nan_counts = dyn_ds.to_array(dim="variable").isnull().sum(dim=["variable", "time"])

# Take the labels from the reduced array itself so names and counts stay aligned.
for feat, nans in zip(
    nan_counts["dynamic_features"].data.tolist(),
    nan_counts.data.tolist(),
    ):

    print(feat, nans)
airpres_hpa 730
airtemp_C_max 730
airtemp_C_mean 730
airtemp_C_min 12418
dptemp_C_max_2m 1095
dptemp_C_mean_2m 2191
dptemp_C_min_2m 730
fcst_alb 9496
lai_high_veg 10444
lai_low_veg 730
pcp_mm 730
q_cms_obs 730
solrad_wm2 730
solrad_wm2_max 730
swe_mm 730
thermrad_wm2 730
thermrad_wm2_max 6208
total_et 730
volsw_123 730
volsw_4 730
windspeedu_mps 6208
windspeedv_mps 4382

LamaHIce

[178]:
dataset = RainfallRunoff('LamaHIce', data_type='total_upstrm',
                         path=os.path.join(DATA_PATH, 'LamaHIce_daily'), verbosity=0)
print(dataset)
LamaHIce with 111 stations, 36 dynamic and 154 static features
[179]:
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
ET_ERA5L_all_basin, ET_ERA5L_unfiltered_basin, ET_rav_all_basin, ET_rav_unfiltered_basin,
PET_ERA5L_all_basin, PET_ERA5L_unfiltered_basin, PET_rav_all_basin, PET_rav_unfiltered_basin,
P_ERA5L_all_basin, P_ERA5L_unfiltered_basin, P_rav_all_basin, P_rav_unfiltered_basin, Q5_basin,
Q5_gauge, Q95_basin, Q95_gauge, Q_all_basin, Q_unfiltered_basin, VHM_no_gauge, V_no_gauge,
agr_fra_basin, area_km2, aridity_ERA5L_basin, aridity_basin, asp_mean_basin, bare_fra_basin,
baseflow_index_ladson_basin, baseflow_index_ladson_gauge, bedrk_dep_basin, clay_fra_basin,
degimpact_basin, degimpact_gauge, elev_mean_basin, elev_med_basin, elev_ran_basin, elev_std_basin,
elevation_gauge, elon_ratio_basin, forest_fra_basin, frac_snow_ERA5L_basin, frac_snow_basin,
g621_fra_basin, g701_fra_basin, g743_fra_basin, g746_fra_basin, g_area_basin, g_aspect_basin,
g_dom_NI_basin, g_frac_basin, g_lat_basin, g_lon_basin, g_max_el_basin, g_mean_el_basin,
g_min_el_basin, g_slope_basin, g_slopel20_basin, gaps_hourly_gauge, gbinn_fra_basin,
gbnew_fra_basin, gbold_fra_basin, gc_23_dom_basin, gc_23_pavr_basin, gc_23_pb_basin,
gc_23_vapy_basin, gc_23_vb_basin, gc_23_vbpy_basin, gc_23_vbsr_basin, gc_dom_basin, gc_pa_fra_basin,
gc_pb_fra_basin, gc_va_fra_basin, gc_vb_fra_basin, geometry_gauge, ggnew_fra_basin, ggold_fra_basin,
ghraun_fra_basin, glac_fra_basin, gmob_fra_basin, grav_fra_basin, gsgos_fra_basin, gsinn_fra_basin,
gsn_fra_basin, gsnew_fra_basin, gsold_fra_basin, gvf_diff_basin, gvf_max_basin, hfd_mean_basin,
hfd_mean_gauge, high_prec_du_ERA5L_basin, high_prec_du_basin, high_prec_fr_ERA5L_basin,
high_prec_fr_basin, high_prec_timing_ERA5L_basin, high_prec_timing_basin, high_q_dur_basin,
high_q_dur_gauge, high_q_freq_basin, high_q_freq_gauge, lai_diff_basin, lai_max_basin,
lake_fra_basin, lat, lc_dom_basin, lo_prec_fr_ERA5L_basin, lo_prec_fr_basin, long,
low_prec_du_ERA5L_basin, low_prec_du_basin, low_prec_timing_ERA5L_basin, low_prec_timing_basin,
low_q_dur_basin, low_q_dur_gauge, low_q_freq_basin, low_q_freq_gauge, mvert_ang_basin,
mvert_dist_basin, name_gauge, ndvi_max_basin, ndvi_min_basin, obsbeg_day_gauge, obsbeg_hr_gauge,
obsend_day_gauge, obsend_hr_gauge, oc_fra_basin, p_mean_ERA5L_basin, p_mean_basin,
p_season_ERA5L_basin, p_season_basin, pet_mean_ERA5L_basin, q_mean_basin, q_mean_gauge,
ref_et_mean_basin, river_gauge, root_dep_basin, runoff_ratio_basin, runoff_ratio_gauge,
sand_fra_basin, scrub_fra_basin, silt_fra_basin, slope_fdc_basin, slope_fdc_gauge, slope_mkm-1,
soil_poros_basin, soil_tawc_basin, stream_elas_basin, stream_elas_gauge, strm_dens_basin,
typimpact_basin, typimpact_gauge, urban_fra_basin, water_year_all_basin,
water_year_unfiltered_basin, wetl_fra_basin, zero_q_freq_gauge
[180]:
df = dataset.fetch_static_features()
print(df.shape)
(111, 154)
[181]:
print(df.isna().sum().sum())
df.isna().sum()
2013
[181]:
ET_ERA5L_all_basin             37
ET_ERA5L_unfiltered_basin      14
ET_rav_all_basin               37
ET_rav_unfiltered_basin        14
PET_ERA5L_all_basin            37
                               ..
urban_fra_basin                 0
water_year_all_basin           37
water_year_unfiltered_basin    14
wetl_fra_basin                  0
zero_q_freq_gauge              37
Length: 154, dtype: int64

Find the columns that have at least one NaN value.

[182]:
df.loc[:, (df.isna().sum()>0)]
[182]:
ET_ERA5L_all_basin ET_ERA5L_unfiltered_basin ET_rav_all_basin ET_rav_unfiltered_basin PET_ERA5L_all_basin PET_ERA5L_unfiltered_basin PET_rav_all_basin PET_rav_unfiltered_basin P_ERA5L_all_basin P_ERA5L_unfiltered_basin ... q_mean_gauge runoff_ratio_basin runoff_ratio_gauge slope_fdc_basin slope_fdc_gauge stream_elas_basin stream_elas_gauge water_year_all_basin water_year_unfiltered_basin zero_q_freq_gauge
id
79 0.640192 0.636492 0.634929 0.633861 1.295533 1.293016 0.547516 0.557275 4.648427 4.631319 ... 9.389204 1.786 1.785626 0.404 0.404174 0.507 0.506978 2000.272524 1999.000666 0.0
98 0.603482 0.593119 0.599214 0.588846 2.000104 1.942428 0.559626 0.536703 4.717848 4.647344 ... 5.951890 1.178 1.178091 0.561 0.560945 0.666 0.666312 2010.601150 2011.000632 0.0
25 1.378187 1.396363 0.668298 0.665824 4.609922 4.601343 0.529151 0.506576 5.546952 5.682186 ... 10.316638 2.346 2.345917 1.441 1.440679 0.579 0.579003 2004.386057 2003.470929 0.0
1 0.710055 0.718156 0.651314 0.655982 1.472727 1.473907 0.616380 0.603120 3.769304 3.859496 ... 4.546374 1.115 1.114759 2.715 2.715291 1.365 1.364999 2010.248973 2010.499609 0.0
34 NaN 0.436237 NaN 0.282329 NaN 0.656889 NaN 0.243403 NaN 3.560044 ... NaN NaN NaN NaN NaN NaN NaN NaN 1999.000666 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
94 NaN 0.565821 NaN 0.646134 NaN 1.009009 NaN 0.569345 NaN 2.851701 ... NaN NaN NaN NaN NaN NaN NaN NaN 2010.014082 NaN
54 0.286762 0.318574 0.428488 0.405101 0.409014 0.466285 0.490258 0.472720 4.375074 4.262709 ... 6.364638 1.014 1.014307 3.767 3.766742 0.929 0.929429 1997.000000 2000.226138 0.0
77 0.383486 0.368716 0.464558 0.457334 0.557757 0.534627 0.620087 0.598473 3.094146 3.139548 ... 2.217995 0.781 0.780813 1.384 1.383765 0.459 0.458724 2000.151013 2002.000000 0.0
80 0.649492 0.646736 0.529909 0.521922 1.126047 1.143551 0.468229 0.485154 4.652074 4.312049 ... 4.225174 0.871 0.870663 1.945 1.945021 0.913 0.913007 2008.890240 2009.224360 0.0
72 NaN 0.476955 NaN 0.385069 NaN 0.928519 NaN 0.511240 NaN 3.048227 ... NaN NaN NaN NaN NaN NaN NaN NaN 2007.389354 NaN

111 rows × 53 columns

[183]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[183]:
ET_ERA5L_all_basin             37
ET_ERA5L_unfiltered_basin      14
ET_rav_all_basin               37
ET_rav_unfiltered_basin        14
PET_ERA5L_all_basin            37
PET_ERA5L_unfiltered_basin     14
PET_rav_all_basin              37
PET_rav_unfiltered_basin       14
P_ERA5L_all_basin              37
P_ERA5L_unfiltered_basin       14
P_rav_all_basin                37
P_rav_unfiltered_basin         14
Q5_basin                       37
Q5_gauge                       37
Q95_basin                      37
Q95_gauge                      37
Q_all_basin                    37
Q_unfiltered_basin             14
baseflow_index_ladson_basin    37
baseflow_index_ladson_gauge    37
g_aspect_basin                 47
g_lat_basin                    47
g_lon_basin                    47
g_max_el_basin                 47
g_mean_el_basin                47
g_min_el_basin                 47
g_slope_basin                  47
g_slopel20_basin               47
gaps_hourly_gauge              35
hfd_mean_basin                 42
hfd_mean_gauge                 42
high_prec_timing_basin          4
high_q_dur_basin               67
high_q_dur_gauge               67
high_q_freq_basin              67
high_q_freq_gauge              67
low_prec_timing_ERA5L_basin     2
low_prec_timing_basin           1
low_q_dur_basin                70
low_q_dur_gauge                70
low_q_freq_basin               70
low_q_freq_gauge               70
q_mean_basin                   37
q_mean_gauge                   37
runoff_ratio_basin             37
runoff_ratio_gauge             37
slope_fdc_basin                37
slope_fdc_gauge                37
stream_elas_basin              37
stream_elas_gauge              37
water_year_all_basin           37
water_year_unfiltered_basin    14
zero_q_freq_gauge              37
dtype: int64
[184]:
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
10m_wind_u, 10m_wind_u_rav, 10m_wind_v, 10m_wind_v_rav, 2m_dp_temp_max, 2m_dp_temp_mean,
2m_dp_temp_min, 2m_qv_rav, 2m_temp_rav, airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_mean_2m,
fcst_alb, grdflx_rav, lai_high_veg, lai_low_veg, pcp_mm, pet_mm, prec_carra, prec_rav, q_cms_obs,
ref_et_mm, surf_dwn_solar_rad_rav, surf_dwn_therm_rad_rav, surf_net_solar_rad_max,
surf_net_solar_rad_mean, surf_net_therm_rad_max, surf_net_therm_rad_mean, surf_outg_therm_rad_rav,
surf_press, surf_press_rav, swe, total_et, total_et_rav, volsw_123, volsw_4

Print the total number of NaNs for each dynamic feature (not executed here): `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[185]:
# Disabled cell: `dyn_ds` is not fetched for this dataset in the cells above, so
# running this as-is would reuse the `dyn_ds` left over from an earlier dataset.
# NOTE(review): `to_array()` stacks the stations on a leading "variable" axis, so
# per-feature totals require reducing over "variable" and "time".
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

HYSETS

[186]:
dataset = RainfallRunoff('HYSETS', path=DATA_PATH, verbosity=0)
print(dataset)
HYSETS with 14425 stations, 20 dynamic and 30 static features
[187]:
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Drainage_Area_GSIM_km2, Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries,
Flag_Land_Use_Extraction, Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction,
Gravelius, Hydrometric_station_latitude, Hydrometric_station_longitude, Land_Use_Crops_frac,
Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac, Land_Use_Snow_Ice_frac,
Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name, Official_ID, Perimeter,
Permeability_logk_m2, Porosity_frac, Source, area_km2, lat, long, slope_degrees
[188]:
df = dataset.fetch_static_features()
print(df.shape)
(14425, 30)
[189]:
print(df.isna().sum().sum())
df.isna().sum()
20179
[189]:
Source                               0
Name                                 0
Official_ID                          0
lat                                  0
long                                 0
area_km2                             0
Drainage_Area_GSIM_km2           13561
Flag_GSIM_boundaries                 0
Flag_Artificial_Boundaries           0
Elevation_m                          6
slope_degrees                        6
Gravelius                         1633
Perimeter                         1633
Flag_Shape_Extraction                0
Aspect_deg                           6
Flag_Terrain_Extraction              0
Land_Use_Forest_frac                13
Land_Use_Grass_frac                 13
Land_Use_Wetland_frac               13
Land_Use_Water_frac                 13
Land_Use_Urban_frac                 13
Land_Use_Shrubs_frac                13
Land_Use_Crops_frac                 13
Land_Use_Snow_Ice_frac              13
Flag_Land_Use_Extraction             0
Permeability_logk_m2              1615
Porosity_frac                     1615
Flag_Subsoil_Extraction              0
Hydrometric_station_latitude         0
Hydrometric_station_longitude        0
dtype: int64

Find the columns that have at least one NaN value.

[190]:
df.loc[:, (df.isna().sum()>0)]
[190]:
Drainage_Area_GSIM_km2 Elevation_m slope_degrees Gravelius Perimeter Aspect_deg Land_Use_Forest_frac Land_Use_Grass_frac Land_Use_Wetland_frac Land_Use_Water_frac Land_Use_Urban_frac Land_Use_Shrubs_frac Land_Use_Crops_frac Land_Use_Snow_Ice_frac Permeability_logk_m2 Porosity_frac
Watershed_ID
1 NaN 362.3 3.5329 2.7834 1194.505 130.4023 0.7869 0.0147 0.0645 0.0258 0.0089 0.0749 0.0242 0.0 -14.719327 0.180905
2 NaN 353.4 4.6633 2.0656 269.164 91.7329 0.8452 0.0102 0.0228 0.0219 0.0174 0.0410 0.0414 0.0 -14.056491 0.206450
3 2693.814 293.3 4.4690 2.0620 381.994 223.9510 0.8207 0.0093 0.0032 0.0487 0.0230 0.0351 0.0600 0.0 -14.537390 0.165357
4 NaN 276.5 4.1819 2.4682 413.839 120.7400 0.6837 0.0226 0.1024 0.0630 0.0115 0.0641 0.0528 0.0 -14.687869 0.170597
5 NaN 201.8 2.8061 NaN NaN 56.8902 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14421 NaN 1987.9 17.1982 2.0752 208.852 28.9860 0.5356 0.0330 0.0000 0.0000 0.0202 0.0170 0.3941 0.0 -13.160658 0.096755
14422 NaN 769.5 6.5921 1.5715 325.714 110.5607 0.1348 0.3106 0.0025 0.0024 0.0305 0.0300 0.4874 0.0 -12.698509 0.119993
14423 NaN 1883.2 14.7005 2.5953 1621.229 224.3422 0.8674 0.0437 0.0000 0.0026 0.0027 0.0429 0.0408 0.0 -12.976926 0.090284
14424 NaN 1791.2 12.1021 2.4269 1288.932 184.5177 0.7720 0.1524 0.0000 0.0013 0.0029 0.0474 0.0241 0.0 -12.968686 0.094042
14425 NaN 2179.1 5.9444 2.0769 165.762 112.0832 0.1605 0.5639 0.0000 0.0012 0.0091 0.1116 0.1536 0.0 -12.792099 0.168963

14425 rows × 16 columns

[191]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[191]:
Drainage_Area_GSIM_km2    13561
Elevation_m                   6
slope_degrees                 6
Gravelius                  1633
Perimeter                  1633
Aspect_deg                    6
Land_Use_Forest_frac         13
Land_Use_Grass_frac          13
Land_Use_Wetland_frac        13
Land_Use_Water_frac          13
Land_Use_Urban_frac          13
Land_Use_Shrubs_frac         13
Land_Use_Crops_frac          13
Land_Use_Snow_Ice_frac       13
Permeability_logk_m2       1615
Porosity_frac              1615
dtype: int64
[192]:
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpa, airtemp_C_2m_max, airtemp_C_2m_min, cloudcover, dptemp_C_mean_2m, evap_mm,
evap_mm_snow, lwdownrad_wm2, lwnetrad_wm2, pcp_mm, q_cms_obs, q_mmd_obs, snowdensity_kgm3,
snowfall_mm, snowmelt_mm, solrad_wm2, solradnet_wm2, swe_mm, windspeedu_mps, windspeedv_mps

Print the total number of NaNs for each dynamic feature (not executed here): `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[193]:
# Disabled cell: `dyn_ds` is not fetched for this dataset in the cells above, so
# running this as-is would reuse the `dyn_ds` left over from an earlier dataset.
# NOTE(review): `to_array()` stacks the stations on a leading "variable" axis, so
# per-feature totals require reducing over "variable" and "time".
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

GRDCCaravan

[194]:
dataset = RainfallRunoff('GRDCCaravan', path=DATA_PATH, verbosity=0)
print(dataset)
GRDCCaravan with 5357 stations, 40 dynamic and 211 static features
[195]:
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area_fraction_used_for_aggregation,
area_km2, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj, cmi_ix_s01, cmi_ix_s02,
cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08, cmi_ix_s09, cmi_ix_s10,
cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn, dis_m3_pmx, dis_m3_pyr,
dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj, fmh_cl_smj, for_pc_sse,
frac_snow, gauge_name, gdp_ud_sav, gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02,
glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10,
glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18,
glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93,
high_prec_dur, high_prec_freq, inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lat,
lit_cl_smj, lka_pc_sse, lkv_mc_usu, long, low_prec_dur, low_prec_freq, moisture_index, nli_ix_sav,
p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02, pet_mm_s03, pet_mm_s04, pet_mm_s05,
pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10, pet_mm_s11, pet_mm_s12, pet_mm_syr,
pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04, pnv_pc_s05, pnv_pc_s06, pnv_pc_s07,
pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12, pnv_pc_s13, pnv_pc_s14, pnv_pc_s15,
pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03, pre_mm_s04, pre_mm_s05, pre_mm_s06,
pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11, pre_mm_s12, pre_mm_syr, prm_pc_sse,
pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu, run_mm_syr, seasonality, sgr_dk_sav,
slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02, snw_pc_s03, snw_pc_s04, snw_pc_s05,
snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10, snw_pc_s11, snw_pc_s12, snw_pc_smx,
snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03, swc_pc_s04, swc_pc_s05, swc_pc_s06,
swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11, swc_pc_s12, swc_pc_syr, tbi_cl_smj,
tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04, tmp_dc_s05, tmp_dc_s06, tmp_dc_s07,
tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12, tmp_dc_smn, tmp_dc_smx, tmp_dc_syr,
urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03, wet_pc_s04, wet_pc_s05, wet_pc_s06,
wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[196]:
df = dataset.fetch_static_features()
print(df.shape)
(5357, 211)
[197]:
print(df.isna().sum().sum())
df.isna().sum()
0
[197]:
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
aet_mm_s05    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 211, dtype: int64

Find the columns that have at least one NaN value.

[198]:
# Show the static-feature columns that contain at least one NaN.
# A bare expression nested inside an `if` block is not auto-displayed by the
# notebook (only the last top-level expression of a cell is), so the frame is
# rendered explicitly with `display`, which the IPython kernel provides.
nan_mask = df.isna().any()
if nan_mask.any():
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
No NaN values
[199]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[199]:
Series([], dtype: float64)
[200]:
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_mean_2m, dewpoint_temperature_2m_max,
dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min, pcp_mm, potential_evaporation_sum,
q_cms_obs, q_mmd_obs, snow_depth_water_equivalent_max, snow_depth_water_equivalent_mean,
snow_depth_water_equivalent_min, surface_net_solar_radiation_max, surface_net_solar_radiation_mean,
surface_net_solar_radiation_min, surface_net_thermal_radiation_max,
surface_net_thermal_radiation_mean, surface_net_thermal_radiation_min, surface_pressure_max,
surface_pressure_mean, surface_pressure_min, u_component_of_wind_10m_max,
u_component_of_wind_10m_mean, u_component_of_wind_10m_min, v_component_of_wind_10m_max,
v_component_of_wind_10m_mean, v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max,
volumetric_soil_water_layer_1_mean, volumetric_soil_water_layer_1_min,
volumetric_soil_water_layer_2_max, volumetric_soil_water_layer_2_mean,
volumetric_soil_water_layer_2_min, volumetric_soil_water_layer_3_max,
volumetric_soil_water_layer_3_mean, volumetric_soil_water_layer_3_min,
volumetric_soil_water_layer_4_max, volumetric_soil_water_layer_4_mean,
volumetric_soil_water_layer_4_min

Print the total number of NaNs for each dynamic feature for GRDCCaravan (not executed here): `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[201]:
# Disabled cell: `dyn_ds` is not fetched for this dataset in the cells above, so
# running this as-is would reuse the `dyn_ds` left over from an earlier dataset.
# NOTE(review): `to_array()` stacks the stations on a leading "variable" axis, so
# per-feature totals require reducing over "variable" and "time".
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

CCAM

[202]:
dataset = RainfallRunoff('CCAM', path=DATA_PATH, verbosity=0)
print(dataset)
CCAM with 102 stations, 16 dynamic and 124 static features
[203]:
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area_km2, barren, bdticm, bldfie_sl1, bldfie_sl2, bldfie_sl3, bldfie_sl4, bldfie_sl5, bldfie_sl6,
bldfie_sl7, cecsol_sl1, cecsol_sl2, cecsol_sl3, cecsol_sl4, cecsol_sl5, cecsol_sl6, cecsol_sl7,
circulatory_ratio, clay, closed_shrubland, compactness_coefficient, cropland,
cropland_natural_vegetaion, deciduous_broadleaf_tree, deciduous_needleleaf_tree, elev,
elongation_ratio, ev, evergreen_broadleaf_tree, evergreen_needleleaf_tree, evp_mean, form_factor,
frac_snow_daily, geol_permeability, geol_porosity, grassland, grav, gst_mean, high_prec_dur,
high_prec_freq, high_prec_timing, ig, lai_dif, lai_max, lat, length, length_continuous_runoff,
log_k_s_l1, log_k_s_l2, log_k_s_l3, log_k_s_l4, log_k_s_l5, log_k_s_l6, long, low_prec_dur,
low_prec_freq, low_prec_timing, mixed_forest, mt, nd, ndvi_mean, open_shrubland, orcdrc_sl1,
orcdrc_sl2, orcdrc_sl3, orcdrc_sl4, orcdrc_sl5, orcdrc_sl6, orcdrc_sl7, pa, pb, pdep,
permanent_wetland, pet_mean, phihox_sl1, phihox_sl2, phihox_sl3, phihox_sl4, phihox_sl5, phihox_sl6,
phihox_sl7, pi, pop, pop_dnsty, por, pre_mean, prs_mean, py, rhu_mean, root_depth_50, root_depth_99,
sand, savanna, sc, shape_factor, silt, slope_mkm-1, sm, snow_and_ice, som, ss, ssd_mean, su,
tem_mean, theta_s_l1, theta_s_l2, theta_s_l3, theta_s_l4, theta_s_l5, theta_s_l6, tksatu_l1,
tksatu_l2, tksatu_l3, tksatu_l4, tksatu_l5, tksatu_l6, urban_and_built-up_land, va, vb, vi,
water_bodies, wb, win_mean, woody_savanna
[204]:
df = dataset.fetch_static_features()
print(df.shape)
(102, 124)
[205]:
print(df.isna().sum().sum())
df.isna().sum()
0
[205]:
area_km2         0
barren           0
bdticm           0
bldfie_sl1       0
bldfie_sl2       0
                ..
vi               0
water_bodies     0
wb               0
win_mean         0
woody_savanna    0
Length: 124, dtype: int64

Find the columns that have at least one NaN value.

[206]:
# Show the static-feature columns that contain at least one NaN.
# A bare expression nested inside an `if` block is not auto-displayed by the
# notebook (only the last top-level expression of a cell is), so the frame is
# rendered explicitly with `display`, which the IPython kernel provides.
nan_mask = df.isna().any()
if nan_mask.any():
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
No NaN values
[207]:
df.loc[:, (df.isna().sum()>0)].isna().sum()
[207]:
Series([], dtype: float64)
[208]:
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, evap_mm, gtemp_C, gtemp_C_max, gtemp_C_min, pcp_mm,
prs_max, prs_mean, prs_min, q_cms_obs, rh_%, ssd_hr, windspeed_mps, windspeed_mps_max

Print the total number of NaNs for each dynamic feature (not executed here): `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[209]:
# Disabled cell: `dyn_ds` is not fetched for this dataset in the cells above, so
# running this as-is would reuse the `dyn_ds` left over from an earlier dataset.
# NOTE(review): `to_array()` stacks the stations on a leading "variable" axis, so
# per-feature totals require reducing over "variable" and "time".
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["variable", "time"]).data.tolist()
#     ):

#     print(feat, nans)

Japan

[210]:
dataset = RainfallRunoff('Japan', path=DATA_PATH, verbosity=0)
print(dataset)
Japan with 751 stations, 27 dynamic and 35 static features

The static features of Japan are the same as those of GSHA.

[211]:
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[212]:
df = dataset.fetch_static_features()
print(df.shape)
(751, 35)
[213]:
print(df.isna().sum().sum())
df.isna().sum()
265
[213]:
EVP_uncertainty(%)      78
HYRIV_ID                 0
LRAD_uncertainty(%)     66
P_uncertainty(%)         0
SRAD_uncertainty(%)      0
T_uncertainty(%)         0
agency                   0
area_km2                 0
cly_pc_uav               0
ele_mt_uav               0
ero_kh_uav               0
gla_pc_use               0
glc_cl_cmj               0
gwt_cm_cav               0
inu_pc_ult               0
lat                      0
lit_cl_cmj               0
long                     0
pet_uncertainty(%)     121
pnv_cl_cmj               0
prm_pc_use               0
sgr_dk_rav               0
slope_degrees            0
slt_pc_uav               0
snd_pc_uav               0
wet_pc_u01               0
wet_pc_u02               0
wet_pc_u03               0
wet_pc_u04               0
wet_pc_u05               0
wet_pc_u06               0
wet_pc_u07               0
wet_pc_u08               0
wet_pc_u09               0
wind_uncertainty(%)      0
dtype: int64

find those columns which have at least one NaN value

[214]:
# Show the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not auto-displayed by
# Jupyter (only the last top-level expression of a cell is), so the selection
# is printed explicitly; .head() keeps the output short.
has_nan = df.isna().any()
if has_nan.any():
    print(df.loc[:, has_nan].head())
else:
    print('No NaN values')
[215]:
# Per-column NaN counts, restricted to the columns that have at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[215]:
EVP_uncertainty(%)      78
LRAD_uncertainty(%)     66
pet_uncertainty(%)     121
dtype: int64
[216]:
# List the dynamic feature names, sorted and wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(dynamic_features)))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature: `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[217]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Ireland

[218]:
# Open the Ireland dataset from the local data directory and print its summary line.
# verbosity=0 presumably suppresses the loader's progress messages.
dataset = RainfallRunoff(
    'Ireland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Ireland with 464 stations, 10 dynamic and 214 static features

The static features of Ireland are the same as those of EStreams.

[219]:
# List the static feature names, sorted and wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(static_features)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[220]:
# Fetch the static features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(464, 214)
[221]:
# Print the total number of missing entries, then show the per-column counts
# as the cell's output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
9313
[221]:
static_features
area_flag            0
area_km2             0
area_official       16
area_rel            16
aridity            208
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        208
Length: 214, dtype: int64

find those columns which have at least one NaN value

[222]:
# Show the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not auto-displayed by
# Jupyter (only the last top-level expression of a cell is), so the selection
# is printed explicitly; .head() keeps the output short.
has_nan = df.isna().any()
if has_nan.any():
    print(df.loc[:, has_nan].head())
else:
    print('No NaN values')
[223]:
# Per-column NaN counts, restricted to the columns that have at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[223]:
static_features
area_official           16
area_rel                16
aridity                208
baseflow_index         208
bedrk_dep                1
                      ...
soil_tawc_p90            1
start_date             137
start_date_climatic    208
start_date_hydro       204
zero_q_freq            208
Length: 109, dtype: int64
[224]:
# List the dynamic feature names, sorted and wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(dynamic_features)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Print the total number of NaNs for each dynamic feature: `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[225]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Finland

[226]:
# Open the Finland dataset from the local data directory and print its summary line.
# verbosity=0 presumably suppresses the loader's progress messages.
dataset = RainfallRunoff(
    'Finland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Finland with 669 stations, 10 dynamic and 214 static features

The static features of Finland are the same as those of EStreams.

[227]:
# List the static feature names, sorted and wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(static_features)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[228]:
# Fetch the static features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(669, 214)
[229]:
# Print the total number of missing entries, then show the per-column counts
# as the cell's output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
10509
[229]:
static_features
area_flag            0
area_km2             0
area_official      126
area_rel           126
aridity            176
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        196
Length: 214, dtype: int64

find those columns which have at least one NaN value

[230]:
# Show the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not auto-displayed by
# Jupyter (only the last top-level expression of a cell is), so the selection
# is printed explicitly; .head() keeps the output short.
has_nan = df.isna().any()
if has_nan.any():
    print(df.loc[:, has_nan].head())
else:
    print('No NaN values')
[231]:
# Per-column NaN counts, restricted to the columns that have at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[231]:
static_features
area_official          126
area_rel               126
aridity                176
baseflow_index         199
dam_yr_first           590
                      ...
soil_tawc_p90            1
start_date               6
start_date_climatic    176
start_date_hydro       196
zero_q_freq            196
Length: 109, dtype: int64
[232]:
# List the dynamic feature names, sorted and wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(dynamic_features)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Italy

[233]:
# Open the Italy dataset from the local data directory and print its summary line.
# verbosity=0 presumably suppresses the loader's progress messages.
dataset = RainfallRunoff(
    'Italy',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Italy with 294 stations, 10 dynamic and 214 static features

The static features of Italy are the same as those of EStreams.

[234]:
# List the static feature names, sorted and wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(static_features)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[235]:
# Fetch the static features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(294, 214)
[236]:
# Print the total number of missing entries, then show the per-column counts
# as the cell's output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
3695
[236]:
static_features
area_flag            0
area_km2             0
area_official      106
area_rel           106
aridity             46
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq         86
Length: 214, dtype: int64

find those columns which have at least one NaN value

[237]:
# Show the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not auto-displayed by
# Jupyter (only the last top-level expression of a cell is), so the selection
# is printed explicitly; .head() keeps the output short.
has_nan = df.isna().any()
if has_nan.any():
    print(df.loc[:, has_nan].head())
else:
    print('No NaN values')
[238]:
# Per-column NaN counts, restricted to the columns that have at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[238]:
static_features
area_official                106
area_rel                     106
aridity                       46
baseflow_index                87
dam_yr_first                 265
dam_yr_last                  265
duplicated_suspect           277
elevation                    219
end_date_climatic             46
end_date_hydro                85
frac_snow                     46
hfd_mean                      98
hfd_std                      105
hp_dur                        46
hp_freq                       46
hp_time                       46
hq_dur                       107
hq_freq                      107
lp_dur                        46
lp_freq                       46
lp_time                       46
lq_dur                       112
lq_freq                      112
num_years_climatic            45
num_years_hydro               45
p_mean                        46
p_seasonality                 46
pet_mean                      46
q_5                           86
q_95                          86
q_elas_Sankarasubramanian     86
q_mean                        86
q_runoff_ratio                86
res_tot_sto                  265
slope_no_unit                 90
start_date_climatic           46
start_date_hydro              85
zero_q_freq                   86
dtype: int64
[239]:
# List the dynamic feature names, sorted and wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(dynamic_features)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Poland

[240]:
# Open the Poland dataset from the local data directory and print its summary line.
# verbosity=0 presumably suppresses the loader's progress messages.
dataset = RainfallRunoff(
    'Poland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Poland with 1287 stations, 10 dynamic and 214 static features

The static features of Poland are the same as those of EStreams.

[241]:
# List the static feature names, sorted and wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(static_features)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[242]:
# Fetch the static features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(1287, 214)
[243]:
# Print the total number of missing entries, then show the per-column counts
# as the cell's output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
15798
[243]:
static_features
area_flag            0
area_km2             0
area_official        6
area_rel             6
aridity            270
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        270
Length: 214, dtype: int64

find those columns which have at least one NaN value

[244]:
# Show the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not auto-displayed by
# Jupyter (only the last top-level expression of a cell is), so the selection
# is printed explicitly; .head() keeps the output short.
has_nan = df.isna().any()
if has_nan.any():
    print(df.loc[:, has_nan].head())
else:
    print('No NaN values')
[245]:
# Per-column NaN counts, restricted to the columns that have at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[245]:
static_features
area_official                   6
area_rel                        6
aridity                       270
baseflow_index                270
dam_yr_first                 1099
dam_yr_last                  1099
duplicated_suspect           1276
elevation                    1287
end_date                      210
end_date_climatic             270
end_date_hydro                270
frac_snow                     270
hfd_mean                      277
hfd_std                       283
hp_dur                        270
hp_freq                       270
hp_time                       270
hq_dur                        485
hq_freq                       485
lp_dur                        270
lp_freq                       270
lp_time                       270
lq_dur                        507
lq_freq                       507
num_days_gaps                 210
num_years_climatic            270
num_years_hydro               270
p_mean                        270
p_seasonality                 270
pet_mean                      270
q_5                           270
q_95                          270
q_elas_Sankarasubramanian     270
q_mean                        270
q_runoff_ratio                270
res_tot_sto                  1101
slope_no_unit                 270
start_date                    210
start_date_climatic           270
start_date_hydro              270
zero_q_freq                   270
dtype: int64
[246]:
# List the dynamic feature names, sorted and wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(dynamic_features)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Portugal

[247]:
# Open the Portugal dataset from the local data directory and print its summary line.
# verbosity=0 presumably suppresses the loader's progress messages.
dataset = RainfallRunoff(
    'Portugal',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Portugal with 280 stations, 10 dynamic and 214 static features

The static features of Portugal are the same as those of EStreams.

[248]:
# List the static feature names, sorted and wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(static_features)))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[249]:
# Fetch the static features table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(280, 214)
[250]:
# Print the total number of missing entries, then show the per-column counts
# as the cell's output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2172
[250]:
static_features
area_flag           0
area_km2            0
area_official      25
area_rel           25
aridity            33
                   ..
steep_area_fra      0
strm_dens           0
tot_area            0
watershed_group     0
zero_q_freq        34
Length: 214, dtype: int64

find those columns which have at least one NaN value

[251]:
# Show the columns that contain at least one NaN value.
# A bare expression nested inside an `if` block is not auto-displayed by
# Jupyter (only the last top-level expression of a cell is), so the selection
# is printed explicitly; .head() keeps the output short.
has_nan = df.isna().any()
if has_nan.any():
    print(df.loc[:, has_nan].head())
else:
    print('No NaN values')
[252]:
# Per-column NaN counts, restricted to the columns that have at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[252]:
static_features
area_official                 25
area_rel                      25
aridity                       33
baseflow_index                94
dam_yr_first                 221
dam_yr_last                  221
duplicated_suspect           280
end_date                       1
end_date_climatic             33
end_date_hydro                34
frac_snow                     33
hfd_mean                      38
hfd_std                       41
hp_dur                        33
hp_freq                       33
hp_time                       33
hq_dur                        36
hq_freq                       36
lp_dur                        33
lp_freq                       33
lp_time                       33
lq_dur                        36
lq_freq                       36
num_days_gaps                  1
num_years_climatic            33
num_years_hydro               33
p_mean                        33
p_seasonality                 33
pet_mean                      33
q_5                           34
q_95                          34
q_elas_Sankarasubramanian     34
q_mean                        34
q_runoff_ratio                34
res_tot_sto                  221
slope_no_unit                 92
start_date                     1
start_date_climatic           33
start_date_hydro              34
zero_q_freq                   34
dtype: int64
[253]:
# List the dynamic feature names, sorted and wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(dynamic_features)))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Print the total number of NaNs for each dynamic feature: `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`

[254]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Simbi

[255]:
# Open the Simbi dataset from the local data directory and print its summary line.
# verbosity=0 presumably suppresses the loader's progress messages.
dataset = RainfallRunoff(
    'Simbi',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Simbi with 24 stations, 3 dynamic and 232 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:320: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[256]:
# List the static feature names, sorted and wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # sorts the returned list in place
print(textwrap.TextWrapper(width=100).fill(", ".join(static_features)))
Alluvial aquifers with free water, Alluvial aquifers with partly confined water, Alluvium & detrital
materials_geol, Andesites & rhyodacites_geol, Aridity_mon_arid, BFI1_d, BFI2_d, BFI3_d, BFI_d,
Basalt_geol, Beaches & dunes_lc_98, Carb_Rocks_Perc, Carbonate aquifers with marl intercalation,
Closed Shrubland_lc_95, Continuous urban_lc_98, Cropland_lc_95, Crystalline formation,
Cumul_Freq_1%, Cumul_Freq_10%, Cumul_Freq_100%, Cumul_Freq_11%, Cumul_Freq_12%, Cumul_Freq_13%,
Cumul_Freq_14%, Cumul_Freq_15%, Cumul_Freq_16%, Cumul_Freq_17%, Cumul_Freq_18%, Cumul_Freq_19%,
Cumul_Freq_2%, Cumul_Freq_20%, Cumul_Freq_21%, Cumul_Freq_22%, Cumul_Freq_23%, Cumul_Freq_24%,
Cumul_Freq_25%, Cumul_Freq_26%, Cumul_Freq_27%, Cumul_Freq_28%, Cumul_Freq_29%, Cumul_Freq_3%,
Cumul_Freq_30%, Cumul_Freq_31%, Cumul_Freq_32%, Cumul_Freq_33%, Cumul_Freq_34%, Cumul_Freq_35%,
Cumul_Freq_36%, Cumul_Freq_37%, Cumul_Freq_38%, Cumul_Freq_39%, Cumul_Freq_4%, Cumul_Freq_40%,
Cumul_Freq_41%, Cumul_Freq_42%, Cumul_Freq_43%, Cumul_Freq_44%, Cumul_Freq_45%, Cumul_Freq_46%,
Cumul_Freq_47%, Cumul_Freq_48%, Cumul_Freq_49%, Cumul_Freq_5%, Cumul_Freq_50%, Cumul_Freq_51%,
Cumul_Freq_52%, Cumul_Freq_53%, Cumul_Freq_54%, Cumul_Freq_55%, Cumul_Freq_56%, Cumul_Freq_57%,
Cumul_Freq_58%, Cumul_Freq_59%, Cumul_Freq_6%, Cumul_Freq_60%, Cumul_Freq_61%, Cumul_Freq_62%,
Cumul_Freq_63%, Cumul_Freq_64%, Cumul_Freq_65%, Cumul_Freq_66%, Cumul_Freq_67%, Cumul_Freq_68%,
Cumul_Freq_69%, Cumul_Freq_7%, Cumul_Freq_70%, Cumul_Freq_71%, Cumul_Freq_72%, Cumul_Freq_73%,
Cumul_Freq_74%, Cumul_Freq_75%, Cumul_Freq_76%, Cumul_Freq_77%, Cumul_Freq_78%, Cumul_Freq_79%,
Cumul_Freq_8%, Cumul_Freq_80%, Cumul_Freq_81%, Cumul_Freq_82%, Cumul_Freq_83%, Cumul_Freq_84%,
Cumul_Freq_85%, Cumul_Freq_86%, Cumul_Freq_87%, Cumul_Freq_88%, Cumul_Freq_89%, Cumul_Freq_9%,
Cumul_Freq_90%, Cumul_Freq_91%, Cumul_Freq_92%, Cumul_Freq_93%, Cumul_Freq_94%, Cumul_Freq_95%,
Cumul_Freq_96%, Cumul_Freq_97%, Cumul_Freq_98%, Cumul_Freq_99%, Deciduous Broadleaf Forest_lc_95,
Deciduous Needleleaf Forest_lc_95, Dense agricultural crops_lc_98, Dense agroforestry systems_lc_98,
Diorite & tonalite_geol, Discontinuous urban_lc_98, Dominant pastures_lc_98, ETP_5_mon_q5,
ETP_95_mon_q95, ETP_mon_avg, Evergreen Broadleaf Forest_lc_95, Evergreen Needleleaf Forest_lc_95,
Fissured & partitioned carbonate aquifers, Flysch & sandstone & limestone_geol, Forest_lc_98,
Grassland_lc_95, Gravelius, Hard limestone_geol, Highly permeable fissured & porous carbonate
aquifers, Industrial areas_lc_98, Karst aquifer, Lat_Exu, Lon_Exu, Low permeability sedimentary
formation, Magma_Perc, Mangroves_lc_98, Marl & marly limestone_geol, Marl & sand_geol, Marly
limestone_geol, Max_Elv, Medium-density agricultural crops_lc_98, Min_Elv, Mixed Forest_lc_95, More
productive alluvial area, Open Shrubland_lc_95, P_5_mon_q5, P_95_mon_q95, P_max10_mon_QMXA10,
P_min5_mon_QMNA5, P_mon_avg, Pasture with other presence_lc_98, Ports & airports_lc_98, Q1_5_mon_q5,
Q1_95_mon_q95, Q1_max10_mon_QMXA10, Q1_min5_mon_QMNA5, Q1_mm_d_hq_dur, Q1_mm_d_hq_freq,
Q1_mm_d_lq_dur, Q1_mm_d_lq_freq, Q1_mm_d_mean, Q1_mm_d_q5, Q1_mm_d_q95, Q1_mon_avg, Q2_5_mon_q5,
Q2_95_mon_q95, Q2_max10_mon_QMXA10, Q2_min5_mon_QMNA5, Q2_mm_d_hq_dur, Q2_mm_d_hq_freq,
Q2_mm_d_lq_dur, Q2_mm_d_lq_freq, Q2_mm_d_mean, Q2_mm_d_q5, Q2_mm_d_q95, Q2_mon_avg, Q3_5_mon_q5,
Q3_95_mon_q95, Q3_max10_mon_QMXA10, Q3_min5_mon_QMNA5, Q3_mm_d_hq_dur, Q3_mm_d_hq_freq,
Q3_mm_d_lq_dur, Q3_mm_d_lq_freq, Q3_mm_d_mean, Q3_mm_d_q5, Q3_mm_d_q95, Q3_mon_avg, Q_5_mon_q5,
Q_95_mon_q95, Q_max10_mon_QMXA10, Q_min5_mon_QMNA5, Q_mm_d_hq_dur, Q_mm_d_hq_freq, Q_mm_d_lq_dur,
Q_mm_d_lq_freq, Q_mm_d_mean, Q_mm_d_q5, Q_mm_d_q95, Q_mon_avg, Quarry_lc_98, River beds & recent
alluvium_lc_98, Rock outcrops & bare soil_lc_98, Runoff_Ratio_mon_arid, Saline areas_lc_98,
Savannahs with other presence_lc_98, Savannahs_lc_98, Sd_Elv, Sedim_Perc, Stream_density,
Temp_5_mon_q5, Temp_95_mon_q95, Temp_mon_avg, Ultrabasic rocks_geol, Urban_lc_95, Volcano-
sedimentary rock_geol, Water plan_lc_98, Water_lc_95, Wetlands_lc_98, Wooded Grassland_lc_95,
Woodland_lc_95, area_km2, lat, long, slope_degrees
[257]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(24, 232)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:320: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:320: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[258]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
96
[258]:
Alluvial aquifers with free water               0
Alluvial aquifers with partly confined water    0
Alluvium & detrital materials_geol              0
Andesites & rhyodacites_geol                    0
Aridity_mon_arid                                0
                                               ..
Woodland_lc_95                                  0
area_km2                                        0
lat                                             0
long                                            0
slope_degrees                                   0
Length: 232, dtype: int64

find those columns which have at least one NaN value

[259]:
# Show only the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)]
[259]:
BFI1_d BFI2_d BFI3_d BFI_d Q1_mm_d_hq_dur Q1_mm_d_hq_freq Q1_mm_d_lq_dur Q1_mm_d_lq_freq Q1_mm_d_mean Q1_mm_d_q5 ... Q3_mm_d_mean Q3_mm_d_q5 Q3_mm_d_q95 Q_mm_d_hq_dur Q_mm_d_hq_freq Q_mm_d_lq_dur Q_mm_d_lq_freq Q_mm_d_mean Q_mm_d_q5 Q_mm_d_q95
001 0.46 0.68 0.55 0.49 2.12 2.43 27.62 34.19 1.16 0.2 ... 1.18 0.2 3.10 1.62 0.86 0.00 0.00 1.23 0.4 3.10
004 0.59 0.38 0.42 0.38 1.98 5.38 0.00 0.00 2.26 0.6 ... 2.23 0.3 8.15 2.00 5.00 0.00 0.00 1.97 0.6 4.80
006 0.47 0.66 0.61 0.50 1.90 5.52 0.00 0.00 1.50 0.5 ... 1.36 0.4 3.50 1.89 1.00 4.00 0.80 1.35 0.4 3.60
007 0.49 0.53 0.50 0.47 2.65 5.05 19.03 29.00 1.91 0.3 ... 2.04 0.5 5.60 2.33 2.50 5.51 16.86 2.08 0.5 5.50
008 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
010 0.29 0.21 0.31 0.32 6.08 20.86 23.73 117.52 2.10 0.2 ... 2.19 0.1 7.10 2.85 8.23 9.95 35.23 2.61 0.4 6.70
023 0.16 0.34 0.29 0.20 3.65 20.14 18.33 89.05 1.87 0.2 ... 1.81 0.2 6.70 1.91 7.79 5.34 24.79 1.98 0.3 5.58
024 0.38 0.42 0.39 0.42 3.38 15.14 0.00 0.00 1.37 0.3 ... 1.38 0.4 4.50 1.75 2.75 18.19 109.00 1.13 0.1 3.70
029 0.38 0.28 0.33 0.39 2.43 6.95 34.18 61.86 2.31 0.2 ... 2.13 0.1 6.30 1.56 2.71 0.00 0.00 2.30 0.7 4.50
036 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
037 0.24 0.47 0.34 0.41 9.02 36.10 83.00 158.10 0.61 0.0 ... 0.71 0.0 2.90 1.33 0.38 12.67 62.08 1.03 0.2 2.90
041 0.39 0.61 0.52 0.39 6.14 14.90 130.76 105.86 0.89 0.0 ... 0.93 0.0 3.20 0.00 0.00 14.83 57.07 1.05 0.2 3.00
044 0.32 0.28 0.36 0.25 7.22 40.24 70.33 160.76 1.05 0.0 ... 0.89 0.0 4.10 1.38 9.86 12.38 15.57 1.36 0.2 4.46
045 0.52 0.39 0.44 0.42 2.95 2.95 0.00 0.00 0.44 0.1 ... 0.43 0.1 1.10 1.60 0.17 5.64 13.17 0.36 0.1 1.00
051 0.23 0.21 0.13 0.18 3.62 13.10 25.48 110.43 1.06 0.1 ... 1.61 0.1 6.00 2.27 12.00 7.72 49.25 1.67 0.2 4.50
052 0.49 0.62 0.58 0.29 1.90 2.71 37.16 97.33 2.66 0.2 ... 2.88 0.8 8.70 3.11 33.50 28.00 160.33 2.39 0.0 9.96
053 0.31 0.10 0.11 0.12 2.30 12.62 11.45 44.71 2.66 0.4 ... 2.72 0.0 15.60 1.89 14.33 7.15 79.33 2.41 0.2 6.90
056 0.22 0.49 0.47 0.15 3.49 11.29 18.35 70.76 1.21 0.1 ... 1.28 0.2 4.30 1.78 10.55 8.69 81.64 1.20 0.1 4.30
057 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
058 0.29 0.27 0.06 0.32 3.71 38.29 81.74 163.48 1.83 0.1 ... 1.50 0.0 6.80 2.32 20.00 20.33 86.50 0.96 0.1 2.80
060 0.44 0.53 0.41 0.26 3.03 9.67 29.38 55.95 1.53 0.3 ... 1.48 0.4 4.50 2.17 9.67 7.76 32.00 1.48 0.2 4.80
061 0.33 0.49 0.41 0.30 2.41 12.71 16.44 54.81 2.69 0.4 ... 2.87 0.5 9.20 1.85 6.92 4.80 19.00 2.86 0.6 8.90
065 0.28 0.34 0.31 0.25 2.39 17.52 28.92 50.95 1.74 0.3 ... 1.72 0.3 6.20 2.30 11.75 13.11 73.31 1.74 0.2 6.00
068 0.55 0.55 0.55 0.52 2.67 3.05 13.68 42.33 2.55 0.4 ... 2.31 0.3 6.10 2.12 2.83 8.60 7.17 2.19 0.6 4.60

24 rows × 32 columns

[260]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[260]:
BFI1_d             3
BFI2_d             3
BFI3_d             3
BFI_d              3
Q1_mm_d_hq_dur     3
Q1_mm_d_hq_freq    3
Q1_mm_d_lq_dur     3
Q1_mm_d_lq_freq    3
Q1_mm_d_mean       3
Q1_mm_d_q5         3
Q1_mm_d_q95        3
Q2_mm_d_hq_dur     3
Q2_mm_d_hq_freq    3
Q2_mm_d_lq_dur     3
Q2_mm_d_lq_freq    3
Q2_mm_d_mean       3
Q2_mm_d_q5         3
Q2_mm_d_q95        3
Q3_mm_d_hq_dur     3
Q3_mm_d_hq_freq    3
Q3_mm_d_lq_dur     3
Q3_mm_d_lq_freq    3
Q3_mm_d_mean       3
Q3_mm_d_q5         3
Q3_mm_d_q95        3
Q_mm_d_hq_dur      3
Q_mm_d_hq_freq     3
Q_mm_d_lq_dur      3
Q_mm_d_lq_freq     3
Q_mm_d_mean        3
Q_mm_d_q5          3
Q_mm_d_q95         3
dtype: int64
[261]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_mean, pcp_mm, q_cms_obs

Print the total number of NaNs for each dynamic feature. This first requires fetching the dynamic data with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[262]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Slovenia

[263]:
# Load the 'Slovenia' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('Slovenia', path=DATA_PATH, verbosity=0)
print(dataset)
Slovenia with 117 stations, 10 dynamic and 214 static features

The static features of Slovenia are the same as those of EStreams.

[264]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[265]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(117, 214)
[266]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
1163
[266]:
static_features
area_flag           0
area_km2            0
area_official      17
area_rel           17
aridity            21
                   ..
steep_area_fra      0
strm_dens           0
tot_area            0
watershed_group     0
zero_q_freq        21
Length: 214, dtype: int64

find those columns which have at least one NaN value

[267]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
[268]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[268]:
static_features
area_official                 17
area_rel                      17
aridity                       21
baseflow_index                21
dam_yr_first                 113
dam_yr_last                  113
duplicated_suspect           117
elevation                      7
end_date                       1
end_date_climatic             21
end_date_hydro                21
frac_snow                     21
hfd_mean                      21
hfd_std                       21
hp_dur                        21
hp_freq                       21
hp_time                       21
hq_dur                        21
hq_freq                       21
lp_dur                        21
lp_freq                       21
lp_time                       21
lq_dur                        27
lq_freq                       27
num_days_gaps                  1
num_years_climatic            21
num_years_hydro               21
p_mean                        21
p_seasonality                 21
pet_mean                      21
q_5                           21
q_95                          21
q_elas_Sankarasubramanian     21
q_mean                        21
q_runoff_ratio                21
res_tot_sto                  113
slope_no_unit                 21
start_date                     1
start_date_climatic           21
start_date_hydro              21
zero_q_freq                   21
dtype: int64
[269]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, q_cms_obs, rh_%, solrad_wm2, sp_mean,
windspeed_mps

Print the total number of NaNs for each dynamic feature. This first requires fetching the dynamic data with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[270]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Spain

[271]:
# Load the 'Spain' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('Spain', path=DATA_PATH, verbosity=0)
print(dataset)
Spain with 889 stations, 27 dynamic and 35 static features

The static features of Spain are the same as those of GSHA.

[272]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[273]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(889, 35)
[274]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
30
[274]:
EVP_uncertainty(%)     11
HYRIV_ID                0
LRAD_uncertainty(%)     6
P_uncertainty(%)        0
SRAD_uncertainty(%)     0
T_uncertainty(%)        0
agency                  0
area_km2                0
cly_pc_uav              0
ele_mt_uav              0
ero_kh_uav              0
gla_pc_use              0
glc_cl_cmj              0
gwt_cm_cav              0
inu_pc_ult              0
lat                     0
lit_cl_cmj              0
long                    0
pet_uncertainty(%)     13
pnv_cl_cmj              0
prm_pc_use              0
sgr_dk_rav              0
slope_degrees           0
slt_pc_uav              0
snd_pc_uav              0
wet_pc_u01              0
wet_pc_u02              0
wet_pc_u03              0
wet_pc_u04              0
wet_pc_u05              0
wet_pc_u06              0
wet_pc_u07              0
wet_pc_u08              0
wet_pc_u09              0
wind_uncertainty(%)     0
dtype: int64

find those columns which have at least one NaN value

[275]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
[276]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[276]:
EVP_uncertainty(%)     11
LRAD_uncertainty(%)     6
pet_uncertainty(%)     13
dtype: int64
[277]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature. This first requires fetching the dynamic data with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[278]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

Thailand

[279]:
# Load the 'Thailand' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('Thailand', path=DATA_PATH, verbosity=0)
print(dataset)
Thailand with 73 stations, 27 dynamic and 35 static features

The static features of Thailand are the same as those of GSHA.

[280]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[281]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(73, 35)
[282]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
0
[282]:
EVP_uncertainty(%)     0
HYRIV_ID               0
LRAD_uncertainty(%)    0
P_uncertainty(%)       0
SRAD_uncertainty(%)    0
T_uncertainty(%)       0
agency                 0
area_km2               0
cly_pc_uav             0
ele_mt_uav             0
ero_kh_uav             0
gla_pc_use             0
glc_cl_cmj             0
gwt_cm_cav             0
inu_pc_ult             0
lat                    0
lit_cl_cmj             0
long                   0
pet_uncertainty(%)     0
pnv_cl_cmj             0
prm_pc_use             0
sgr_dk_rav             0
slope_degrees          0
slt_pc_uav             0
snd_pc_uav             0
wet_pc_u01             0
wet_pc_u02             0
wet_pc_u03             0
wet_pc_u04             0
wet_pc_u05             0
wet_pc_u06             0
wet_pc_u07             0
wet_pc_u08             0
wet_pc_u09             0
wind_uncertainty(%)    0
dtype: int64

find those columns which have at least one NaN value

[283]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
No NaN values
[284]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[284]:
Series([], dtype: float64)
[285]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Print the total number of NaNs for each dynamic feature. This first requires fetching the dynamic data with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[286]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

USGS

[287]:
# Load the 'USGS' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('USGS', path=DATA_PATH, verbosity=0)
print(dataset)
USGS with 12004 stations, 20 dynamic and 29 static features

The static features of USGS are the same as those of HYSETS.

[288]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Drainage_Area_GSIM_km2, Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries,
Flag_Land_Use_Extraction, Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction,
Gravelius, Hydrometric_station_latitude, Hydrometric_station_longitude, Land_Use_Crops_frac,
Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac, Land_Use_Snow_Ice_frac,
Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name, Perimeter,
Permeability_logk_m2, Porosity_frac, Source, area_km2, lat, long, slope_degrees
[289]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(12004, 29)
[290]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
16551
[290]:
Source                               0
Name                                 0
lat                                  0
long                                 0
area_km2                             0
Drainage_Area_GSIM_km2           11884
Flag_GSIM_boundaries                 0
Flag_Artificial_Boundaries           0
Elevation_m                          1
slope_degrees                        1
Gravelius                         1168
Perimeter                         1168
Flag_Shape_Extraction                0
Aspect_deg                           1
Flag_Terrain_Extraction              0
Land_Use_Forest_frac                 3
Land_Use_Grass_frac                  3
Land_Use_Wetland_frac                3
Land_Use_Water_frac                  3
Land_Use_Urban_frac                  3
Land_Use_Shrubs_frac                 3
Land_Use_Crops_frac                  3
Land_Use_Snow_Ice_frac               3
Flag_Land_Use_Extraction             0
Permeability_logk_m2              1152
Porosity_frac                     1152
Flag_Subsoil_Extraction              0
Hydrometric_station_latitude         0
Hydrometric_station_longitude        0
dtype: int64

find those columns which have at least one NaN value

[291]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
[292]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[292]:
Drainage_Area_GSIM_km2    11884
Elevation_m                   1
slope_degrees                 1
Gravelius                  1168
Perimeter                  1168
Aspect_deg                    1
Land_Use_Forest_frac          3
Land_Use_Grass_frac           3
Land_Use_Wetland_frac         3
Land_Use_Water_frac           3
Land_Use_Urban_frac           3
Land_Use_Shrubs_frac          3
Land_Use_Crops_frac           3
Land_Use_Snow_Ice_frac        3
Permeability_logk_m2       1152
Porosity_frac              1152
dtype: int64
[293]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpa, airtemp_C_2m_max, airtemp_C_2m_min, cloudcover, dptemp_C_mean_2m, evap_mm,
evap_mm_snow, lwdownrad_wm2, lwnetrad_wm2, pcp_mm, q_cms_obs, q_mmd_obs, snowdensity_kgm3,
snowfall_mm, snowmelt_mm, solrad_wm2, solradnet_wm2, swe_mm, windspeedu_mps, windspeedv_mps

Print the total number of NaNs for each dynamic feature. This first requires fetching the dynamic data with `_, dyn_ds = dataset.fetch("all", dynamic_features=dataset.dynamic_features)`.

[294]:
# for feat, nans in zip(
#     dyn_ds.dynamic_features.data.tolist(),
#     dyn_ds.to_array().isnull().sum(dim=["time", "dynamic_features"]).data.tolist()
#     ):

#     print(feat, nans)

WaterBenchIowa

[295]:
# Load the 'WaterBenchIowa' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('WaterBenchIowa', path=DATA_PATH, verbosity=0)
print(dataset)
WaterBenchIowa with 125 stations, 3 dynamic and 7 static features
[296]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, loam, sandy_clay_loam, silt, silty_clay_loam, slope, travel_time
[297]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(125, 7)
[298]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
0
[298]:
travel_time        0
area               0
slope              0
loam               0
silt               0
sandy_clay_loam    0
silty_clay_loam    0
dtype: int64

find those columns which have at least one NaN value

[299]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
No NaN values
[300]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[300]:
Series([], dtype: float64)
[301]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
discharge, et, precipitation

Regional Datasets without observed streamflow

The following datasets do not have observed streamflow data. However, they behave similarly to the datasets with observed streamflow data.

GSHA

This dataset contains climate (dynamic) variables and static features for catchments around the world. These dynamic and static features are used for other dataset classes like Spain, Thailand and Japan.

[302]:
# Load the 'GSHA' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('GSHA', path=DATA_PATH, verbosity=0)
print(dataset)
GSHA with 21568 stations, 26 dynamic and 35 static features
[303]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area_km2, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slope_degrees, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[304]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(21568, 35)
[305]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
3442
[305]:
EVP_uncertainty(%)     1224
HYRIV_ID                  0
LRAD_uncertainty(%)     630
P_uncertainty(%)          0
SRAD_uncertainty(%)       0
T_uncertainty(%)          8
agency                    0
area_km2                  0
cly_pc_uav                0
ele_mt_uav                0
ero_kh_uav                0
gla_pc_use                0
glc_cl_cmj                0
gwt_cm_cav                0
inu_pc_ult                0
lat                       0
lit_cl_cmj                0
long                      0
pet_uncertainty(%)     1580
pnv_cl_cmj                0
prm_pc_use                0
sgr_dk_rav                0
slope_degrees             0
slt_pc_uav                0
snd_pc_uav                0
wet_pc_u01                0
wet_pc_u02                0
wet_pc_u03                0
wet_pc_u04                0
wet_pc_u05                0
wet_pc_u06                0
wet_pc_u07                0
wet_pc_u08                0
wet_pc_u09                0
wind_uncertainty(%)       0
dtype: int64

find those columns which have at least one NaN value

[306]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
[307]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[307]:
EVP_uncertainty(%)     1224
LRAD_uncertainty(%)     630
T_uncertainty(%)          8
pet_uncertainty(%)     1580
dtype: int64
[308]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2, swe_mm_era5,
windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

EStreams

The EStreams dataset does not contain observed streamflow data. However, it contains other climate (dynamic) variables and static features for European catchments. These dynamic and static features are used for other European dataset classes like Portugal, Spain, Finland, Italy, Ireland and Poland.

[309]:
# Load the 'EStreams' dataset from DATA_PATH (verbosity=0) and print its summary.
dataset = RainfallRunoff('EStreams', path=DATA_PATH, verbosity=0)
print(dataset)
EStreams with 17130 stations, 9 dynamic and 214 static features
[310]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.static_features
# is left untouched (unlike list.sort(), which would reorder it in place).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_flag, area_km2, area_official, area_rel, aridity, baseflow_index, bedrk_dep, dam_num,
dam_yr_first, dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation,
elon_ratio, end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country,
gauge_flag, gauge_id, gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur,
hp_freq, hp_time, hq_dur, hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08,
lai_09, lai_10, lai_11, lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap,
lit_dom, lit_fra_ev, lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi,
lit_fra_py, lit_fra_sc, lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi,
lit_fra_wb, lon_snap, long, lp_dur, lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03,
ndvi_04, ndvi_05, ndvi_06, ndvi_07, ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean,
nested_catchments, num_continuous_days, num_days, num_days_gaps, num_days_noflag, num_days_reliable,
num_days_suspect, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_no_unit, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04,
sno_cov_05, sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12,
sno_cov_mean, soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25,
soil_bd_p75, soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med,
soil_fra_clay_min, soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90,
soil_fra_grav_max, soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05,
soil_fra_grav_p25, soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean,
soil_fra_sand_med, soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75,
soil_fra_sand_p90, soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min,
soil_fra_silt_p05, soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max,
soil_oc_mean, soil_oc_med, soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90,
soil_tawc_max, soil_tawc_mean, soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25,
soil_tawc_p75, soil_tawc_p90, start_date, start_date_climatic, start_date_hydro,
stations_dens_p_mean, stations_dens_rh_mean, stations_dens_sp_mean, stations_dens_swr_mean,
stations_dens_t_max, stations_dens_t_mean, stations_dens_t_min, stations_dens_ws_mean,
stations_num_p_mean, stations_num_rh_mean, stations_num_sp_mean, stations_num_swr_mean,
stations_num_t_max, stations_num_t_mean, stations_num_t_min, stations_num_ws_mean, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[311]:
# Fetch the static features of the current dataset as a table and print its shape.
df = dataset.fetch_static_features()
print(df.shape)
(17130, 214)
[312]:
# Print the total NaN count over the whole table, then display the NaN count of each column.
print(df.isna().sum().sum())
df.isna().sum()
193553
[312]:
static_features
area_flag             0
area_km2              0
area_official      1355
area_rel           1356
aridity            3393
                   ...
steep_area_fra        0
strm_dens             0
tot_area              0
watershed_group       0
zero_q_freq        3614
Length: 214, dtype: int64

find those columns which have at least one NaN value

[313]:
# Show the columns that contain at least one NaN. A bare expression nested in
# an if-block is not rendered by Jupyter (only the cell's last top-level
# expression is), so the frame is passed to display() explicitly; display()
# is available without import in IPython/Jupyter sessions.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
[314]:
# NaN count per column, restricted to the columns that contain at least one NaN.
df.loc[:, (df.isna().sum()>0)].isna().sum()
[314]:
static_features
area_official            1355
area_rel                 1356
aridity                  3393
baseflow_index           3728
bedrk_dep                   8
                         ...
stations_dens_t_max         1
stations_dens_t_mean        1
stations_dens_t_min         1
stations_dens_ws_mean       1
zero_q_freq              3614
Length: 156, dtype: int64
[315]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# sorted() returns a new list, so the object returned by dataset.dynamic_features
# is left untouched (unlike list.sort(), which would reorder it in place).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, rh_%, solrad_wm2, sp_mean,
windspeed_mps