Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

  • Fetching ERA5-Land data in GRIB format.
  • Apply homogenization conventions, as applicable:
    • Rename spatial coordinates to latitude, longitude.
    • Ensure latitude, longitude ranges are between [-90, +90], [-180, +180].
    • If the number of hourly samples is incomplete (less than 24) the date is skipped (e.g. 1950-01-01).
    • Time is expressed in one single dimension called time, which in ERA5-Land netCDFs is usually called valid_time.
    • Drop "number", "step", "surface", "valid_time" and "depthBelowLandLayer".
    • The accumulation variables (e.g. Total precipitation) are de-accumulated * to hourly resolution. Please see the Jupyter notebook below. 
  • Written to an ARCO Zarr archive (which is similar in structure to netCDF).
  • The CDS requests the data from the Zarr archive using xarray and writes the data to netCDF or CSV (as requested).

Jupyter notebook demonstrating the procedure to de-accumulate the data

(*)  De-accumulation is the process of substracting to each hourly time sample its previous sample so that the final value expresses only the magnitude for that hour, and not for that hour plus the previous ones. The process takes into account the accumulation period of 24 hours, so that the first sample of every period is left as it is.

Expand
titleJupyter notebook demonstrating the procedure to de-accumulate the data

ARCO ERA5-Land de-accumulation

In this notebook, we show the procedure to de-accumulate the data of an accumulated ERA5-Land variable, then show how to accumulate it back and later prove the procedure by reconstructing the data and comparing it against the original one.

Code Block
languagepy
from copy import deepcopy

import xarray as xr
import numpy as np
import cdsapi

Definitions

Code Block
languagepy
def flatten_dataset(ds, var, time_selection=[]):
    """Rearange the data so we flatten time samples.
    E.g. from
    {'time': 2, 'step': 24, 'latitude': 1801, 'longitude': 3600}
    to
    {'time': 48, 'latitude': 1801, 'longitude': 3600}
    """
    total_samples = ds.valid_time.shape[0]*ds.valid_time.shape[1]
    
    reshaped_var = ds[var].values.reshape((total_samples, ds[var].shape[2], ds[var].shape[3]))

    ds_new = xr.Dataset(
        data_vars={var: (["time", "latitude", "longitude"], reshaped_var)},
        coords={
            "time": ("time", ds.valid_time.values.reshape((total_samples))),
            "latitude": ("latitude", ds.latitude.values),
            "longitude": ("longitude", ds.longitude.values),
        },
    )

    ds_new.attrs = ds.attrs
    ds_new[var].attrs = ds[var].attrs
    ds_new.longitude.attrs = ds.longitude.attrs
    ds_new.latitude.attrs = ds.latitude.attrs
    ds_new.time.attrs = ds.valid_time.attrs
    if time_selection is not None:
        ds_new = ds_new.isel(time=slice(*time_selection))

    # Remove additional variables, same as in ARCO hourly ERA5 on single levels
    for drop_var in ["number", "surface"]:
        if drop_var in ds_new:
            ds_new = ds_new.drop_vars([drop_var])

    return ds_new

def apply_lon_lat_conventions(ds):
    # Renames
    if "lon" in ds.dims:
        ds = ds.rename({"lon": "longitude"})
    if "lat" in ds.dims:
        ds = ds.rename({"lat": "latitude"})

    # Flip latitudes (ensure they are monotonic increasing)
    if "latitude" in ds.dims:
        lats = ds["latitude"]
        if len(lats) > 1 and lats[0] > lats[-1]:
            ds = ds.reindex(latitude=ds.latitude[::-1])

    # Convert longitude to [-180, 180[
    if "longitude" in ds.dims and ds["longitude"].max() > 180:
        lons = ds["longitude"]
        lons_attrs = lons.attrs
        new_lons = np.concatenate([lons[lons >= 180], lons[lons < 180]])
        ds = ds.reindex(longitude=new_lons)
        ds = ds.assign_coords(longitude=(((ds["longitude"] + 180) % 360) - 180))
        ds["longitude"].attrs = lons_attrs

    return ds

Data

Code Block
languagepy
variable = "tp"
# CDS API

token = "" # Your CDS API Token

dataset = "reanalysis-era5-land"
request = {
    "variable": ["total_precipitation"],
    "year": "1970",
    "month": "01",
    "day": ["01"],
    "time": [
        "00:00", "01:00", "02:00",
        "03:00", "04:00", "05:00",
        "06:00", "07:00", "08:00",
        "09:00", "10:00", "11:00",
        "12:00", "13:00", "14:00",
        "15:00", "16:00", "17:00",
        "18:00", "19:00", "20:00",
        "21:00", "22:00", "23:00"
    ],
    "data_format": "grib",
    "download_format": "unarchived"
}

client = cdsapi.Client(url="https://cds.climate.copernicus.eu/api", key=token)
client.retrieve(dataset, request).download("cdsapi_era5-land-date-1.grib")

request["day"] = ["02"]
client.retrieve(dataset, request).download("cdsapi_era5-land-date0.grib")

ds0 = xr.open_dataset("cdsapi_era5-land-date0.grib", filter_by_keys={'stepType': 'accum'})
ds_1 = xr.open_dataset("cdsapi_era5-land-date-1.grib", filter_by_keys={'stepType': 'accum'})

ds0 = apply_lon_lat_conventions(ds0)
ds_1 = apply_lon_lat_conventions(ds_1)

De-accumulation

Code Block
languagepy
# Deaccumulate
# We want to finally obtain a Dataset that contains 24 samples, them being:
# [
#    00:00    <date-1, step24> - <date-1, step23>,
#    01:00    <date0, step1>,
#    02:00    <date0, step2> - <date0, step1>,
#    03:00    <date0, step3> - <date0, step2>,
# ...
#    23:00    <date0, step23> - <date0, step22>
# ]
def deaccumulate(ds_1, ds0, variable):
    """ De-accumulate the variable of one Dataset
    
    Args:
        ds_1: Date -1 (e.g. 1970-01-01) dataset.
        ds0: Date 0 (e.g. 1970-01-02) dataset.
        variable: The dataset variable
    
    Returns:
        A de-accumulated Dataset.
    """
    ds_1_flat = flatten_dataset(ds_1, variable, None)
    ds0_flat = flatten_dataset(ds0, variable, None)
    ds_deaccumulated = deepcopy(ds0_flat)
    
    # Array indexes
    # ds-1, date-1, step23
    i_ds_1_d_1_s23 = 46
    # ds0, date-1, step24
    i_ds0_d_1_s24 = 23
    # ds0, date0, step1
    i_ds0_d0_s1 = 24
    
    # 00:00    <date-1, step24> - <date-1, step23>
    ds_deaccumulated[variable][i_ds0_d_1_s24] -= ds_1_flat[variable][i_ds_1_d_1_s23]
    
    # Nothing to do for 01:00    <date0, step1>,
    
    # From
    #    02:00    <date0, step2> - <date0, step1>,
    # to
    #    23:00    <date0, step23> - <date0, step22>
    for i, t in enumerate(range(ds_deaccumulated.time.shape[0])):
        if i > i_ds0_d0_s1:
            ds_deaccumulated[variable][t] -= ds0_flat[variable][t - 1]
    
    # Slice to the times of interest (24 samples)
    ds_deaccumulated = ds_deaccumulated.isel(time=slice(i_ds0_d_1_s24, i_ds_1_d_1_s23 + 1))
    
    return ds_deaccumulated

ds_deaccumulated = deaccumulate(ds_1, ds0, variable)

Accumulation

Code Block
languagepy
# We revert de de-accumulation to obtain the original values

def accumulate(ds0_deaccumulated, variable, ds_1_accumulated=None, ds_1_deaccumulated=None):
    """Accumulate a variable over each time sample.
    
    Args:
        ds0_deaccumulated: Date 0 (e.g. 1970-01-02) de-accumulated
        variable: Dataset variable
        ds_1_accumulated: Date -1 (e.g. 1970-01-01) accumulated.  Expected time and steps dimensions.
                          Needed either this or ds_1_deaccumulated
        ds_1_deaccumulated: Date -1 (e.g. 1970-01-01) de-accumulated. Expected time dimension. 
                            Needed either this or ds_1_accumulated
        
    Returns:
        Date 0 xarray.Dataset
    """
    ds0_accumulated = deepcopy(ds0_deaccumulated)
    
    if ds_1_accumulated:
        ds0_accumulated[variable][0] += ds_1_accumulated[variable].values[1][22]
    elif ds_1_deaccumulated:
        for i in range(1, 24):
            ds0_accumulated[variable][0] += ds_1_deaccumulated[variable].values[i]

    for step in range(2, 24):
        tmp = deepcopy(ds0_deaccumulated[variable].values[1])
        for acc_step in range(2, step+1):
            tmp += ds0_deaccumulated[variable].values[acc_step]
        ds0_accumulated[variable][step] = tmp
    
    return ds0_accumulated

ds_accumulated = accumulate(ds_deaccumulated, variable, ds_1_accumulated=ds_1)

Check correctness

Code Block
languagepy
#    00:00 
np.allclose(ds_accumulated.tp.values[0], ds0.tp.values[0][23], equal_nan=True)

# From
#    01:00
# to
#    23:00
for step in range(1, 24):
    print("Time", f"{step}:00")
    print(np.allclose(ds_accumulated.tp[step], ds0.tp.values[1][step-1], equal_nan=True))
    print()


Data organization and access

...