import json
import logging.config
import os
import re
from pathlib import Path
from typing import Optional, Tuple, Union
import pandas as pd
import xarray as xr
from xclim.core.units import units as u
from miranda.scripting import LOGGING_CONFIG

logging.config.dictConfig(LOGGING_CONFIG)

__all__ = ["open_txt"]

# CMOR-like attributes
with open(Path(__file__).parent / "data" / "deh_cf_attrs.json") as f:
    cmor = json.load(f)["variable_entry"]
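
# The table is expected to map variable names to CF attribute dicts, roughly
# (illustrative sketch; the real values live in deh_cf_attrs.json):
#   {"variable_entry": {"q": {...CF attrs for streamflow...}, "flag": {...}}}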

# TODO: Some potentially useful attributes were skipped because they vary per
#  station and would be complicated to include in a single dataset.
meta_patterns = {
"Station: ": "name",
"Bassin versant: ": "bv",
"Coordonnées: (NAD83) ": "coords",
}
data_header_pattern = "Station Date Débit (m³/s) Remarque\n"
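
# Illustrative (hypothetical) excerpt of the DEH file header these patterns
# target; real files vary in spacing and trailing fields:
#
#   Station: 000000 Rivière Exemple Régime Naturel
#   Bassin versant: 100 km²
#   Coordonnées: (NAD83) 46°00'00" // -71°00'00"
#   Station Date Débit (m³/s) Remarque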


def extract_daily(path: Union[os.PathLike, str]) -> Tuple[dict, pd.DataFrame]:
    """Extract data and metadata from a DEH (MELCC) streamflow file."""
with open(path, encoding="latin1") as fh:
txt = fh.read()
    # Normalise repeated spaces so the metadata patterns match reliably.
    txt = re.sub(" +", " ", txt)
    meta, data = txt.split(data_header_pattern)

    m = {}
    for pattern, name in meta_patterns.items():
        # Various possible separators to take into account.
        m[name] = (
            meta.split(pattern)[1].split(" \n")[0].split("\n")[0].split(" Régime")[0]
        )

    d = pd.read_csv(
        path,
        delimiter=r"\s+",
        skiprows=len(meta.splitlines()),
        encoding="latin1",
        converters={0: str},  # keep the station identifiers as strings
        index_col=1,
        parse_dates=True,  # infer_datetime_format is deprecated and now the default
    )
if len(d["Station"].unique()) == 1:
m["station"] = d["Station"].unique()[0]
d = d.drop("Station", axis=1)
else:
raise ValueError("Multiple stations detected in the same file.")
d = d.rename(columns={"Remarque": "Nan", "(m³/s)": "Remarque"})
d.index.names = ["time"]
d = d.drop("Nan", axis=1)
return m, d
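
# Illustrative contents of the returned metadata dict (hypothetical values):
#   name: 000000 Rivière Exemple | bv: 100 km²
#   coords: 46°00'00" // -71°00'00" | station: 000000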


def to_cf(meta: dict, data: pd.DataFrame, cf_table: dict) -> xr.Dataset:
    """Convert parsed DEH data and metadata to a CF-compliant xarray Dataset."""
ds = xr.Dataset()
ds["q"] = xr.DataArray(data["Débit"], attrs=cf_table["q"])
ds["flag"] = xr.DataArray(data["Remarque"], attrs=cf_table["flag"])
ds["name"] = xr.DataArray(meta["name"])
ds["station_id"] = xr.DataArray(meta["station"])
ds["area"] = xr.DataArray(
u.convert(float(meta["bv"].split(" ")[0]), meta["bv"].split(" ")[1], "km²"),
attrs={"long_name": "drainage area", "units": "km2"},
)

    def parse_dms(coord):
        """Convert a degree-minute-second coordinate string to decimal degrees."""
        deg, minutes, seconds, _ = re.split("[°'\"]", coord)
        if float(deg) > 0:
            return round(
                float(deg) + float(minutes) / 60 + float(seconds) / (60 * 60), 6
            )
        return round(
            float(deg) - (float(minutes) / 60 + float(seconds) / (60 * 60)), 6
        )
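
    # Illustrative conversions (DMS format assumed from the file header):
    #   45°30'15" -> 45.504167   |   -71°16'30" -> -71.275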
coords = meta["coords"].split(" // ")
ds["lat"] = xr.DataArray(
parse_dms(coords[0]),
attrs={
"standard_name": "latitude",
"long_name": "latitude",
"units": "decimal_degrees",
},
)
ds["lon"] = xr.DataArray(
parse_dms(coords[1]),
attrs={
"standard_name": "longitude",
"long_name": "longitude",
"units": "decimal_degrees",
},
)

    ds.attrs.update(
        institution=(
            "Ministère de l'Environnement et de la Lutte contre les changements climatiques"
        ),
        source=(
            "Hydrometric data "
            "<https://www.cehq.gouv.qc.ca/hydrometrie/historique_donnees/index.asp>"
        ),
        redistribution="Redistribution policy unknown. For internal use only.",
    )
return ds


def open_txt(path: Union[str, Path], cf_table: Optional[dict] = None) -> xr.Dataset:
    """Extract daily DEH streamflow data and convert to an xr.Dataset with CF-Convention attributes."""
    if cf_table is None:
        cf_table = cmor
    meta, data = extract_daily(path)
    return to_cf(meta, data, cf_table)
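

# Minimal usage sketch (the file path is hypothetical):
#   ds = open_txt("path/to/deh_station_file.txt")
#   ds.q  # daily streamflow, with CF attributes drawn from deh_cf_attrs.json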