def csv(
    path: str,
    convert_units_to_si: bool = True,
    standardize_variable_names: bool = True,
    encoding: str = "UTF-8",
    errors: str = "strict",
) -> xarray.Dataset:
    """Parse the Onset CSV format generated by HOBOware into an xarray object.

    Args:
        path: Path to the CSV file.
        convert_units_to_si: Whether to standardize data units to SI units.
        standardize_variable_names: Rename the variables to a standardized
            naming convention.
        encoding: File encoding. Defaults to "UTF-8".
        errors: Encoding error handling. Defaults to "strict".

    Returns:
        xarray.Dataset

    Raises:
        ValueError: If no "Date Time" column is found within the first
            10 lines of the file.
    """
    # Read header lines up to (and including) the column-name row; bail out
    # after 10 lines so a malformed file cannot make us scan the whole thing.
    raw_header = []
    line = ""
    with open(
        path,
        encoding=encoding,
        errors=errors,
    ) as f:
        while "Date Time" not in line and len(raw_header) < 10:
            line = f.readline()
            raw_header.append(line)
        # Keep the first data row so we can detect the timestamp format.
        first_row = f.readline()
    if "Date Time" not in raw_header[-1]:
        raise ValueError("Date Time column not found in header")
    # Parse onset header
    header, variables = _parse_onset_csv_header(raw_header)
    date_column_index = list(variables.keys()).index("Date Time")
    date_format = _get_time_format(first_row.split(",")[date_column_index])
    # Map each retained column name to its positional index for pd.read_csv;
    # columns whose snake_cased name is in IGNORED_VARIABLES are dropped.
    consider_columns = {
        var: id
        for id, var in enumerate(variables.keys())
        if var.lower().replace(" ", "_") not in IGNORED_VARIABLES
    }
    df = pd.read_csv(
        path,
        na_values=[" "],
        skiprows=list(range(len(raw_header))),
        parse_dates=["Date Time"],
        date_format=date_format,
        sep=",",
        header=None,
        memory_map=True,
        names=consider_columns.keys(),
        usecols=consider_columns.values(),
        encoding_errors=errors,
        encoding=encoding,
    )
    # Add timezone to time variable. An "object" dtype means the single
    # date_format did not apply to every row; fall back to per-row parsing.
    if df["Date Time"].dtype == "object":
        logger.warning(
            "Date Time column is not in a consistent format. Trying to convert"
        )
        df["Date Time"] = df["Date Time"].apply(
            lambda x: pd.to_datetime(x, format=_get_time_format(x))
        )
    df["Date Time"] = df["Date Time"].dt.tz_localize(header["timezone"])
    # Convert to dataset
    ds = df.to_xarray()
    ds.attrs = {**GLOBAL_ATTRIBUTES, **header}
    for var in ds:
        ds[var].attrs = variables[var]
    if standardize_variable_names:
        ds = ds.rename_vars(_standardized_variable_mapping(ds))
    # Detect instrument type based on variables available
    ds.attrs["instrument_type"] = _detect_instrument_type(ds)
    # Review units and convert to the SI system
    if convert_units_to_si:
        if standardize_variable_names:
            if "temperature" in ds and ("C" not in ds["temperature"].attrs["units"]):
                logger.warning("Temperature in Farenheit will be converted to celsius")
                # BUG FIX: capture the original units before they are
                # overwritten with "degC", so the history records the real
                # source units instead of always reporting "degC".
                original_units = ds["temperature"].attrs["units"]
                ds["temperature"] = _farenheit_to_celsius(ds["temperature"])
                ds["temperature"].attrs["units"] = "degC"
                # BUG FIX: added the missing space before "degree" (the
                # adjacent literals previously concatenated to "todegree").
                ds.attrs["history"] += " ".join(
                    [
                        f"{datetime.now()}",
                        f"Convert temperature ({original_units}) to "
                        "degree Celsius [(degF-32)/1.8000]",
                    ]
                )
            if (
                "conductivity" in ds
                and "uS/cm" not in ds["conductivity"].attrs["units"]
            ):
                logger.warning(
                    "Unknown conductivity units (%s)", ds["conductivity"].attrs["units"]
                )
        else:
            logger.warning(
                "Unit conversion is not supported if standardize_variable_names=False"
            )
    # Test daylight saving issue: a time gap one hour shorter (Fall) or
    # longer (Spring) than the median sampling interval suggests the logger
    # clock followed a daylight-saving shift.
    # TODO move this daylight saving detection test elsewhere
    dt = ds["time"].diff("index")
    sampling_interval = dt.median().values
    dst_fall = -pd.Timedelta("1h") + sampling_interval
    dst_spring = pd.Timedelta("1h") + sampling_interval
    if any(dt == dst_fall):
        logger.warning(
            (
                "Time gaps (=%s) for sampling interval of %s "
                "suggest a Fall daylight saving issue is present"
            ),
            dst_fall,
            sampling_interval,
        )
    if any(dt == dst_spring):
        # BUG FIX: report dst_spring here — the original logged dst_fall,
        # so the Spring warning showed the wrong gap value.
        logger.warning(
            (
                "Time gaps (=%s) for sampling interval of %s "
                "suggest a Spring daylight saving issue is present"
            ),
            dst_spring,
            sampling_interval,
        )
    ds = standardize_dataset(ds)
    return ds