Skip to content

onset

Onset is a company that manufactures data loggers and sensors for environmental monitoring. Their Hobo data loggers are widely used for monitoring water quality parameters such as temperature, conductivity, and light intensity. The present module provides a parser for the CSV files generated by the HOBOware software.

csv(path, convert_units_to_si=True, standardize_variable_names=True, encoding='UTF-8', errors='strict')

Parses the Onset CSV format generated by HOBOware into an xarray object

Inputs

- path: The path to the CSV file
- convert_units_to_si: Whether to standardize data units to SI units
- standardize_variable_names: Whether to rename the variables to a standardized naming convention
- encoding: File encoding. Defaults to "UTF-8"
- errors: Error handling. Defaults to "strict"

Returns: xarray.Dataset

Source code in ocean_data_parser/parsers/onset.py
def csv(
    path: str,
    convert_units_to_si: bool = True,
    standardize_variable_names: bool = True,
    encoding: str = "UTF-8",
    errors: str = "strict",
) -> xarray.Dataset:
    """Parse an Onset CSV file generated by HOBOware into an xarray dataset.

    Inputs:
        path: The path to the CSV file
        convert_units_to_si: Whether to standardize data units to SI units.
            Only applied when standardize_variable_names is also True.
        standardize_variable_names: Whether to rename the variables to a
            standardized naming convention
        encoding: File encoding. Defaults to "UTF-8"
        errors: Error handling. Defaults to "strict"
    Returns:
        xarray.Dataset
    Raises:
        ValueError: If no line containing "Date Time" is found within the
            first 10 lines of the file.
    """

    # Read the header: every line up to and including the column-name row
    # (the first line containing "Date Time"), giving up after 10 lines.
    raw_header = []
    line = ""
    with open(
        path,
        encoding=encoding,
        errors=errors,
    ) as f:
        while "Date Time" not in line and len(raw_header) < 10:
            line = f.readline()
            raw_header.append(line)
        # The first data row is used to detect the timestamp format
        first_row = f.readline()
    if "Date Time" not in raw_header[-1]:
        raise ValueError("Date Time column not found in header")

    # Parse onset header
    header, variables = _parse_onset_csv_header(raw_header)
    date_column_index = list(variables).index("Date Time")
    date_format = _get_time_format(first_row.split(",")[date_column_index])

    # Inputs to pd.read_csv: map each retained column name to its position,
    # dropping the columns listed in IGNORED_VARIABLES.
    consider_columns = {
        name: column_index
        for column_index, name in enumerate(variables)
        if name.lower().replace(" ", "_") not in IGNORED_VARIABLES
    }
    df = pd.read_csv(
        path,
        na_values=[" "],
        skiprows=list(range(len(raw_header))),
        parse_dates=["Date Time"],
        date_format=date_format,
        sep=",",
        header=None,
        memory_map=True,
        names=list(consider_columns),
        usecols=list(consider_columns.values()),
        encoding_errors=errors,
        encoding=encoding,
    )

    # If the single detected format did not apply to every row, pandas leaves
    # the column as object dtype; fall back to per-value format detection.
    if df["Date Time"].dtype == "object":
        logger.warning(
            "Date Time column is not in a consistent format. Trying to convert"
        )
        df["Date Time"] = df["Date Time"].apply(
            lambda x: pd.to_datetime(x, format=_get_time_format(x))
        )

    # Add timezone to time variables
    df["Date Time"] = df["Date Time"].dt.tz_localize(header["timezone"])

    # Convert to dataset
    ds = df.to_xarray()
    ds.attrs = {**GLOBAL_ATTRIBUTES, **header}
    for var in ds:
        ds[var].attrs = variables[var]

    if standardize_variable_names:
        ds = ds.rename_vars(_standardized_variable_mapping(ds))
        # Detect instrument type based on variables available
        ds.attrs["instrument_type"] = _detect_instrument_type(ds)

    # Review units and convert SI system
    if convert_units_to_si:
        if standardize_variable_names:
            if "temperature" in ds and ("C" not in ds["temperature"].attrs["units"]):
                logger.warning("Temperature in Farenheit will be converted to celsius")
                # Capture the source units before they are overwritten so the
                # history entry reports what was actually converted.
                original_units = ds["temperature"].attrs["units"]
                ds["temperature"] = _farenheit_to_celsius(ds["temperature"])
                ds["temperature"].attrs["units"] = "degC"
                # .get() avoids a KeyError when no history attribute exists yet
                ds.attrs["history"] = ds.attrs.get("history", "") + " ".join(
                    [
                        f"{datetime.now()}",
                        f"Convert temperature ({original_units}) to "
                        "degree Celsius [(degF-32)/1.8000]",
                    ]
                )
            if (
                "conductivity" in ds
                and "uS/cm" not in ds["conductivity"].attrs["units"]
            ):
                logger.warning(
                    "Unknown conductivity units (%s)", ds["conductivity"].attrs["units"]
                )
        else:
            logger.warning(
                "Unit conversion is not supported if standardize_variable_names=False"
            )

    # Test daylight saving issue
    # TODO move this daylight saving detection test elsewhere
    # NOTE(review): the time variable is presumably only renamed to "time" by
    # the standardized mapping; fall back to the raw column name otherwise.
    time_variable = "time" if "time" in ds else "Date Time"
    dt = ds[time_variable].diff("index")
    sampling_interval = dt.median().values
    # A clock shifted by one hour shows up as a single step of the sampling
    # interval minus (fall) or plus (spring) one hour.
    dst_fall = -pd.Timedelta("1h") + sampling_interval
    dst_spring = pd.Timedelta("1h") + sampling_interval
    if bool((dt == dst_fall).any()):
        logger.warning(
            (
                "Time gaps (=%s) for sampling interval of %s "
                "suggest a Fall daylight saving issue is present"
            ),
            dst_fall,
            sampling_interval,
        )
    if bool((dt == dst_spring).any()):
        logger.warning(
            (
                "Time gaps (=%s) for sampling interval of %s "
                "suggest a Spring daylight saving issue is present"
            ),
            dst_spring,
            sampling_interval,
        )

    ds = standardize_dataset(ds)
    return ds