onset

Onset is a company that manufactures data loggers and sensors for environmental monitoring. Their HOBO data loggers are widely used for monitoring water quality parameters such as temperature, conductivity, and light intensity. This module provides parsers for the data formats generated by the HOBOware and HOBOconnect software.

csv(path, convert_units_to_si=True, standardize_variable_names=True, encoding='UTF-8', errors='strict', timezone=None, ambiguous_timestamps='raise')

Parses the Onset CSV format generated by HOBOware into an xarray object

Parameters:

path (str, required): The path to the CSV file
convert_units_to_si (bool, default True): Whether to standardize data units to SI units
standardize_variable_names (bool, default True): Rename variables to a standardized naming convention
encoding (str, default 'UTF-8'): File encoding
errors (str, default 'strict'): Error handling
timezone (str, default None): Timezone used to localize the time variable; overrides the timezone in the file header
ambiguous_timestamps (str, default 'raise'): How to handle ambiguous timestamps

Returns: xarray.Dataset
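
A minimal usage sketch (the file name below is hypothetical; the timezone and ambiguous_timestamps values simply override the defaults listed above):

from ocean_data_parser.parsers import onset

# Parse a HOBOware CSV export; units are converted to SI and variable
# names are standardized by default.
ds = onset.csv(
    "hobo_export.csv",             # hypothetical file name
    timezone="UTC",                # override the timezone found in the header
    ambiguous_timestamps="infer",  # let pandas resolve fall-back duplicates
)
print(ds)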

Source code in ocean_data_parser/parsers/onset.py
def csv(
    path: str,
    convert_units_to_si: bool = True,
    standardize_variable_names: bool = True,
    encoding: str = "UTF-8",
    errors: str = "strict",
    timezone: str = None,
    ambiguous_timestamps: str = "raise",
) -> xarray.Dataset:
    """Parses the Onset CSV format generate by HOBOware into a xarray object

    Args:
        path: The path to the CSV file
        convert_units_to_si: Whether to standardize data units to SI units
        standardize_variable_names: Rename the variable names a standardize name
            convention
        encoding: File encoding. Defaults to "utf-8"
        errors: Error handling. Defaults to "strict"
        timezone: Timezone to localize the time variable, overwrites the timezone in header
        ambiguous_timestamps: How to handle ambiguous time stamps. Defaults to "raise"
    Returns:
        xarray.Dataset
    """

    raw_header = []
    line = ""
    with open(
        path,
        encoding=encoding,
        errors=errors,
    ) as f:
        while "Date Time" not in line and len(raw_header) < 10:
            line = f.readline()
            raw_header.append(line)
        first_row = f.readline()
    if "Date Time" not in raw_header[-1]:
        raise ValueError("Date Time column not found in header")

    # Parse onset header
    header, variables = _parse_onset_csv_header(raw_header)
    date_column_index = list(variables.keys()).index("Date Time")
    date_format = _get_time_format(first_row.split(",")[date_column_index])

    # Inputs to pd.read_csv
    consider_columns = {
        var: id
        for id, var in enumerate(variables.keys())
        if var.lower().replace(" ", "_") not in IGNORED_VARIABLES
    }
    df = pd.read_csv(
        path,
        na_values=[" "],
        skiprows=list(range(len(raw_header))),
        parse_dates=["Date Time"],
        date_format=date_format,
        sep=",",
        header=None,
        memory_map=True,
        names=consider_columns.keys(),
        usecols=consider_columns.values(),
        encoding_errors=errors,
        encoding=encoding,
    )

    # Add timezone to time variables
    if df["Date Time"].dtype == "object":
        logger.warning(
            "Date Time column is not in a consistent format. Trying to convert"
        )
        df["Date Time"] = df["Date Time"].apply(
            lambda x: pd.to_datetime(x, format=_get_time_format(x))
        )
    df["Date Time"] = df["Date Time"].dt.tz_localize(
        timezone or header["timezone"], ambiguous=ambiguous_timestamps
    )
    check_daylight_saving(df["Date Time"], ambiguous_timestamps)

    # Convert to dataset
    ds = df.to_xarray()
    ds.attrs = {**GLOBAL_ATTRIBUTES, **header}
    for var in ds:
        ds[var].attrs = variables[var]

    if standardize_variable_names:
        ds = ds.rename_vars(_standardized_variable_mapping(ds))
        # Detect instrument type based on variables available
        ds.attrs["instrument_type"] = _detect_instrument_type(ds)

    # Review units and convert to the SI system
    if convert_units_to_si:
        if standardize_variable_names:
            if "temperature" in ds and ("C" not in ds["temperature"].attrs["units"]):
                logger.warning("Temperature in Fahrenheit will be converted to Celsius")
                # Keep the original units for the history record before overwriting them
                original_units = ds["temperature"].attrs["units"]
                ds["temperature"] = _farenheit_to_celsius(ds["temperature"])
                ds["temperature"].attrs["units"] = "degC"
                ds.attrs["history"] += " ".join(
                    [
                        f"{datetime.now()}",
                        f"Convert temperature ({original_units}) to "
                        "degree Celsius [(degF-32)/1.8000]",
                    ]
                )
            if (
                "conductivity" in ds
                and "uS/cm" not in ds["conductivity"].attrs["units"]
            ):
                logger.warning(
                    "Unknown conductivity units (%s)", ds["conductivity"].attrs["units"]
                )
        else:
            logger.warning(
                "Unit conversion is not supported if standardize_variable_names=False"
            )

    ds = standardize_dataset(ds)
    return ds
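
The ambiguous_timestamps argument is passed to pandas' tz_localize, which is what raises when a local time occurs twice during the daylight-saving fall-back. A standalone sketch of the accepted values, independent of this parser:

import pandas as pd

# During the 2022 North American fall-back, 01:30 local time occurs twice.
times = pd.to_datetime(["2022-11-06 01:30:00", "2022-11-06 01:30:00"])

# ambiguous="raise" (the csv() default) fails on such duplicates:
try:
    times.tz_localize("US/Eastern", ambiguous="raise")
except Exception as error:
    print(error)

# ambiguous="infer" asks pandas to resolve the order from the series itself,
# while ambiguous="NaT" marks the undecidable values as missing:
print(times.tz_localize("US/Eastern", ambiguous="NaT"))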

xlsx(path, timezone=None, ambiguous_timestamps='infer')

Parses the Onset XLSX format generated by HOBOware into an xarray object

Parameters:

path (str, required): The path to the XLSX file
timezone (str, default None): Timezone used to localize the time variable; overrides the timezone in the file header
ambiguous_timestamps (str, default 'infer'): How to handle ambiguous timestamps

Returns: xarray.Dataset
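
A minimal usage sketch (the file name below is hypothetical):

from ocean_data_parser.parsers import onset

# Parse an XLSX export; the timezone is taken from the "Date Time" column
# header unless overridden here.
ds = onset.xlsx("hobo_export.xlsx", timezone="UTC")
print(ds)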

Source code in ocean_data_parser/parsers/onset.py
def xlsx(
    path: str, timezone: str = None, ambiguous_timestamps: str = "infer"
) -> xarray.Dataset:
    """Parses the Onset XLSX format generate by HOBOware into a xarray object

    Args:
        path: The path to the XLSX file
        timezone: Timezone to localize the time variable, overwrites the timezone in header
        ambiguous_timestamps: How to handle ambiguous time stamps. Defaults to "infer"
    Returns:
        xarray.Dataset
    """

    def _format_detail_key(key):
        """Format detail key to be more readable"""
        key = re.sub(r"\(.*\)", "", key)
        return (
            key.replace(" Info", "")
            .replace(" ", "_")
            .replace("-", "_")
            .lower()
            .replace("deployment_deployment", "deployment")
            .replace("device_device", "device")
            .replace("app_app", "app")
        )

    def _get_column_and_unit(column):
        """split column name and unit in parenthesis"""
        column = column.split(" (")
        if len(column) == 1:
            return column[0], None
        return column[0], column[1].replace(")", "")

    # Read the different sheets from the xlsx file
    data = pd.read_excel(path, sheet_name="Data", engine="openpyxl")
    events = pd.read_excel(path, sheet_name="Events", engine="openpyxl")
    details = (
        pd.read_excel(
            path,
            sheet_name="Details",
            engine="openpyxl",
            names=["group", "subgroup", "parameter", "value"],
        )
        .ffill(axis=0)
        .dropna(subset=["parameter", "value"])
    )
    details_attrs = {
        _format_detail_key(f"{row['subgroup']}_{row['parameter']}"): row["value"]
        for id, row in details.iterrows()
        if row["group"] == "Devices"
    }

    variable_attributes = {}

    for var in data.columns:
        column, unit = _get_column_and_unit(var)
        column = _format_detail_key(column)
        if column == "#":
            column = "record_number"
        elif column == "date_time":
            column = "time"
        variable_attributes[column] = {
            "long_name": column,
            "units": unit,
            "original_name": var,
        }
    data.columns = variable_attributes.keys()

    if "time" not in data.columns:
        raise ValueError("Date Time column not found in header")
    file_timezone = variable_attributes["time"].pop("units", None)
    if file_timezone:
        file_timezone = TIMEZONE_MAPPING.get(file_timezone, file_timezone)

    # Convert to dataset
    data["time"] = (
        pd.to_datetime(data["time"], errors="coerce")
        .dt.tz_localize(timezone or file_timezone, ambiguous=ambiguous_timestamps)
        .dt.tz_convert("UTC")
    )
    check_daylight_saving(data["time"])

    ds = data.to_xarray()
    for var in variable_attributes:
        ds[var].attrs = variable_attributes[var]
    ds.attrs = {**GLOBAL_ATTRIBUTES, "events": events.to_json(), **details_attrs}
    ds["instrument_type"] = _detect_instrument_type(ds)
    ds = standardize_dataset(ds)
    return ds
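
Since the Events sheet is serialized into the dataset attributes with DataFrame.to_json(), it can be recovered after parsing; a sketch, assuming ds was returned by xlsx() and the events attribute is left untouched by standardize_dataset:

import io

import pandas as pd

# ds.attrs["events"] holds the "Events" sheet as a JSON string
events = pd.read_json(io.StringIO(ds.attrs["events"]))
print(events.head())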