def superCO2(path: str, output: str = None) -> xarray.Dataset:
    """Parse a superCO2 tab-separated text output file.

    The first line of the file must contain ``"<N> header lines"``. The
    first ``N - 1`` lines are kept as free-text header and the following
    line is handed to pandas as the column-name row of the data table.

    Args:
        path: Path to the superCO2 text file (read as UTF-8).
        output: When equal to ``"dataframe"``, return the parsed
            ``pandas.DataFrame`` and the global attributes instead of a
            dataset.

    Returns:
        The value returned by ``standardize_dataset`` for the xarray dataset
        built from the table, or a ``(DataFrame, dict)`` tuple when
        ``output == "dataframe"``.

    Raises:
        ValueError: If the first line does not declare the number of header
            lines.
    """
    with open(path, encoding="utf-8") as f:
        header = [f.readline()]
        header_match = re.search(r"(\d+) header lines", header[0])
        if header_match is None:
            logger.error("Unknown header format")
            # Without the header size the table cannot be located; fail with
            # an explicit error instead of an unbound-variable crash below.
            raise ValueError(
                f"Unknown header format in {path!r}: "
                "expected '<N> header lines' on the first line"
            )
        n_header_lines = int(header_match[1])

        # Read the rest of the header lines. One line was already consumed
        # and the last declared line is left for pandas as the column header.
        for _ in range(max(n_header_lines - 2, 0)):
            header.append(f.readline())

        # Read the column header and data with pandas
        df = pd.read_csv(
            f,
            sep=r"\t",
            engine="python",
            dtype=superCO2_dtypes,
            na_values=[-999, "NaN"],
        )

    # The collection start date, when present, sits on the line following the
    # "Collected beginning on" label.
    if len(header) > 3 and "Collected beginning on" in header[2]:
        collected_beginning_date = pd.to_datetime(header[3].strip())
    else:
        collected_beginning_date = pd.NaT

    # Reformat variable names
    df.columns = [_format_variables(var) for var in df.columns]

    # Generate time variable from Date and Time columns
    df["time"] = pd.to_datetime(
        (df["Date"] + " " + df["Time"]), format="%Y%m%d %H%M%S", utc=True
    ).dt.tz_convert(None)

    # Pick the year used as origin of the day-of-year variable. When the
    # header carries no collection date, fall back to the first valid
    # Date + Time record rather than failing on NaT.year.
    if pd.isna(collected_beginning_date):
        valid_times = df["time"].dropna()
        reference_year = None if valid_times.empty else valid_times.iloc[0].year
    else:
        reference_year = collected_beginning_date.year

    # Review day of the year variable
    if reference_year is None:
        df["time_doy_utc"] = pd.NaT
    else:
        df["time_doy_utc"] = pd.to_datetime(
            df["DOY_UTC"] - 1,
            unit="D",
            origin=pd.Timestamp(reference_year, 1, 1),
            utc=True,
        ).dt.tz_convert(None)

    # Compare DOY_UTC vs Date + Time
    time_difference = df["time"] - df["time_doy_utc"]
    dt = time_difference.mean().total_seconds()
    dt_std = time_difference.std().total_seconds()
    # Use the magnitude so that a large negative offset is reported as well.
    if abs(dt) > MAXIMUM_TIME_DIFFERENCE_IN_SECONDS:
        logger.warning(
            "Date + Time and DOY_UTC variables have an average time difference of %ss>%ss with a standard deviation of %ss",
            abs(dt),
            MAXIMUM_TIME_DIFFERENCE_IN_SECONDS,
            dt_std,
        )

    global_attributes = {
        # Drop the trailing line break left by readline().
        "title": header[1].rstrip("\r\n") if len(header) > 1 else "",
        "collected_beginning_date": collected_beginning_date,
    }
    if output == "dataframe":
        return df, global_attributes

    # Convert to an xarray dataset
    ds = df.to_xarray()
    ds.attrs = global_attributes
    return standardize_dataset(ds)