Source code for pyobistools.taxa

#!/usr/bin/env python
# coding=utf-8

import typing as t
from functools import partial

import numpy as np
import pandas as pd
import pyworms
import requests

from pyobistools.utils import removesuffix

from pyobistools.validation import check_scientificname_and_ids as check_names



[docs]
# Standard output schema shared by the search_* functions in this module.
#
# * The keys (in this order) are the columns that `search` selects for its
#   returned DataFrame.
# * The values are the defaults the search_* functions fill into any result
#   row that is missing the key.
# * The *type* of each default also drives `_standardize_types`:
#   bool defaults -> column cast with astype(bool), str defaults -> column
#   cast with astype(str), np.nan defaults -> column passed to pd.to_numeric.
STANDARD_SPECIES_COLUMNS = {
    'taxon_id': np.nan,
    'url': '',
    'scientificname': '',
    'authority': '',
    'status': '',
    'unacceptreason': '',
    'taxon_rank_id': np.nan,
    'rank': '',
    'valid_taxon_id': np.nan,
    'valid_name': '',
    'valid_authority': '',
    'parent_name_usage_id': np.nan,
    'kingdom': '',
    'phylum': '',
    'class': '',
    'order': '',
    'family': '',
    'genus': '',
    'citation': '',
    'lsid': '',
    'is_marine': False,
    'is_brackish': False,
    'is_fresh_water': False,
    'is_terrestrial': False,
    'is_extinct': False,
    'match_type': '',
    'modified': '',
    # Bookkeeping columns set by the search_* functions themselves:
    # whether the input produced a record, the input it belongs to and
    # the service ('worms', 'itis' or 'obis') the row came from.
    'matched': False,
    'match_input': '',
    'match_from': '',
}




[docs]
def _standardize_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Coerce the standard columns of ``df`` to the type of their default value.

    Columns not listed in STANDARD_SPECIES_COLUMNS are left untouched. The
    frame is modified in place and also returned.

    Args:
        df (pd.DataFrame): Frame whose standard columns should be coerced

    Returns:
        pd.DataFrame: The same frame, with coerced columns
    """
    for column in df.columns:
        if column not in STANDARD_SPECIES_COLUMNS:
            continue

        # The type of the default value decides the target type
        default = STANDARD_SPECIES_COLUMNS[column]
        if isinstance(default, bool):
            df[column] = df[column].astype(bool)
        elif isinstance(default, str):
            df[column] = df[column].astype(str)
        elif np.isnan(default):
            df[column] = pd.to_numeric(df[column])

    return df




[docs]
def remove_suffix(name: str) -> str:
    """
    Remove a trailing "sp"/"spp" style suffix from a scientific name.

    Trailing whitespace is ignored when looking for the suffix and the result
    never carries surrounding whitespace, so ``'Abra sp. '`` and
    ``'Abra  sp.'`` both yield ``'Abra'``. At most one suffix is removed.

    Args:
        name (str): Scientific name, optionally ending in ' sp.', ' spp.',
            ' sp' or ' spp'

    Returns:
        str: The name without the suffix and without surrounding whitespace
    """
    suffixes = (
        ' sp.',
        ' spp.',
        ' sp',
        ' spp',
    )

    # Trailing whitespace would otherwise hide the suffix from endswith()
    candidate = name.rstrip()
    for suf in suffixes:
        if candidate.endswith(suf):
            # Strip again: whitespace may precede the suffix ('Abra  sp.')
            return candidate[:-len(suf)].strip()

    # No suffix was found, return the name with whitespace removed
    return candidate.strip()




[docs]
def add_suffix(name: str) -> t.List[str]:
    """
    Build the "sp"/"spp" suffixed variants of a name for searching ITIS.

    An existing suffix (' sp.', ' spp.', ' sp' or ' spp') is removed first,
    ignoring trailing whitespace, so ``'Abra'``, ``'Abra sp.'`` and
    ``'Abra sp. '`` all produce the same list.

    Args:
        name (str): Scientific name, with or without a suffix

    Returns:
        t.List[str]: The whitespace-stripped base name followed by each suffix
    """
    suffixes = [
        ' sp.',
        ' spp.',
        ' sp',
        ' spp',
    ]

    # Trailing whitespace would otherwise hide an existing suffix
    base = name.rstrip()
    for suf in suffixes:
        if base.endswith(suf):
            # Strip off the existing suffix and stop looking
            base = base[:-len(suf)]
            break

    # Return a name for each suffix with whitespace removed
    base = base.strip()
    return [base + suf for suf in suffixes]




[docs]
def match_taxa(names, ask=True, itis_usage=False):
    """
    Expose the name check from the validation module under the function name
    used by R's iobis/obistools.

    @param names    List of scientific names to check against
    @param ask      Do we ask the user to resolve multi-match or ambiguous names?
                    (accepted for API compatibility; not read by this wrapper)
    @param itis_usage   ITIS check setting, handed unchanged to the validation function

    @return whatever check_scientificname_and_ids returns for value='names'
    """
    checker = check_names.check_scientificname_and_ids
    return checker(names, value='names', itis_usage=itis_usage)




[docs]
def search_worms(names: t.List[str],
                 kwargs: t.Optional[t.Dict[str, t.Any]] = None) -> pd.DataFrame:
    """
    Searches WoRMS for records based on a list of scientific names and returns
    a standardized pandas DataFrame representing the results

    Args:
        names (t.List[str]): List of scientific names to match. A single
            string is treated as a one element list.
        kwargs (t.Dict[str, t.Any]): keyword arguments passed through to
            pyworms' "aphiaRecordsByMatchNames" function

    Returns:
        pd.DataFrame: Species records. Each input name yields one row per
            WoRMS record, or a single row with matched=False when nothing
            was returned for it. The standard columns are always present,
            even when "names" is empty.
    """
    if kwargs is None:
        kwargs = {}

    # Accept a single name; iterating a bare string would search per character
    if isinstance(names, str):
        names = [names]

    # WoRMS doesn't like suffixes so remove them
    suffixless_names = [remove_suffix(s) for s in names]

    # Renames from pyworms output to the standard columns
    renames = {
        'AphiaID': 'taxon_id',
        'valid_AphiaID': 'valid_taxon_id',
        'taxonRankID': 'taxon_rank_id',
        'isExtinct': 'is_extinct',
        'parentNameUsageID': 'parent_name_usage_id',
        'isFreshwater': 'is_fresh_water',
        'isTerrestrial': 'is_terrestrial',
        'isMarine': 'is_marine',
        'isBrackish': 'is_brackish',
    }

    results = pyworms.aphiaRecordsByMatchNames(
        suffixless_names,
        **kwargs
    )

    rows = []
    # Results are consumed positionally: the i-th entry belongs to the i-th
    # name, and the original (unstripped) name is carried into the output
    for match_input, name_results in zip(names, results):

        # Track inputs which did not return any data (empty list or None)
        if not name_results:
            rows.append({'match_input': match_input, 'matched': False})
            continue

        for row in name_results:
            for k, v in renames.items():
                if k in row:
                    row[v] = row.pop(k)
            row.update({
                'match_input': match_input,
                'matched': True
            })
            rows.append(row)

    # Now standardize the columns: fill in the ones that don't exist
    for r in rows:
        for k, v in STANDARD_SPECIES_COLUMNS.items():
            r.setdefault(k, v)

    if rows:
        results = pd.DataFrame(rows)
    else:
        # No input at all: still hand back the standard schema
        results = pd.DataFrame(columns=list(STANDARD_SPECIES_COLUMNS))

    results['match_from'] = 'worms'
    results = _standardize_types(results)
    return results




[docs]
def search_itis(names: t.List[str],
                kwargs: t.Optional[t.Dict[str, t.Any]] = None) -> pd.DataFrame:
    """
    Searches ITIS for records based on a list of scientific names and returns
    a standardized pandas DataFrame representing the results

    This is currently a placeholder: ITIS is never contacted and every input
    name is reported with matched=False.

    Args:
        names (t.List[str]): List of scientific names to match. A single
            string is treated as a one element list.
        kwargs (t.Dict[str, t.Any]): reserved for ITIS options, currently unused

    Returns:
        pd.DataFrame: Species records, one unmatched row per input name. The
            standard columns are always present, even when "names" is empty.
    """
    # Accept a single name; iterating a bare string would yield a row per character
    if isinstance(names, str):
        names = [names]

    # TODO: Hit ITIS API to return results. ITIS may want the suffixed
    # variants of each name (see add_suffix), and the response would need
    # renames along the lines of:
    #     'tsn': 'taxon_id',
    #     'combinedName': 'valid_name',

    # This is a placeholder for now, we don't actually hit ITIS at all
    rows = [{'match_input': n, 'matched': False} for n in names]

    # Now standardize the columns: fill in the ones that don't exist
    for r in rows:
        for k, v in STANDARD_SPECIES_COLUMNS.items():
            r.setdefault(k, v)

    if rows:
        results = pd.DataFrame(rows)
    else:
        # No input at all: still hand back the standard schema so the
        # column accesses below do not fail
        results = pd.DataFrame(columns=list(STANDARD_SPECIES_COLUMNS))

    def _lsid(tsn):
        # taxon_id is a float column whenever NaN is present; drop the ".0"
        # so the identifier reads "...:180543" rather than "...:180543.0"
        if isinstance(tsn, float) and tsn.is_integer():
            tsn = int(tsn)
        return "urn:lsid:itis.gov:itis_tsn:" + str(tsn)

    # Set the lsid when a taxon_id is defined
    has_id = results['taxon_id'].notna()
    if has_id.any():
        results.loc[has_id, 'lsid'] = results.loc[has_id, 'taxon_id'].map(_lsid)

    results['valid_taxon_id'] = results['taxon_id'].copy()
    results['match_from'] = 'itis'
    results = _standardize_types(results)
    return results




[docs]
def search_obis(names: t.List[str],
                kwargs: t.Optional[t.Dict[str, t.Any]] = None) -> pd.DataFrame:
    """
    Searches OBIS for records based on a list of scientific names and returns
    a standardized pandas DataFrame representing the results

    Args:
        names (t.List[str]): List of scientific names to match. A single
            string is treated as a one element list.
        kwargs (t.Dict[str, t.Any]): options; only "url" (base URL of the
            OBIS API, with trailing slash) is read. The dict is not modified.

    Returns:
        pd.DataFrame: Species records. Each input name yields one row per
            OBIS result, or a single row with matched=False when the request
            returned an HTTP error status or no results. "match_input" holds
            the name as it was passed in. The standard columns are always
            present, even when "names" is empty.
    """
    # Accept a single name; iterating a bare string would query per character
    if isinstance(names, str):
        names = [names]

    # Read the option instead of popping it so the caller's dict (which may
    # be reused for another call) keeps its "url" entry
    obis_api = (kwargs or {}).get('url', 'https://api.obis.org/v3/')
    http_headers = {
        'content-type': 'application/json; charset=utf-8'
    }

    renames = {
        'taxonRank': 'rank',
        'scientificName': 'scientificname',
        'scientificNameAuthorship': 'authority',
        'taxonID': 'taxon_id',
        'taxonomicStatus': 'status',
        'acceptedNameUsage': 'valid_name',
    }

    rows = []
    for match_input in names:
        # Query without "sp."/"spp." style suffixes (as done for WoRMS), but
        # report the original input name in the output
        query_name = remove_suffix(match_input)

        r = requests.get(
            f'{obis_api}taxon/{query_name}',
            headers=http_headers
        )
        try:
            r.raise_for_status()
        except requests.HTTPError:
            # HTTP error status, record the name as unmatched
            rows.append({'match_input': match_input, 'matched': False})
            continue

        results = r.json()['results']
        if not results:
            rows.append({'match_input': match_input, 'matched': False})
            continue

        for row in results:
            for k, v in renames.items():
                if k in row:
                    row[v] = row.pop(k)
            row.update({
                'match_input': match_input,
                'matched': True
            })
            rows.append(row)

    # Now standardize the columns: fill in the ones that don't exist
    for row in rows:
        for k, v in STANDARD_SPECIES_COLUMNS.items():
            row.setdefault(k, v)

    # Standardize the OBIS return data format
    if rows:
        results = pd.DataFrame(rows)
    else:
        # No input at all: still hand back the standard schema
        results = pd.DataFrame(columns=list(STANDARD_SPECIES_COLUMNS))

    results['match_from'] = 'obis'
    results = _standardize_types(results)

    return results




[docs]
def search(names: t.List[str],
           worms_kwargs: t.Optional[t.Dict[str, t.Any]] = None,
           itis_kwargs: t.Optional[t.Dict[str, t.Any]] = None,
           obis_kwargs: t.Optional[t.Dict[str, t.Any]] = None,
           order: t.Optional[t.List[str]] = None,
           quick: bool = False) -> pd.DataFrame:
    """
    Search a list of scientific names in WoRMS, ITIS and OBIS and return the resulting
    record information in a DataFrame. Tries WoRMS first, ITIS second, and OBIS third unless
    another order is specified with the "order" parameter.

    Args:
        names (t.List[str]): List of scientific names to match. A single string is
            treated as a one element list.
        worms_kwargs (t.Dict[str, t.Any]): keyword arguments to pass to pyworms'
            "aphiaRecordsByMatchNames" function
        itis_kwargs (t.Dict[str, t.Any]): keyword arguments to pass to ITIS
        obis_kwargs (t.Dict[str, t.Any]): keyword arguments to pass to OBIS
        order (t.List[str]): order in which to check external services, defaults to
            ['worms', 'itis', 'obis']. To only check a subset of sources set to a smaller list.
            Entries that are not one of these three names are ignored.
        quick (bool): Stop when the first match is found and return rather than query additional
            services
    Returns:
        pd.DataFrame: Species records from every service that was queried, restricted to
            the standard columns. Empty (but with the standard columns) when no service
            was queried.
    """
    # Accept inputs that are strings (single species)
    if isinstance(names, str):
        names = [names]

    if order is None:
        order = ['worms', 'itis', 'obis']

    # A mapping between the "order" and the function to call for each one.
    # Each search function takes its options as a single "kwargs" dict, so the
    # options must be bound to that parameter; unpacking them with ** would
    # hand e.g. url=... to a function that has no such parameter. Copies are
    # passed so the caller's dicts cannot be modified downstream.
    funcs = {
        'worms': partial(search_worms, kwargs=dict(worms_kwargs or {})),
        'itis': partial(search_itis, kwargs=dict(itis_kwargs or {})),
        'obis': partial(search_obis, kwargs=dict(obis_kwargs or {})),
    }

    standard_columns = list(STANDARD_SPECIES_COLUMNS)

    # Call individual search functions until one doesn't return empty
    all_results = []

    for o in order:
        if o not in funcs:
            continue

        results = funcs[o](names)
        all_results.append(results)

        # Break if we want to stop searching on first match
        if quick and results['matched'].any():
            break

    # Nothing was queried (empty "order" or only unknown service names);
    # pd.concat would raise on an empty list
    if not all_results:
        return pd.DataFrame(columns=standard_columns)

    # Return all results subset by the standard columns. reindex also copes
    # with a frame that is missing one of the standard columns.
    results = pd.concat(all_results, ignore_index=True)
    results = results.reindex(columns=standard_columns)
    return results
Source code for pyobistools.taxa

pyobistools

Navigation

Related Topics