Source code for megadetector.data_management.lila.lila_common

"""

lila_common.py

Common constants and functions related to LILA data management/retrieval.

"""

#%% Imports and constants

import os
import json
import zipfile
import pandas as pd

from urllib.parse import urlparse

from megadetector.utils.url_utils import download_url
from megadetector.utils.path_utils import unzip_file
from megadetector.utils.ct_utils import is_empty

# LILA camera trap primary metadata file
lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'
lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'

wildlife_insights_page_size = 30000
wildlife_insights_taxonomy_url = 'https://api.wildlifeinsights.org/api/v1/taxonomy/taxonomies-all?fields=class,order,family,genus,species,authority,taxonomyType,uniqueIdentifier,commonNameEnglish&page[size]={}'.format(
    wildlife_insights_page_size)
wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
wildlife_insights_taxonomy_local_csv_filename = \
    wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

# Filenames are consistent across clouds relative to these URLs
lila_base_urls = {
    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
}

lila_cloud_urls = {
    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
    'gcp':'gs://public-datasets-lila/',
    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
}

for url in lila_base_urls.values():
    assert url.endswith('/')


#%% Common functions


[docs]
def read_wildlife_insights_taxonomy_mapping(metadata_dir, force_download=False):
    """
    Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the taxonomy mapping file
            even if the local file exists.

    Returns:
        pd.dataframe: A DataFrame with taxonomy information
    """

    wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)

    if os.path.exists(wi_taxonomy_csv_path) and (not force_download):
        df = pd.read_csv(wi_taxonomy_csv_path)
    else:
        wi_taxonomy_json_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_json_filename)
        download_url(wildlife_insights_taxonomy_url, wi_taxonomy_json_path,
                     force_download=force_download)
        with open(wi_taxonomy_json_path,'r') as f:
            d = json.load(f)

        # We haven't implemented paging, make sure that's not an issue
        assert d['meta']['totalItems'] < wildlife_insights_page_size

        # d['data'] is a list of items that look like:
        """
         {'id': 2000003,
         'class': 'Mammalia',
         'order': 'Rodentia',
         'family': 'Abrocomidae',
         'genus': 'Abrocoma',
         'species': 'bennettii',
         'authority': 'Waterhouse, 1837',
         'commonNameEnglish': "Bennett's Chinchilla Rat",
         'taxonomyType': 'biological',
         'uniqueIdentifier': '7a6c93a5-bdf7-4182-82f9-7a67d23f7fe1'}
        """
        df = pd.DataFrame(d['data'])
        df.to_csv(wi_taxonomy_csv_path,index=False)

    return df




[docs]
def read_lila_taxonomy_mapping(metadata_dir, force_download=False):
    """
    Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the taxonomy mapping file
            even if the local file exists.

    Returns:
        pd.DataFrame: a DataFrame with one row per identification
    """

    p = urlparse(lila_taxonomy_mapping_url)
    taxonomy_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_taxonomy_mapping_url, taxonomy_filename,
                 force_download=force_download)

    df = pd.read_csv(taxonomy_filename)

    return df




[docs]
def read_lila_metadata(metadata_dir, force_download=False):
    """
    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the metadata file even if
            the local file exists.

    Returns:
        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
        with keys corresponding to the headers in the .csv file, currently:

        - name
        - short_name
        - continent
        - country
        - region
        - image_base_url_relative
        - bbox_url_relative
        - image_base_url_gcp
        - metadata_url_gcp
        - bbox_url_gcp
        - image_base_url_aws
        - metadata_url_aws
        - bbox_url_aws
        - image_base_url_azure
        - metadata_url_azure
        - box_url_azure
        - mdv4_results_raw
        - mdv5b_results_raw
        - md_results_with_rde
        - json_filename
    """

    # Put the master metadata file in the same folder where we're putting images
    p = urlparse(lila_metadata_url)
    metadata_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_metadata_url, metadata_filename, force_download=force_download)

    df = pd.read_csv(metadata_filename)

    records = df.to_dict('records')

    # Parse into a table keyed by dataset name
    metadata_table = {}

    # r = records[0]
    for r in records:
        if is_empty(r['name']):
            continue

        # Convert NaN's to None
        for k in r.keys():
            if is_empty(r[k]):
                r[k] = None

        metadata_table[r['name']] = r

    return metadata_table




[docs]
def read_lila_all_images_file(metadata_dir, force_download=False, read_to_dataframe=True):
    """
    Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
    all LILA files, and opens the resulting .csv file as a Pandas DataFrame.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the metadata file even if
            the local file exists.
        read_to_dataframe (bool, optional): read the .csv file into a dataframe

    Returns:
        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap
        image, or None if read_to_dataframe is False
    """

    p = urlparse(lila_all_images_url)
    lila_all_images_zip_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_all_images_url, lila_all_images_zip_filename,
                 force_download=force_download)

    with zipfile.ZipFile(lila_all_images_zip_filename,'r') as z:
        files = z.namelist()
    assert len(files) == 1

    unzipped_csv_filename = os.path.join(metadata_dir,files[0])
    if not os.path.isfile(unzipped_csv_filename):
        unzip_file(lila_all_images_zip_filename,metadata_dir)
    else:
        print('{} already unzipped'.format(unzipped_csv_filename))

    if not read_to_dataframe:
        return None

    df = pd.read_csv(unzipped_csv_filename)
    return df




[docs]
def read_metadata_file_for_dataset(ds_name,
                                   metadata_dir,
                                   metadata_table=None,
                                   json_url=None,
                                   preferred_cloud='gcp',
                                   force_download=False):
    """
    Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.

    Args:
        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
            "Caltech Camera Traps")
        metadata_dir (str): folder to use for temporary LILA metadata files
        metadata_table (dict, optional): an optional dictionary already loaded via
            read_lila_metadata()
        json_url (str, optional): the URL of the metadata file, if None will be retrieved
            via read_lila_metadata()
        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
        force_download (bool, optional): download the metadata file even if
            the local file exists.

    Returns:
        str: the .json filename on the local disk

    """

    if preferred_cloud is None:
        preferred_cloud = 'gcp'

    assert preferred_cloud in lila_base_urls.keys()

    if json_url is None:

        if metadata_table is None:
            metadata_table = read_lila_metadata(metadata_dir)

        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]

    p = urlparse(json_url)
    json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(json_url, json_filename, force_download=force_download)

    # Unzip if necessary
    if json_filename.endswith('.zip'):

        with zipfile.ZipFile(json_filename,'r') as z:
            files = z.namelist()
        assert len(files) == 1
        unzipped_json_filename = os.path.join(metadata_dir,files[0])
        if not os.path.isfile(unzipped_json_filename):
            unzip_file(json_filename,metadata_dir)
        else:
            print('{} already unzipped'.format(unzipped_json_filename))
        json_filename = unzipped_json_filename

    return json_filename



#%% Interactive test driver

if False:

    pass

    #%% Verify that all base URLs exist

    # LILA camera trap primary metadata file
    urls = (lila_metadata_url,
            lila_taxonomy_mapping_url,
            lila_all_images_url,
            wildlife_insights_taxonomy_url)

    from megadetector.utils import url_utils

    status_codes = url_utils.test_urls(urls,timeout=2.0)
    assert all([code == 200 for code in status_codes])


    #%% Verify that the metadata URLs exist for individual datasets

    metadata_dir = os.path.expanduser('~/lila/metadata')

    dataset_metadata = read_lila_metadata(metadata_dir)

    urls_to_test = []

    # ds_name = next(iter(dataset_metadata.keys()))
    for ds_name in dataset_metadata.keys():

        ds_info = dataset_metadata[ds_name]
        for cloud_name in lila_base_urls.keys():
            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
            if ds_info['bbox_url_relative'] is not None:
                urls_to_test.append(ds_info['bbox_url_' + cloud_name])

    status_codes = url_utils.test_urls(urls_to_test,
                                       error_on_failure=True,
                                       n_workers=10,
                                       pool_type='process',
                                       timeout=2.0)
    assert all([code == 200 for code in status_codes])