"""
lila_common.py
Common constants and functions related to LILA data management/retrieval.
"""
#%% Imports and constants
import os
import json
import zipfile
import pandas as pd
from urllib.parse import urlparse
from megadetector.utils.url_utils import download_url
from megadetector.utils.path_utils import unzip_file
from megadetector.utils.ct_utils import is_empty
# Primary metadata file listing all LILA camera trap datasets
lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'

# Mapping from dataset-specific labels to a common taxonomy
lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'

# Zipped .csv file with URLs and labels for every LILA camera trap image
lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'

# The Wildlife Insights taxonomy is requested as a single page, so the page size
# has to exceed the number of items in the taxonomy.
wildlife_insights_page_size = 30000
wildlife_insights_taxonomy_url = (
    'https://api.wildlifeinsights.org/api/v1/taxonomy/taxonomies-all'
    '?fields=class,order,family,genus,species,authority,taxonomyType,'
    'uniqueIdentifier,commonNameEnglish'
    '&page[size]=' + str(wildlife_insights_page_size)
)

# Local filenames for the Wildlife Insights taxonomy; the .csv name is the .json
# name with the extension swapped.
wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
wildlife_insights_taxonomy_local_csv_filename = \
    os.path.splitext(wildlife_insights_taxonomy_local_json_filename)[0] + '.csv'

# Filenames are consistent across clouds relative to these URLs
lila_base_urls = {
    'azure': 'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
    'gcp': 'https://storage.googleapis.com/public-datasets-lila/',
    'aws': 'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/',
}

# The same storage locations, using cloud-native URL schemes where available
lila_cloud_urls = {
    'azure': 'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
    'gcp': 'gs://public-datasets-lila/',
    'aws': 's3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/',
}

# Relative filenames get appended directly to the base URLs, so every base URL
# needs a trailing slash.
for url in lila_base_urls.values():
    assert url[-1:] == '/'
#%% Common functions
def read_wildlife_insights_taxonomy_mapping(metadata_dir, force_download=False):
    """
    Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.

    If the local .csv file already exists and [force_download] is False, the .csv file
    is read directly and nothing is downloaded.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the taxonomy mapping file
            even if the local file exists.

    Returns:
        pd.DataFrame: A DataFrame with taxonomy information

    Raises:
        AssertionError: if the taxonomy does not fit strictly within a single page
            of [wildlife_insights_page_size] items (paging is not implemented).
    """

    wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)

    if os.path.exists(wi_taxonomy_csv_path) and (not force_download):

        # Re-use the .csv file written by a previous call
        df = pd.read_csv(wi_taxonomy_csv_path)

    else:

        wi_taxonomy_json_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_json_filename)
        download_url(wildlife_insights_taxonomy_url, wi_taxonomy_json_path,
                     force_download=force_download)

        # JSON text is UTF-8; decode it explicitly rather than relying on the platform's
        # default encoding, which mangles (or fails on) non-ASCII text on some systems.
        with open(wi_taxonomy_json_path,'r',encoding='utf-8') as f:
            d = json.load(f)

        # We haven't implemented paging, make sure that's not an issue
        assert d['meta']['totalItems'] < wildlife_insights_page_size, \
            'WI taxonomy has {} items, which does not fit in a single page of {}'.format(
                d['meta']['totalItems'],wildlife_insights_page_size)

        # d['data'] is a list of items that look like:
        #
        # {'id': 2000003,
        #  'class': 'Mammalia',
        #  'order': 'Rodentia',
        #  'family': 'Abrocomidae',
        #  'genus': 'Abrocoma',
        #  'species': 'bennettii',
        #  'authority': 'Waterhouse, 1837',
        #  'commonNameEnglish': "Bennett's Chinchilla Rat",
        #  'taxonomyType': 'biological',
        #  'uniqueIdentifier': '7a6c93a5-bdf7-4182-82f9-7a67d23f7fe1'}
        df = pd.DataFrame(d['data'])

        # Cache as .csv so subsequent calls can skip the download
        df.to_csv(wi_taxonomy_csv_path,index=False)

    return df
def read_lila_taxonomy_mapping(metadata_dir, force_download=False):
    """
    Loads the LILA taxonomy mapping .csv file into a DataFrame, fetching it
    via download_url() first.

    The local copy lives in [metadata_dir], under the same base filename
    used in the remote URL.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): passed through to download_url(); download
            the taxonomy mapping file even if the local file exists.

    Returns:
        pd.DataFrame: a DataFrame with one row per identification
    """

    # Name the local file after the last path component of the URL
    remote_basename = os.path.basename(urlparse(lila_taxonomy_mapping_url).path)
    local_csv_path = os.path.join(metadata_dir, remote_basename)

    download_url(lila_taxonomy_mapping_url, local_csv_path,
                 force_download=force_download)

    return pd.read_csv(local_csv_path)
def read_lila_all_images_file(metadata_dir, force_download=False, read_to_dataframe=True):
    """
    Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
    all LILA files, and opens the resulting .csv file as a Pandas DataFrame.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the metadata file even if
            the local file exists.  Also forces the .csv file to be re-extracted,
            so a previously-unzipped copy doesn't shadow the fresh download.
        read_to_dataframe (bool, optional): read the .csv file into a dataframe

    Returns:
        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap
        image, or None if read_to_dataframe is False

    Raises:
        AssertionError: if the downloaded .zip file does not contain exactly one file
    """

    p = urlparse(lila_all_images_url)
    lila_all_images_zip_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_all_images_url, lila_all_images_zip_filename,
                 force_download=force_download)

    # The archive is expected to hold a single .csv file; its name determines where
    # the extracted file lands.
    with zipfile.ZipFile(lila_all_images_zip_filename,'r') as z:
        files = z.namelist()
    assert len(files) == 1, \
        'Expected exactly one file in {}, found {}'.format(
            lila_all_images_zip_filename,len(files))

    unzipped_csv_filename = os.path.join(metadata_dir,files[0])

    # When a fresh download was forced, an existing .csv file is stale and has to be
    # replaced.  NOTE(review): this assumes unzip_file() overwrites existing files,
    # as zipfile's extractall() does - confirm.
    if force_download or (not os.path.isfile(unzipped_csv_filename)):
        unzip_file(lila_all_images_zip_filename,metadata_dir)
    else:
        print('{} already unzipped'.format(unzipped_csv_filename))

    # Callers that only want the file on disk can skip loading it
    if not read_to_dataframe:
        return None

    df = pd.read_csv(unzipped_csv_filename)
    return df
#%% Interactive test driver
if False:

    # Nothing in this block runs on import; the cells below are meant to be
    # executed manually in an interactive session.
    pass

    #%% Verify that all base URLs exist

    from megadetector.utils import url_utils

    # Top-level LILA and Wildlife Insights metadata files
    urls = (lila_metadata_url,
            lila_taxonomy_mapping_url,
            lila_all_images_url,
            wildlife_insights_taxonomy_url)

    status_codes = url_utils.test_urls(urls,timeout=2.0)
    assert all(code == 200 for code in status_codes)


    #%% Verify that the metadata URLs exist for individual datasets

    metadata_dir = os.path.expanduser('~/lila/metadata')

    # NOTE(review): read_lila_metadata is not defined in the visible part of this
    # file; confirm it is available before running this cell.
    dataset_metadata = read_lila_metadata(metadata_dir)

    urls_to_test = []

    # ds_name = next(iter(dataset_metadata.keys()))
    for ds_name, ds_info in dataset_metadata.items():
        for cloud_name in lila_base_urls.keys():
            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
            # Only datasets with a bounding box file get a bbox URL check
            if ds_info['bbox_url_relative'] is not None:
                urls_to_test.append(ds_info['bbox_url_' + cloud_name])

    status_codes = url_utils.test_urls(urls_to_test,
                                       error_on_failure=True,
                                       n_workers=10,
                                       pool_type='process',
                                       timeout=2.0)
    assert all(code == 200 for code in status_codes)