"""
camtrap_dp_to_coco.py
Parse a very limited subset of the Camtrap DP data package format:
https://camtrap-dp.tdwg.org/
...and convert to COCO format. Assumes that all required metadata files have been
put in the same directory (which is standard).
Does not currently parse bounding boxes, just attaches species labels to images.
Currently supports only sequence-level labeling.
"""
#%% Imports and constants
import os
import json
import argparse
import pandas as pd
from dateutil import parser as dateparser
from collections import defaultdict
#%% Functions
[docs]
def camtrap_dp_to_coco(camtrap_dp_folder,output_file=None):
"""
Convert the Camtrap DP package in [camtrap_dp_folder] to COCO.
Does not validate images, just converts. Use integrity_check_json_db to validate
the resulting COCO file.
Optionally writes the results to [output_file]
Args:
camtrap_dp_folder (str): input folder, containing a CamtrapDP package
output_file (str, optional): COCO-formatted output file
"""
required_files = ('datapackage.json','deployments.csv','events.csv','media.csv','observations.csv')
for fn in required_files:
fn_abs = os.path.join(camtrap_dp_folder,fn)
assert os.path.isfile(fn_abs), 'Could not find required file {}'.format(fn_abs)
with open(os.path.join(camtrap_dp_folder,'datapackage.json'),'r') as f:
datapackage = json.load(f)
assert datapackage['profile'] == 'https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0/camtrap-dp-profile.json', \
'I only know how to parse Camtrap DP 1.0 packages'
deployments_file = None
events_file = None
media_file = None
observations_file = None
resources = datapackage['resources']
for r in resources:
if r['name'] == 'deployments':
deployments_file = r['path']
elif r['name'] == 'media':
media_file = r['path']
elif r['name'] == 'events':
events_file = r['path']
elif r['name'] == 'observations':
observations_file = r['path']
assert deployments_file is not None, 'No deployment file specified'
assert events_file is not None, 'No events file specified'
assert media_file is not None, 'No media file specified'
assert observations_file is not None, 'No observation file specified'
deployments_df = pd.read_csv(os.path.join(camtrap_dp_folder,deployments_file))
events_df = pd.read_csv(os.path.join(camtrap_dp_folder,events_file))
media_df = pd.read_csv(os.path.join(camtrap_dp_folder,media_file))
observations_df = pd.read_csv(os.path.join(camtrap_dp_folder,observations_file))
print('Read {} deployment lines'.format(len(deployments_df)))
print('Read {} events lines'.format(len(events_df)))
print('Read {} media lines'.format(len(media_df)))
print('Read {} observation lines'.format(len(observations_df)))
media_id_to_media_info = {}
# i_row = 0; row = media_df.iloc[i_row]
for i_row,row in media_df.iterrows():
media_info = {}
media_info['file_name'] = os.path.join(row['filePath'],row['fileName']).replace('\\','/')
media_info['location'] = row['deploymentID']
media_info['id'] = row['mediaID']
media_info['datetime'] = row['timestamp']
media_info['datetime'] = dateparser.parse(media_info['datetime'])
media_info['frame_num'] = -1
media_info['seq_num_frames'] = -1
media_id_to_media_info[row['mediaID']] = media_info
event_id_to_media_ids = defaultdict(list)
# i_row = 0; row = events_df.iloc[i_row]
for i_row,row in events_df.iterrows():
media_id = row['mediaID']
assert media_id in media_id_to_media_info
event_id_to_media_ids[row['eventID']].append(media_id)
event_id_to_category_names = defaultdict(set)
# i_row = 0; row = observations_df.iloc[i_row]
for i_row,row in observations_df.iterrows():
if row['observationLevel'] != 'event':
raise ValueError("I don't know how to parse image-level events yet")
if row['observationType'] == 'blank':
event_id_to_category_names[row['eventID']].add('empty')
elif row['observationType'] == 'unknown':
event_id_to_category_names[row['eventID']].add('unknown')
elif row['observationType'] == 'human':
assert row['scientificName'] == 'Homo sapiens'
event_id_to_category_names[row['eventID']].add(row['scientificName'])
else:
assert row['observationType'] == 'animal'
assert isinstance(row['scientificName'],str)
event_id_to_category_names[row['eventID']].add(row['scientificName'])
# Sort images within an event into frame numbers
#
# event_id = next(iter(event_id_to_media_ids))
for event_id in event_id_to_media_ids.keys():
media_ids_this_event = event_id_to_media_ids[event_id]
media_info_this_event = [media_id_to_media_info[media_id] for media_id in media_ids_this_event]
media_info_this_event = sorted(media_info_this_event, key=lambda x: x['datetime'])
for i_media,media_info in enumerate(media_info_this_event):
media_info['frame_num'] = i_media
media_info['seq_num_frames'] = len(media_info_this_event)
media_info['seq_id'] = event_id
# Create category names
category_name_to_category_id = {'empty':0}
for event_id in event_id_to_category_names:
category_names_this_event = event_id_to_category_names[event_id]
for name in category_names_this_event:
if name not in category_name_to_category_id:
category_name_to_category_id[name] = len(category_name_to_category_id)
# Move everything into COCO format
images = list(media_id_to_media_info.values())
categories = []
for name in category_name_to_category_id:
categories.append({'name':name,'id':category_name_to_category_id[name]})
info = {'version':1.0,'description':datapackage['name']}
# Create annotations
annotations = []
for event_id in event_id_to_media_ids.keys():
i_ann = 0
media_ids_this_event = event_id_to_media_ids[event_id]
media_info_this_event = [media_id_to_media_info[media_id] for media_id in media_ids_this_event]
categories_this_event = event_id_to_category_names[event_id]
for im in media_info_this_event:
for category_name in categories_this_event:
ann = {}
ann['id'] = event_id + '_' + str(i_ann)
i_ann += 1
ann['image_id'] = im['id']
ann['category_id'] = category_name_to_category_id[category_name]
ann['sequence_level_annotation'] = True
annotations.append(ann)
coco_data = {}
coco_data['images'] = images
coco_data['annotations'] = annotations
coco_data['categories'] = categories
coco_data['info'] = info
for im in coco_data['images']:
im['datetime'] = str(im['datetime'] )
if output_file is not None:
with open(output_file,'w') as f:
json.dump(coco_data,f,indent=1)
return coco_data
#%% Interactive driver
if False:
pass
#%%
camtrap_dp_folder = r'C:\temp\pilot2\pilot2'
coco_file = os.path.join(camtrap_dp_folder,'test-coco.json')
coco_data = camtrap_dp_to_coco(camtrap_dp_folder,
output_file=coco_file)
#%% Validate
from megadetector.data_management.databases.integrity_check_json_db import \
integrity_check_json_db, IntegrityCheckOptions
options = IntegrityCheckOptions()
options.baseDir = camtrap_dp_folder
options.bCheckImageSizes = False
options.bCheckImageExistence = True
options.bFindUnusedImages = True
options.bRequireLocation = True
options.iMaxNumImages = -1
options.nThreads = 1
options.verbose = True
sorted_categories, data, error_info = integrity_check_json_db(coco_file,options)
#%% Preview
from megadetector.visualization.visualize_db import DbVizOptions, visualize_db
options = DbVizOptions()
options.parallelize_rendering = True
options.parallelize_rendering_with_threads = True
options.parallelize_rendering_n_cores = 10
preview_dir = r'c:\temp\camtrapdp-preview'
html_output_file, image_db = visualize_db(coco_file, preview_dir, camtrap_dp_folder, options=options)
from megadetector.utils.path_utils import open_file
open_file(html_output_file)
#%% Command-line driver
def main():
"""
Command-line interface to convert Camtrap DP to COCO.
"""
parser = argparse.ArgumentParser(description='Convert Camtrap DP to COCO format')
parser.add_argument('camtrap_dp_folder', type=str,
help='Input folder, containing a CamtrapDP package')
parser.add_argument('--output_file', type=str, default=None,
help='COCO-formatted output file (defaults to [camtrap_dp_folder]_coco.json)')
args = parser.parse_args()
if args.output_file is None:
# Default output file name: [camtrap_dp_folder]_coco.json
#
# Remove trailing slash if present
folder_name = args.camtrap_dp_folder.rstrip(os.sep)
output_file = folder_name + '_coco.json'
else:
output_file = args.output_file
camtrap_dp_to_coco(camtrap_dp_folder=args.camtrap_dp_folder, output_file=output_file)
print(f"Successfully converted Camtrap DP package at '{args.camtrap_dp_folder}' to " + \
f"COCO format at '{output_file}'")
if __name__ == '__main__':
main()