# Source code for megadetector.data_management.camtrap_dp_to_coco

"""

camtrap_dp_to_coco.py

Parse a very limited subset of the Camtrap DP data package format:

https://camtrap-dp.tdwg.org/

...and convert to COCO format.  Assumes that all required metadata files have been
put in the same directory (which is standard).

Does not currently parse bounding boxes, just attaches species labels to images.

Currently supports only sequence-level labeling.

"""

#%% Imports and constants

import os
import json
import argparse

import pandas as pd

from dateutil import parser as dateparser

from collections import defaultdict


#%% Functions


# [docs]
def camtrap_dp_to_coco(camtrap_dp_folder,output_file=None):
    """
    Convert the Camtrap DP package in [camtrap_dp_folder] to COCO.

    Only event-level (i.e., sequence-level) observations are supported: every image
    in an event receives one annotation for each category observed in that event.
    Bounding boxes are not parsed.

    Does not validate images, just converts.  Use integrity_check_json_db to validate
    the resulting COCO file.

    Optionally writes the results to [output_file]

    Args:
        camtrap_dp_folder (str): input folder, containing a CamtrapDP package
        output_file (str, optional): COCO-formatted output file

    Returns:
        dict: COCO-formatted data, with the fields "images", "annotations",
        "categories", and "info"

    Raises:
        AssertionError: if a required file or resource is missing, the package is not
            a Camtrap DP 1.0 package, an event refers to an unknown media ID, or an
            observation has an unexpected type or scientific name
        ValueError: if an observation is not event-level
    """

    def _to_native(value):
        """
        Convert numpy scalars (which pandas can produce for numeric ID columns) to
        built-in Python types, so they can be concatenated with strings and written
        to JSON.  Other values are returned unchanged.
        """
        item_fn = getattr(value,'item',None)
        return item_fn() if callable(item_fn) else value

    required_files = ('datapackage.json','deployments.csv','events.csv','media.csv','observations.csv')

    for fn in required_files:
        fn_abs = os.path.join(camtrap_dp_folder,fn)
        assert os.path.isfile(fn_abs), 'Could not find required file {}'.format(fn_abs)

    # Read the descriptor as UTF-8 explicitly, rather than relying on the platform's
    # default encoding
    with open(os.path.join(camtrap_dp_folder,'datapackage.json'),'r',encoding='utf-8') as f:
        datapackage = json.load(f)

    assert datapackage['profile'] == 'https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0/camtrap-dp-profile.json', \
        'I only know how to parse Camtrap DP 1.0 packages'

    # Find the relative path of each table we need; if a resource name is repeated,
    # the last one wins
    resource_name_to_path = {}
    for r in datapackage['resources']:
        if r['name'] in ('deployments','events','media','observations'):
            resource_name_to_path[r['name']] = r['path']

    deployments_file = resource_name_to_path.get('deployments')
    events_file = resource_name_to_path.get('events')
    media_file = resource_name_to_path.get('media')
    observations_file = resource_name_to_path.get('observations')

    assert deployments_file is not None, 'No deployment file specified'
    assert events_file is not None, 'No events file specified'
    assert media_file is not None, 'No media file specified'
    assert observations_file is not None, 'No observation file specified'

    deployments_df = pd.read_csv(os.path.join(camtrap_dp_folder,deployments_file))
    events_df = pd.read_csv(os.path.join(camtrap_dp_folder,events_file))
    media_df = pd.read_csv(os.path.join(camtrap_dp_folder,media_file))
    observations_df = pd.read_csv(os.path.join(camtrap_dp_folder,observations_file))

    print('Read {} deployment lines'.format(len(deployments_df)))
    print('Read {} events lines'.format(len(events_df)))
    print('Read {} media lines'.format(len(media_df)))
    print('Read {} observation lines'.format(len(observations_df)))

    # Each media row becomes one COCO image record
    media_id_to_media_info = {}

    # i_row = 0; row = media_df.iloc[i_row]
    for i_row,row in media_df.iterrows():
        media_id = _to_native(row['mediaID'])
        media_info = {}
        # NOTE(review): this treats filePath as the folder containing fileName; the
        # Camtrap DP spec appears to define filePath as the path to the file itself,
        # confirm against real packages.
        media_info['file_name'] = os.path.join(row['filePath'],row['fileName']).replace('\\','/')
        media_info['location'] = _to_native(row['deploymentID'])
        media_info['id'] = media_id
        media_info['datetime'] = dateparser.parse(row['timestamp'])
        # Frame information stays at -1 for media that don't belong to any event
        media_info['frame_num'] = -1
        media_info['seq_num_frames'] = -1
        media_id_to_media_info[media_id] = media_info

    event_id_to_media_ids = defaultdict(list)

    # i_row = 0; row = events_df.iloc[i_row]
    for i_row,row in events_df.iterrows():
        media_id = _to_native(row['mediaID'])
        assert media_id in media_id_to_media_info, \
            'Event refers to unknown media ID {}'.format(media_id)
        event_id_to_media_ids[_to_native(row['eventID'])].append(media_id)

    event_id_to_category_names = defaultdict(set)

    # i_row = 0; row = observations_df.iloc[i_row]
    for i_row,row in observations_df.iterrows():

        if row['observationLevel'] != 'event':
            raise ValueError("I don't know how to parse image-level events yet")

        observation_type = row['observationType']

        if observation_type == 'blank':
            category_name = 'empty'
        elif observation_type == 'unknown':
            category_name = 'unknown'
        elif observation_type == 'human':
            assert row['scientificName'] == 'Homo sapiens', \
                'Unexpected scientific name for a human observation: {}'.format(
                    row['scientificName'])
            category_name = row['scientificName']
        else:
            assert observation_type == 'animal', \
                'Unsupported observation type: {}'.format(observation_type)
            assert isinstance(row['scientificName'],str), \
                'Animal observation without a scientific name'
            category_name = row['scientificName']

        event_id_to_category_names[_to_native(row['eventID'])].add(category_name)

    # Sort images within an event into frame numbers
    #
    # event_id = next(iter(event_id_to_media_ids))
    for event_id,media_ids_this_event in event_id_to_media_ids.items():
        media_info_this_event = [media_id_to_media_info[media_id] for media_id in media_ids_this_event]
        media_info_this_event = sorted(media_info_this_event, key=lambda x: x['datetime'])
        for i_media,media_info in enumerate(media_info_this_event):
            media_info['frame_num'] = i_media
            media_info['seq_num_frames'] = len(media_info_this_event)
            media_info['seq_id'] = event_id

    # Create category names
    #
    # Category names for each event are stored in a set, whose iteration order can vary
    # from run to run; iterate in sorted order so category IDs are reproducible.
    category_name_to_category_id = {'empty':0}
    for category_names_this_event in event_id_to_category_names.values():
        for name in sorted(category_names_this_event):
            if name not in category_name_to_category_id:
                category_name_to_category_id[name] = len(category_name_to_category_id)

    # Move everything into COCO format
    images = list(media_id_to_media_info.values())

    categories = [{'name':name,'id':category_id}
                  for name,category_id in category_name_to_category_id.items()]
    info = {'version':1.0,'description':datapackage['name']}

    # Create annotations: one per (image, category) pair within each event
    annotations = []

    for event_id,media_ids_this_event in event_id_to_media_ids.items():
        # Annotation IDs are [event_id]_[index], with the index restarting at zero for
        # each event.  Event IDs may have been parsed as numbers, hence the str().
        i_ann = 0
        categories_this_event = sorted(event_id_to_category_names.get(event_id,()))
        for media_id in media_ids_this_event:
            im = media_id_to_media_info[media_id]
            for category_name in categories_this_event:
                ann = {}
                ann['id'] = str(event_id) + '_' + str(i_ann)
                i_ann += 1
                ann['image_id'] = im['id']
                ann['category_id'] = category_name_to_category_id[category_name]
                ann['sequence_level_annotation'] = True
                annotations.append(ann)

    coco_data = {}
    coco_data['images'] = images
    coco_data['annotations'] = annotations
    coco_data['categories'] = categories
    coco_data['info'] = info

    # datetime objects can't be written to JSON, so convert them to strings (this also
    # applies to the returned dict)
    for im in coco_data['images']:
        im['datetime'] = str(im['datetime'])

    if output_file is not None:
        with open(output_file,'w') as f:
            json.dump(coco_data,f,indent=1)

    return coco_data



#%% Interactive driver

# Scratch cells for interactive use.  The "if False" guard means none of this code runs
# when the module is imported or executed as a script.
if False:

    pass

    #%%

    # Convert a package from a hard-coded local folder, writing the COCO file into
    # that same folder
    camtrap_dp_folder = r'C:\temp\pilot2\pilot2'
    coco_file = os.path.join(camtrap_dp_folder,'test-coco.json')
    coco_data = camtrap_dp_to_coco(camtrap_dp_folder,
                                   output_file=coco_file)

    #%% Validate

    # Run the COCO file written above through integrity_check_json_db, using the
    # package folder as the base folder
    from megadetector.data_management.databases.integrity_check_json_db import \
        integrity_check_json_db, IntegrityCheckOptions

    options = IntegrityCheckOptions()

    # NOTE(review): the meaning of these options is defined by IntegrityCheckOptions,
    # which is not visible from this file
    options.baseDir = camtrap_dp_folder
    options.bCheckImageSizes = False
    options.bCheckImageExistence = True
    options.bFindUnusedImages = True
    options.bRequireLocation = True
    options.iMaxNumImages = -1
    options.nThreads = 1
    options.verbose = True

    sorted_categories, data, error_info = integrity_check_json_db(coco_file,options)

    #%% Preview

    # Pass the COCO file, an output folder, and the package folder to visualize_db,
    # then open the HTML file it returns
    from megadetector.visualization.visualize_db import DbVizOptions, visualize_db

    options = DbVizOptions()
    options.parallelize_rendering = True
    options.parallelize_rendering_with_threads = True
    options.parallelize_rendering_n_cores = 10

    preview_dir = r'c:\temp\camtrapdp-preview'
    html_output_file, image_db = visualize_db(coco_file, preview_dir, camtrap_dp_folder, options=options)

    from megadetector.utils.path_utils import open_file
    open_file(html_output_file)


#%% Command-line driver

def main():
    """
    Command-line interface to convert Camtrap DP to COCO.

    Takes one positional argument (the Camtrap DP folder) and an optional
    --output_file; if --output_file is omitted, the output is written to
    [camtrap_dp_folder]_coco.json, i.e. next to the input folder.
    """

    parser = argparse.ArgumentParser(description='Convert Camtrap DP to COCO format')
    parser.add_argument('camtrap_dp_folder', type=str,
                        help='Input folder, containing a CamtrapDP package')
    parser.add_argument('--output_file', type=str, default=None,
                        help='COCO-formatted output file (defaults to [camtrap_dp_folder]_coco.json)')

    args = parser.parse_args()

    if args.output_file is None:
        # Default output file name: [camtrap_dp_folder]_coco.json
        #
        # Remove trailing path separators if present.  Strip os.altsep as well as
        # os.sep: on Windows, a folder supplied as "c:/data/pkg/" would otherwise keep
        # its trailing slash, and the output would land inside the folder as "_coco.json".
        # os.altsep is None on platforms with only one separator.
        separators = os.sep + (os.altsep or '')
        folder_name = args.camtrap_dp_folder.rstrip(separators)
        output_file = folder_name + '_coco.json'
    else:
        output_file = args.output_file

    camtrap_dp_to_coco(camtrap_dp_folder=args.camtrap_dp_folder, output_file=output_file)
    print(f"Successfully converted Camtrap DP package at '{args.camtrap_dp_folder}' to "
          f"COCO format at '{output_file}'")

if __name__ == '__main__':
    main()