Source code for megadetector.data_management.wi_download_csv_to_coco

"""

wi_download_csv_to_coco.py

Converts a .csv file (or a folder of .csv files) from a Wildlife Insights project export to a
COCO camera traps .json file.

Currently assumes that common names are unique identifiers, which is convenient but unreliable.

"""

#%% Imports and constants

import os
import re

from tqdm import tqdm
from collections import defaultdict

from megadetector.utils.ct_utils import write_json
from megadetector.utils.ct_utils import is_empty
from megadetector.utils.ct_utils import sort_dictionary_by_value
from megadetector.utils.ct_utils import sort_list_of_dicts_by_key
from megadetector.utils.string_utils import is_int
from megadetector.utils.path_utils import find_images
from megadetector.utils.wi_platform_utils import read_images_from_download_bundle
from megadetector.utils.wi_platform_utils import read_sequences_from_download_bundle
from megadetector.utils.wi_platform_utils import url_to_relative_path

# Columns from WI label records that wi_download_csv_to_coco() copies into an
# annotation's "wi_extra_info" dict when they carry an informative value.
wi_extra_annotation_columns = (
    'identified_by',
    'wi_taxon_id',
    'uncertainty',
    'number_of_objects',
    'group_size',
    'age',
    'sex',
    'animal_recognizable',
    'individual_id',
    'individual_animal_notes',
    'behavior',
    'highlighted',
    'markings',
)

# Omitted:
#
# is_blank
# filename
# cv_confidence
# license
# bounding_boxes

# Handled as part of the category:
#
# class
# order
# family
# genus
# species

# Handled as part of the image:
#
# timestamp
# image_id
# project_id
# deployment_id
# location

# Columns from WI image records that wi_download_csv_to_coco() requires, and copies
# (as strings) into each image's "wi_image_info" dict.
wi_extra_image_columns = (
    'project_id',
    'deployment_id',
)

def _make_location_id(project_id,deployment_id):
    return 'project_' + str(project_id) + '_deployment_' + deployment_id

# Default value for the [category_remappings] argument to wi_download_csv_to_coco().
#
# Keys are matched against lower-cased category names, first as exact strings, then
# as regular expressions in the order they appear here, so order matters.
#
# The default category mappings lose some information about vehicles,
# but we are typically using this to perform accuracy comparisons for animals,
# and having lots of categories that co-occur with other categories (which
# vehicles do) makes analysis messier.
default_category_remappings = {
    # "blank" is handled specially below
    # 'blank':'empty',
    'homo species': 'human',
    'no cv result': 'unknown',
    'misfire': 'blank',
    '.*human.*': 'human',
    '.*vehicle.*': 'vehicle',
    'truck': 'vehicle',
    'atv': 'vehicle',
}


#%% Main function


[docs]
def wi_download_csv_to_coco(csv_file_in,
                            coco_file_out=None,
                            image_folder=None,
                            exclude_missing_images=False,
                            image_flattening='deployment',
                            verbose=True,
                            category_remappings=default_category_remappings,
                            blank_disagreement_handling='trust_label',
                            include_blanks=True):
    """
    Converts a .csv file (or folder of .csv files) from a Wildlife Insights project export
    to a COCO Camera Traps .json file.

    TODO: currently relies on uniqueness of common names, which is not guaranteed.  Prints
    warnings for non-unique common names.

    Args:
        csv_file_in (str): a downloaded .csv file we should convert to COCO, or a folder
            containing images...csv files.
        coco_file_out (str, optional): the .json file we should write; if [coco_file_out] is None,
            returns data, but doesn't write (or validate) an output file
        image_folder (str, optional): the folder where images live, only relevant if
            [exclude_missing_images] is True
        exclude_missing_images (bool, optional): whether to exclude images not present
            on disk; if this is True, [image_folder] must be a valid folder.  This has no
            impact on blank images if "include_blanks" is False.
        image_flattening (str, optional): if 'none', relative paths will be stored
            as the entire URL for each image, other than gs://.  Can be 'guid' (just
            store [GUID].JPG), 'deployment' (store as [deployment]/[GUID].JPG), or
            'project' (store as [project]/[deployment]/[GUID].JPG).
        verbose (bool, optional): enable additional debug console output
        category_remappings (dict, optional): str --> str dict that maps WI category
            names to output category names.  Regular expressions allowed in keys.
        blank_disagreement_handling (str, optional): what to do when the "common_name"
            field disagrees with the "is_blank" field; can be "trust_label" (default),
            "trust_is_blank", or "error"
        include_blanks (bool, optional): whether to include blank images in the COCO
            file

    Returns:
        dict: COCO-formatted data, identical to what's written to [coco_file_out]

    Raises:
        AssertionError: if inputs are invalid, or if the input records violate
            the consistency assumptions checked below
        ValueError: if [blank_disagreement_handling] is "error" and a record's category
            disagrees with its "is_blank" field, or if a record contains a "count" field
    """

    ##%% Validate inputs

    assert os.path.isfile(csv_file_in) or os.path.isdir(csv_file_in), \
        '{} does not exist'.format(csv_file_in)

    assert blank_disagreement_handling in ('trust_label','trust_is_blank','error'), \
        'Unknown blank disagreement handling value: {}'.format(
            blank_disagreement_handling)

    # Check this up front, rather than after all the conversion work is done
    if exclude_missing_images:
        assert (image_folder is not None) and os.path.isdir(image_folder), \
            'Must specify a valid image folder if you specify exclude_missing_images=True'


    ##%% Read input files

    # read_images_from_download_bundle supports a folder or a single .csv file
    image_id_to_image_records = read_images_from_download_bundle(csv_file_in)

    assert image_id_to_image_records is not None, \
        'Failed to read images from {}'.format(csv_file_in)

    print('Read image records for {} unique image IDs'.format(
        len(image_id_to_image_records)))

    sequence_id_to_sequence_records = read_sequences_from_download_bundle(csv_file_in)

    sequence_id_to_image_ids = None

    # Is this a sequence-based project?
    if sequence_id_to_sequence_records is not None:

        print('Read sequence records for {} sequence IDs'.format(
            len(sequence_id_to_sequence_records)))

        # Group images into sequences

        sequence_id_to_image_ids = defaultdict(set)

        # image_id = next(iter(image_id_to_image_records))
        for image_id in image_id_to_image_records:

            records_this_image = image_id_to_image_records[image_id]

            for r in records_this_image:

                assert image_id == r['image_id']
                if 'sequence_id' not in r:
                    print('Warning: image {} does not have a sequence ID'.format(r['image_id']))
                    continue
                sequence_id_to_image_ids[r['sequence_id']].add(image_id)

            # ...for each record associated with this image ID

        # ...for each image ID

        # Create frame numbers and frame ordering

        # sequence_id = next(iter(sequence_id_to_image_ids))
        for sequence_id in sequence_id_to_image_ids:

            image_ids_this_sequence = sequence_id_to_image_ids[sequence_id]

            records_this_sequence = []

            for image_id in image_ids_this_sequence:

                records_this_image = image_id_to_image_records[image_id]
                # Choose a representative record for sorting
                r = records_this_image[0]
                # Timestamps are formatted as "2019-09-09 13:45:00"
                assert isinstance(r['timestamp'],str) and len(r['timestamp']) == 19
                records_this_sequence.append(r)

            sorted_records_this_sequence = \
                sort_list_of_dicts_by_key(records_this_sequence,'timestamp')

            n_frames_this_sequence = len(sorted_records_this_sequence)

            # i_record = 0; r = sorted_records_this_sequence[i_record]
            for i_record,r in enumerate(sorted_records_this_sequence):

                r['frame_num'] = i_record
                r['seq_num_frames'] = n_frames_this_sequence
                image_id = r['image_id']

                # If there are multiple records for this image (typically indicating multiple
                # species), propagate that information to the other records
                records_this_image_id = image_id_to_image_records[image_id]

                # target_r = records_this_image_id[0]
                for target_r in records_this_image_id:

                    if r == target_r:
                        continue

                    assert r['timestamp'] == target_r['timestamp']
                    target_r['frame_num'] = i_record
                    target_r['seq_num_frames'] = n_frames_this_sequence

            # ...for each record in this sequence

        # ...for each sequence ID

    # ...if this is a sequence-based project


    ##%% Create COCO dictionaries

    category_name_to_category = {}
    empty_category = {'name':'empty','id':0,'count':0,'taxonomy_string':''}
    category_name_to_category['empty'] = empty_category

    image_id_to_image = {}
    image_id_to_annotations = defaultdict(list)

    print('Converting records to COCO...')

    n_blanks_excluded = 0
    n_placeholders_excluded = 0

    # image_id = next(iter(image_id_to_image_records))
    for image_id in tqdm(image_id_to_image_records.keys(),
                         total=len(image_id_to_image_records)):

        image_records_this_id = image_id_to_image_records[image_id]

        reference_record = image_records_this_id[0]

        url = reference_record['location']

        # Omit placeholder images.  This test looks for https URLs, so it has to happen
        # before we assert that the URL is a gs:// URL.
        if 'https' in url and 'placeholder' in url:
            n_placeholders_excluded += 1
            continue

        assert url.startswith('gs://'), \
            'Unexpected URL format for image {}: {}'.format(image_id,url)

        # The "project" flattening scheme means "prepend the project ID to /[deployment]/..."
        if image_flattening == 'project':
            project_id = str(reference_record['project_id'])
            file_name = project_id + '/' + url_to_relative_path(url,image_flattening='deployment')
        else:
            file_name = url_to_relative_path(url,image_flattening=image_flattening)

        location_id = _make_location_id(
            reference_record['project_id'],
            reference_record['deployment_id'])

        nonblank_annotation_found = False

        im = {}
        im['id'] = image_id
        im['file_name'] = file_name
        im['location'] = location_id
        im['datetime'] = reference_record['timestamp']

        sequence_records_this_sequence = None

        # Should we iterate over image records or sequence records to determine
        # labels for this image?
        label_records = image_records_this_id

        if 'sequence_id' in reference_record:

            assert sequence_id_to_image_ids is not None
            assert 'seq_num_frames' in reference_record, 'sequence processing error'
            assert 'frame_num' in reference_record, 'sequence processing error'

            # Not a typo; WI uses "sequence_id", COCO Camera Traps uses "seq_id"
            im['seq_id'] = reference_record['sequence_id']
            im['seq_num_frames'] = reference_record['seq_num_frames']
            im['frame_num'] = reference_record['frame_num']

            sequence_records_this_sequence = \
                sequence_id_to_sequence_records[reference_record['sequence_id']]
            label_records = sequence_records_this_sequence

            # Image-level and sequence-level taxa should be the same
            #
            # I don't know why labels are reported at both levels.
            taxon_ids_this_sequence = {r['wi_taxon_id'] for r in sequence_records_this_sequence}
            taxon_ids_each_image = {r['wi_taxon_id'] for r in image_records_this_id}

            assert taxon_ids_each_image == taxon_ids_this_sequence, \
                'Sequence label inconsistency'

        im['wi_image_info'] = {}
        for s in wi_extra_image_columns:
            assert s in reference_record, \
                'Required column {} missing from image {}'.format(s,reference_record['image_id'])
            im['wi_image_info'][s] = str(reference_record[s])

        categories_this_image = set()

        # Iterate over either image records or label records to determine the labels
        # we should store for this image.
        #
        # record = label_records[0]
        for record in label_records:

            # If there are multiple records for this image (typically because multiple species
            # were recorded), make sure the metadata is consistent across records
            if record != reference_record:

                # "Timestamp" is only present for image records; sequence records use
                # "start_time" and "end_time"
                # assert record['timestamp'] == reference_record['timestamp']
                assert record['project_id'] == reference_record['project_id']
                assert record['deployment_id'] == reference_record['deployment_id']

            count = None

            # This is a bit of future-proofing... it seems odd to me that "count"
            # becomes "number_of_objects" in image-based project downloads.
            if 'count' in record:
                raise ValueError(
                    'Note to self: you suspected a field called "count" might occur in some scenarios')

            # Image-based projects use "number_of_objects"
            if 'number_of_objects' in record:
                assert 'group_size' not in record
                count = record['number_of_objects']

            # Sequence-based projects use "group_size"
            if 'group_size' in record:
                assert 'number_of_objects' not in record
                count = record['group_size']

            if is_empty(count):
                count = None
            else:
                assert is_int(count), \
                    'Illegal group size value: {}'.format(count)
                count = int(count)

            category_name = record['common_name'].strip().lower()

            # If there's no common name, fall back to the most specific taxonomic
            # level available
            if category_name == '':

                if len(record['genus']) > 0 and len(record['species']) > 0:
                    category_name = record['genus'] + ' ' + record['species']
                elif len(record['genus']) > 0:
                    category_name = record['genus']
                elif len(record['family']) > 0:
                    category_name = record['family']
                elif len(record['order']) > 0:
                    category_name = record['order']
                elif len(record['class']) > 0:
                    category_name = record['class']
                else:
                    print('Warning: no common name or binomial name available for {}'.format(
                        record['wi_taxon_id']))
                    category_name = record['wi_taxon_id']
                category_name = category_name.strip().lower()

            # ...handling empty category names

            taxonomy_tokens = []
            for level in ('class','order','family','genus','species'):
                taxonomy_tokens.append(record[level])
            taxonomy_string = ';'.join(taxonomy_tokens)
            taxonomy_string = taxonomy_string.lower().strip()

            # Should this category name get remapped?
            if (category_remappings is not None):
                # Check for exact matches
                if category_name in category_remappings:
                    category_name = category_remappings[category_name]
                # Check for regex matches; the first matching key wins
                else:
                    for k in category_remappings:
                        if re.search(k,category_name):
                            category_name = category_remappings[k]
                            break

            # This is used for logic below, so we handle it outside of category_remappings
            if category_name == 'blank':
                category_name = 'empty'

            assert isinstance(record['is_blank'],int) and \
                record['is_blank'] in (0,1)

            # Resolve disagreements between different ways that blank-ness
            # can be represented
            category_says_blank = category_name == 'empty'
            is_blank_says_blank = record['is_blank'] == 1

            if (category_says_blank) and (is_blank_says_blank):
                category_name = 'empty'
            elif (category_says_blank) or (is_blank_says_blank):
                if blank_disagreement_handling == 'error':
                    raise ValueError('Blank disagreement for {} ({})'.format(
                        image_id, file_name))
                elif blank_disagreement_handling == 'trust_label':
                    # Leave category_name as it is
                    print('Warning: category says {}, is_blank says {}, using category'.format(
                        category_name,record['is_blank']))
                elif blank_disagreement_handling == 'trust_is_blank':
                    print('Warning: category says {}, is_blank says {}, using is_blank'.format(
                        category_name,record['is_blank']))
                    if is_blank_says_blank:
                        category_name = 'empty'
                    else:
                        # This is a quirky case, we're supposed to trust is_blank,
                        # and it says it's not blank, but the category says it is, so we
                        # have no other category we can use
                        assert category_name == 'empty'
                        category_name = 'unknown'

            assert category_name != 'blank'

            # Don't create annotations for the same category twice for the same image
            if category_name in categories_this_image:
                continue
            categories_this_image.add(category_name)

            if category_name in category_name_to_category:
                category = category_name_to_category[category_name]
                category_id = category['id']
                category['count'] = category['count'] + 1
                assert category['name'] == category_name
                if (category_name not in ['empty','unknown']) and \
                   (taxonomy_string != category['taxonomy_string']):
                    print('Warning: category {} has multiple taxonomy strings:\n{}\n{}\n'.format(
                        category_name,
                        taxonomy_string,
                        category['taxonomy_string']))
            else:
                # Category IDs are assigned sequentially; 0 is reserved for "empty"
                category_id = len(category_name_to_category)
                category = {}
                category_name_to_category[category_name] = category
                category['name'] = category_name
                category['id'] = category_id
                category['count'] = 1
                category['taxonomy_string'] = taxonomy_string

            if category_name != 'empty':
                nonblank_annotation_found = True

            ann = {}
            ann['image_id'] = image_id
            annotations_this_image = image_id_to_annotations[image_id]
            annotation_number = len(annotations_this_image)
            ann['id'] = image_id + '_' + str(annotation_number).zfill(2)
            ann['category_id'] = category_id

            if sequence_records_this_sequence is not None:
                ann['sequence_level_annotation'] = True
            else:
                ann['sequence_level_annotation'] = False

            if count is not None:
                ann['count'] = count

            annotations_this_image.append(ann)

            extra_info = {}
            for s in wi_extra_annotation_columns:
                if s in record:

                    v = record[s]

                    # Only store interesting values
                    store_record = False

                    if isinstance(v,str) and (len(v) > 0):
                        if s.lower() == 'uncertainty' and v.lower() == "don't know":
                            store_record = False
                        elif s.lower() == 'age' and v.lower() == 'unknown':
                            store_record = False
                        elif s.lower() == 'sex' and v.lower() == 'unknown':
                            store_record = False
                        elif s.lower() == 'identified_by' and v.lower() == 'batch upload':
                            store_record = False
                        else:
                            store_record = True

                    # Treat bools as store_true, there are tons of uninformative "False"
                    # fields (e.g. "highlighted").
                    elif isinstance(v,bool):
                        if v:
                            store_record = True

                    if store_record:
                        extra_info[s] = v

            if len(extra_info) > 0:
                ann['wi_extra_info'] = extra_info

        # ...for each label record (image or sequence) associated with this image

        if include_blanks or nonblank_annotation_found:
            image_id_to_image[image_id] = im
        else:
            n_blanks_excluded += 1

    # ...for each image


    ##%% Assemble COCO output

    images = list(image_id_to_image.values())
    categories = list(category_name_to_category.values())

    print('Created COCO records for {} image IDs ({} blanks, {} placeholders excluded)'.format(
        len(image_id_to_image),n_blanks_excluded, n_placeholders_excluded))

    annotations = []

    # image_id_to_annotations contains image IDs we didn't end up using,
    # so we loop over [images] to find the image IDs for which we want to
    # store annotations
    for im in images:
        image_id = im['id']
        annotations_this_image = image_id_to_annotations[image_id]
        for ann in annotations_this_image:
            annotations.append(ann)

    print('Created COCO {} annotation records ({} categories)'.format(
        len(annotations),len(categories)))

    info = {'version':'1.00','description':'converted from WI export'}
    info['source_file'] = csv_file_in
    coco_data = {}
    coco_data['info'] = info
    coco_data['images'] = images
    coco_data['annotations'] = annotations
    coco_data['categories'] = categories

    category_name_to_count = {c['name']:c['count'] for c in categories}
    category_name_to_count = \
        sort_dictionary_by_value(category_name_to_count,reverse=True)

    # Debugging switch, off by default
    print_category_counts = False

    if print_category_counts:

        print('Categories and counts:\n')
        for category_name in category_name_to_count:
            category_name_string = category_name
            if (category_name == 'empty') and (not include_blanks):
                category_name_string += (' (excluded)')
            print('{}: {}'.format(category_name_string,
                                category_name_to_count[category_name]))

    ##%% Exclude missing images if requested

    if exclude_missing_images:

        # [image_folder] was validated at the top of this function

        print('Enumerating images in {}'.format(image_folder))
        all_images = find_images(image_folder, return_relative_paths=True, recursive=True)
        all_images_set = set(all_images)

        missing_images = []

        category_name_to_missing_image_count = defaultdict(int)

        category_id_to_name = {c['id']:c['name'] for c in categories}

        # im = images[0]
        for im in tqdm(images):

            file_name_relative = im['file_name']
            if file_name_relative not in all_images_set:

                annotations_this_image = image_id_to_annotations[im['id']]
                for ann in annotations_this_image:
                    category_id = ann['category_id']
                    category_name = category_id_to_name[category_id]
                    category_name_to_missing_image_count[category_name] += 1

                missing_images.append(im)

        print('Missing {} of {} images'.format(
            len(missing_images),
            len(images)))

        if len(category_name_to_missing_image_count) > 0:

            print('\nCategories with missing images:\n')

            category_name_to_missing_image_count = \
                sort_dictionary_by_value(category_name_to_missing_image_count,
                                         reverse=True)

            for category_name in category_name_to_missing_image_count:
                expected_count_string = ''
                if category_name in category_name_to_count:
                    expected_count_string = ' (of {} in metadata)'.format(
                        category_name_to_count[category_name])
                print('{}: {}{}'.format(category_name,
                                        category_name_to_missing_image_count[category_name],
                                        expected_count_string))

        # ...if we're missing any images

        # TODO: clean up categories that are no longer used
        missing_filenames = set([im['file_name'] for im in missing_images])
        missing_image_ids = set([im['id'] for im in missing_images])

        images = [im for im in images if im['file_name'] not in missing_filenames]
        annotations = [ann for ann in annotations if ann['image_id'] not in missing_image_ids]
        coco_data['images'] = images
        coco_data['annotations'] = annotations

    # ...if we are supposed to exclude missing images


    ##%% Write output json

    if coco_file_out is not None:
        print('Writing COCO data to {}'.format(coco_file_out))
        write_json(coco_file_out,coco_data)


    ##%% Validate output

    # The integrity checker is handed the output filename, so there is nothing
    # to validate if we weren't asked to write a file
    if coco_file_out is not None:

        from megadetector.data_management.databases.integrity_check_json_db import \
            IntegrityCheckOptions,integrity_check_json_db

        print('Validating COCO file {}'.format(coco_file_out))

        options = IntegrityCheckOptions()
        options.baseDir = image_folder
        options.bCheckImageExistence = False
        options.verbose = verbose

        _ = integrity_check_json_db(coco_file_out,options)

    ##%%

    return coco_data


# ...def wi_download_csv_to_coco(...)


#%% Interactive driver

# Scratch cell for interactive (IDE cell-mode) use.  The "if False" guard means none
# of these assignments run when the module is imported or executed as a script.
#
# NOTE(review): this cell only assigns placeholder values and never calls
# wi_download_csv_to_coco(); "validate_images" and "gs_prefix" do not correspond to
# any parameter of that function as defined in this file, so the cell looks stale.
if False:

    #%%

    # Placeholder paths; replace with real locations before running this cell
    image_folder = '/blah/images/2000000'
    csv_file_in = '/csv_downloads/wildlife-insights_046ddddd-d870-dddd-a91d-a50c1a28fe29_project-2001650_data'
    coco_file_out = None
    gs_prefix = 'gs://000000000000_2000000_3658_project_name__main/deployment/'

    validate_images = False
    verbose = True
    category_remappings = default_category_remappings


#%% Command-line driver

# TODO