Source code for megadetector.data_management.databases.integrity_check_json_db

"""

integrity_check_json_db.py

Does some integrity-checking and computes basic statistics on a COCO Camera Traps .json file, specifically:

* Verifies that required fields are present and have the right types
* Verifies that annotations refer to valid images
* Verifies that annotations refer to valid categories
* Verifies that image, category, and annotation IDs are unique
* Optionally checks file existence
* Finds un-annotated images
* Finds unused categories
* Prints a list of categories sorted by count

"""

#%% Constants and environment

import argparse
import json
import os
import sys

from functools import partial
from multiprocessing.pool import Pool, ThreadPool
from operator import itemgetter
from tqdm import tqdm

from megadetector.visualization.visualization_utils import open_image
from megadetector.utils import ct_utils
from megadetector.utils.path_utils import find_images


#%% Classes and environment

class IntegrityCheckOptions:
    """
    Settings that control which checks integrity_check_json_db() performs
    and how it performs them.
    """

    def __init__(self):

        # NOTE: attribute order is preserved deliberately; in verbose mode
        # integrity_check_json_db() prints this object's __dict__.

        #: Folder that the image filenames in the .json file are relative to
        self.baseDir = ''

        #: Open every image and compare its actual size to the 'width' and 'height'
        #: fields in the .json file?  Turns on bCheckImageExistence.
        self.bCheckImageSizes = False

        #: Verify that every image listed in the .json file is present under [baseDir]?
        self.bCheckImageExistence = False

        #: Look in [baseDir] for image files that the .json file does not reference?
        self.bFindUnusedImages = False

        #: Treat a missing 'location' field on any image as an error?
        self.bRequireLocation = True

        #: Debugging aid: process at most this many images (values <= 0 mean "no limit")
        self.iMaxNumImages = -1

        #: Number of workers used for image size checks; <= 1 disables parallelization
        self.nThreads = 10

        #: Parallelize with threads (True) or with processes (False)
        self.parallelizeWithThreads = True

        #: Print progress and summary information
        self.verbose = True

        #: Accept integer image and annotation IDs (COCO uses ints, CCT files use strings)
        self.allowIntIDs = False

        #: Fail validation when the top-level 'info' field is absent?
        self.requireInfo = False

        #: Check that boxes have at least four elements and no negative width/height;
        #: can be 'error' (raise), 'warning' (print and record), or None (skip the check)
        self.validateBoxes = None

#%% Functions

def _check_image_existence_and_size(image,options=None):
    """
    Check that the file behind one CCT image dict exists and, if requested,
    that its size on disk agrees with the size recorded in the dict.

    [image] is expected to carry the fields:

    * file_name
    * width
    * height

    Args:
        image (dict): the image record to check
        options (IntegrityCheckOptions): settings for the check; bCheckImageExistence
            must be set, and bCheckImageSizes decides whether the image is opened

    Returns:
        str: an error string describing the first problem found, or None if the
        image passes
    """

    if options is None:
        options = IntegrityCheckOptions()

    # This function is only meant to be called when existence checking is enabled
    assert options.bCheckImageExistence

    image_path = os.path.join(options.baseDir,image['file_name'])

    if not os.path.isfile(image_path):
        return 'Image path {} does not exist'.format(image_path)

    # Existence was the only thing we were asked to verify
    if not options.bCheckImageSizes:
        return None

    if ('height' not in image) or ('width' not in image):
        return 'Missing image size in {}'.format(image_path)

    # Any failure while loading the image is reported rather than raised
    try:
        pil_im = open_image(image_path)
        actual_width,actual_height = pil_im.size
        pil_im.close()
    except Exception as e:
        return 'Error opening {}: {}'.format(image_path,str(e))

    size_matches = (actual_width == image['width'] and actual_height == image['height'])
    if size_matches:
        return None

    return 'Size mismatch for image {}: {} (reported {},{}, actual {},{})'.format(
        image['id'], image_path, image['width'], image['height'],
        actual_width, actual_height)

def integrity_check_json_db(json_file, options=None):
    """
    Does some integrity-checking and computes basic statistics on a COCO Camera Traps .json file; see
    module header comment for a list of the validation steps.

    Side effects on the inputs: a '_count' field (number of annotations) is added to
    every image and category dict, backslashes in image 'file_name' values are
    replaced with forward slashes in place, [options].bCheckImageExistence is turned
    on when bCheckImageSizes is set, and a None [options].baseDir is replaced with ''.

    Args:
        json_file (str or dict): filename to validate, or an already-loaded dict
        options (IntegrityCheckOptions, optional): see IntegrityCheckOptions

    Returns:
        tuple: tuple containing:
            - sorted_categories (list): list of category dicts from [json_file], sorted
              by annotation count (most frequent first)
            - data (dict): the data loaded from [json_file]
            - error_info (dict): specific validation errors, with keys 'unused_files'
              (list of relative paths) and 'validation_errors' (list of
              (filename,error_string) tuples)

    Raises:
        AssertionError: if a structural check fails (missing/ill-typed field, duplicate
            ID, dangling image or category reference, missing file or folder)
        ValueError: if [json_file] is neither a str nor a dict, or if a box fails
            validation while options.validateBoxes is 'error'
    """

    if options is None:
        options = IntegrityCheckOptions()

    # Size checking can't happen without existence checking
    if options.bCheckImageSizes:
        options.bCheckImageExistence = True

    if options.verbose:
        print(options.__dict__)

    if options.baseDir is None:
        options.baseDir = ''

    base_dir = options.baseDir


    ##%% Read .json file if necessary, integrity-check fields

    if isinstance(json_file,dict):

        data = json_file

    elif isinstance(json_file,str):

        assert os.path.isfile(json_file), '.json file {} does not exist'.format(json_file)

        if options.verbose:
            print('Reading .json {} with base dir [{}]...'.format(
                json_file,base_dir))

        with open(json_file,'r') as f:
            data = json.load(f)

    else:

        raise ValueError('Illegal value for json_file')

    images = data['images']
    annotations = data['annotations']
    categories = data['categories']

    if options.requireInfo:
        assert 'info' in data, 'No info struct in database'

    if len(base_dir) > 0:
        assert os.path.isdir(base_dir), \
            'Base directory {} does not exist'.format(base_dir)


    ##%% Build dictionaries, checking ID uniqueness and internal validity as we go

    image_id_to_image = {}
    ann_id_to_ann = {}
    category_id_to_category = {}
    category_name_to_category = {}
    image_location_set = set()

    if options.verbose:
        print('Checking categories...')

    for cat in tqdm(categories):

        # Confirm that required fields are present
        assert 'name' in cat
        assert 'id' in cat

        assert isinstance(cat['id'],int), \
            'Illegal category ID type: [{}]'.format(str(cat['id']))
        assert isinstance(cat['name'],str), \
            'Illegal category name type [{}]'.format(str(cat['name']))

        category_id = cat['id']
        category_name = cat['name']

        # Confirm ID uniqueness
        assert category_id not in category_id_to_category, \
            'Category ID {} is used more than once'.format(category_id)
        category_id_to_category[category_id] = cat

        # Annotation counter, incremented in the annotation loop below
        cat['_count'] = 0

        assert category_name not in category_name_to_category, \
            'Category name {} is used more than once'.format(category_name)
        category_name_to_category[category_name] = cat

    # ...for each category

    if options.verbose:
        print('\nChecking image records...')

    if options.iMaxNumImages > 0 and len(images) > options.iMaxNumImages:

        if options.verbose:
            print('Trimming image list to {}'.format(options.iMaxNumImages))
        images = images[0:options.iMaxNumImages]

    image_paths_in_json = set()

    sequences = set()

    # image = images[0]
    for image in tqdm(images):

        # Annotation counter, incremented in the annotation loop below
        image['_count'] = 0

        # Confirm that required fields are present
        assert 'file_name' in image
        assert 'id' in image

        # Check the type *before* calling a str method on the filename, so that a
        # non-string filename produces this message rather than an unrelated error
        assert isinstance(image['file_name'],str), 'Illegal image filename type'

        image['file_name'] = image['file_name'].replace('\\','/')

        image_paths_in_json.add(image['file_name'])

        if options.allowIntIDs:
            assert isinstance(image['id'],str) or isinstance(image['id'],int), \
                'Illegal image ID type'
        else:
            assert isinstance(image['id'],str), 'Illegal image ID type'

        image_id = image['id']

        # Confirm ID uniqueness
        assert image_id not in image_id_to_image, 'Duplicate image ID {}'.format(image_id)

        image_id_to_image[image_id] = image

        # Width and height must be supplied together or not at all
        if 'height' in image:
            assert 'width' in image, 'Image with height but no width: {}'.format(image['id'])

        if 'width' in image:
            assert 'height' in image, 'Image with width but no height: {}'.format(image['id'])

        if options.bRequireLocation:
            assert 'location' in image, 'No location available for: {}'.format(image['id'])

        if 'location' in image:
            # We previously supported ints here; this should be strings now
            # assert isinstance(image['location'], str) or isinstance(image['location'], int), \
            #     'Illegal image location type'
            assert isinstance(image['location'], str)
            image_location_set.add(image['location'])

        if 'seq_id' in image:
            sequences.add(image['seq_id'])

        assert not ('sequence_id' in image or 'sequence' in image), 'Illegal sequence identifier'

    unused_files = []

    image_paths_relative = None

    # Are we checking for unused images?
    if (len(base_dir) > 0) and options.bFindUnusedImages:

        if options.verbose:
            print('\nEnumerating images...')

        image_paths_relative = find_images(base_dir,return_relative_paths=True,recursive=True)

        unused_files = [fn_relative for fn_relative in image_paths_relative
                        if fn_relative not in image_paths_in_json]

    # List of (filename,error_string) tuples
    validation_errors = []

    # If we're checking image existence but not image size, we don't need to read the images
    if options.bCheckImageExistence and not options.bCheckImageSizes:

        # Re-use the folder listing from the unused-image search if we have one
        if image_paths_relative is None:
            image_paths_relative = find_images(base_dir,return_relative_paths=True,recursive=True)

        image_paths_relative_set = set(image_paths_relative)

        for im in images:
            if im['file_name'] not in image_paths_relative_set:
                validation_errors.append((im['file_name'],'not found in relative path list'))

    # If we're checking image size, we need to read the images
    if options.bCheckImageSizes:

        if len(base_dir) == 0:
            print('Warning: checking image sizes without a base directory, assuming "."')

        if options.verbose:
            print('Checking image existence and/or image sizes...')

        if options.nThreads is not None and options.nThreads > 1:

            if options.parallelizeWithThreads:
                worker_string = 'threads'
            else:
                worker_string = 'processes'

            if options.verbose:
                print('Starting a pool of {} {}'.format(options.nThreads,worker_string))

            if options.parallelizeWithThreads:
                pool = ThreadPool(options.nThreads)
            else:
                pool = Pool(options.nThreads)

            # Always shut the pool down, even if a worker raises
            try:
                results = list(tqdm(pool.imap(
                    partial(_check_image_existence_and_size,options=options), images),
                    total=len(images)))
            finally:
                pool.close()
                pool.join()
                print('Pool closed and joined for image size checks')

        else:

            results = []
            for im in tqdm(images):
                results.append(_check_image_existence_and_size(im,options))

        # Results are in the same order as [images]; None means "no error"
        for i_image,result in enumerate(results):
            if result is not None:
                validation_errors.append((images[i_image]['file_name'],result))

    # ...for each image

    if options.verbose:
        print('{} validation errors (of {})'.format(len(validation_errors),len(images)))
        print('Checking annotations...')

    n_boxes = 0

    for ann in tqdm(annotations):

        # Confirm that required fields are present
        assert 'image_id' in ann
        assert 'id' in ann
        assert 'category_id' in ann

        if options.allowIntIDs:
            assert isinstance(ann['id'],str) or isinstance(ann['id'],int), \
                'Illegal annotation ID type'
            assert isinstance(ann['image_id'],str) or isinstance(ann['image_id'],int), \
                'Illegal annotation image ID type'
        else:
            assert isinstance(ann['id'],str), 'Illegal annotation ID type'
            assert isinstance(ann['image_id'],str), 'Illegal annotation image ID type'

        assert isinstance(ann['category_id'],int), 'Illegal annotation category ID type'

        if 'bbox' in ann:
            n_boxes += 1

        ann_id = ann['id']
        image_id = ann['image_id']

        if ('bbox' in ann) and (options.validateBoxes is not None):

            assert options.validateBoxes in ('error','warning'), \
                'Illegal value {} for validateBoxes'.format(options.validateBoxes)

            bbox = ann['bbox']
            annotation_string = str(bbox)

            # We'll allow arbitrary metadata to be tacked on to the end of boxes
            s = ''
            if len(bbox) < 4:
                s += 'Annotation error: illegal bounding box in annotation {} for image {}: {}\n'.format(
                    ann_id,image_id,annotation_string)
            else:
                # Only look at width/height once we know the box is long enough to
                # have them; indexing a short box would raise IndexError and hide
                # the "illegal bounding box" message above.
                if bbox[2] < 0:
                    s += 'Annotation error: negative width in annotation {} for image {}: {}\n'.format(
                        ann_id,image_id,annotation_string)
                if bbox[3] < 0:
                    s += 'Annotation error: negative height in annotation {} for image {}: {}\n'.format(
                        ann_id,image_id,annotation_string)

            if len(s) > 0:

                if options.validateBoxes == 'error':
                    raise ValueError(s)
                else:
                    print('Warning: {}'.format(s))

                    # The image reference hasn't been validated yet at this point;
                    # check it here so a dangling reference produces the usual
                    # message rather than a bare KeyError.
                    assert image_id in image_id_to_image, \
                        'Image ID {} referred to by annotation {}, not available'.format(
                            image_id,ann_id)
                    im = image_id_to_image[image_id]
                    validation_errors.append((im['file_name'],s))

        # ...if we're supposed to validate boxes

        # Confirm ID uniqueness
        assert ann_id not in ann_id_to_ann, \
            'Duplicate annotation ID {}'.format(ann_id)
        ann_id_to_ann[ann_id] = ann

        # Confirm validity
        assert ann['category_id'] in category_id_to_category, \
            'Category {} not found in category list'.format(ann['category_id'])
        assert ann['image_id'] in image_id_to_image, \
            'Image ID {} referred to by annotation {}, not available'.format(
                ann['image_id'],ann['id'])

        image_id_to_image[ann['image_id']]['_count'] += 1
        category_id_to_category[ann['category_id']]['_count'] +=1

    # ...for each annotation

    sorted_categories = sorted(categories, key=itemgetter('_count'), reverse=True)


    ##%% Print statistics

    if options.verbose:

        # Find un-annotated images and multi-annotation images
        n_unannotated = 0
        n_multi_annotated = 0

        for image in images:
            if image['_count'] == 0:
                n_unannotated += 1
            elif image['_count'] > 1:
                n_multi_annotated += 1

        print('\nFound {} unannotated images, {} images with multiple annotations'.format(
            n_unannotated,n_multi_annotated))

        if (len(base_dir) > 0) and options.bFindUnusedImages:
            print('Found {} unused image files'.format(len(unused_files)))

        n_unused_categories = 0

        # Find unused categories
        for cat in categories:
            if cat['_count'] == 0:
                print('Unused category: {}'.format(cat['name']))
                n_unused_categories += 1

        print('Found {} unused categories'.format(n_unused_categories))

        sequence_string = 'no sequence info'
        if len(sequences) > 0:
            sequence_string = '{} sequences'.format(len(sequences))

        print('\nDB contains {} images, {} annotations, {} bboxes, {} categories, {}\n'.format(
            len(images),len(annotations),n_boxes,len(categories),sequence_string))

        if len(image_location_set) > 0:
            print('DB contains images from {} locations\n'.format(len(image_location_set)))

        print('Categories and annotation (not image) counts:\n')

        for cat in sorted_categories:
            print('{:6} {}'.format(cat['_count'],cat['name']))

        print('')

    error_info = {}
    error_info['unused_files'] = unused_files
    error_info['validation_errors'] = validation_errors

    return sorted_categories, data, error_info

# ...def integrity_check_json_db()


#%% Command-line driver

def main(): # noqa
    """
    Command-line entry point: parse arguments into an IntegrityCheckOptions
    object and run integrity_check_json_db() on the requested file.
    """

    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument(
        'json_file',
        type=str,
        help='COCO-formatted .json file to validate')
    arg_parser.add_argument(
        '--bCheckImageSizes',
        action='store_true',
        help=('Validate image size, requires baseDir to be specified. '
              'Implies existence checking.'))
    arg_parser.add_argument(
        '--bCheckImageExistence',
        action='store_true',
        help='Validate image existence, requires baseDir to be specified')
    arg_parser.add_argument(
        '--bFindUnusedImages',
        action='store_true',
        help=("Check for images in baseDir that aren't in the database, "
              'requires baseDir to be specified'))
    arg_parser.add_argument(
        '--baseDir',
        action='store',
        type=str,
        default='',
        help='Base directory for images')
    arg_parser.add_argument(
        '--bAllowNoLocation',
        action='store_true',
        help='Disable errors when no location is specified for an image')
    arg_parser.add_argument(
        '--iMaxNumImages',
        action='store',
        type=int,
        default=-1,
        help='Cap on total number of images to check')
    arg_parser.add_argument(
        '--nThreads',
        action='store',
        type=int,
        default=10,
        help=('Number of threads (only relevant when verifying image '
              'sizes and/or existence)'))

    # With no arguments at all, show usage and quit
    if not sys.argv[1:]:
        arg_parser.print_help()
        arg_parser.exit()

    cli_args = arg_parser.parse_args()

    # The command line exposes the inverse of the option ("allow no location"),
    # so translate it onto the args object before copying args to options
    cli_args.bRequireLocation = (not cli_args.bAllowNoLocation)

    check_options = IntegrityCheckOptions()
    ct_utils.args_to_object(cli_args, check_options)

    integrity_check_json_db(cli_args.json_file,check_options)


if __name__ == '__main__':
    main()


#%% Interactive driver(s)

# Scratch cell for interactive use; never executed on import or from the command line
if False:

    #%%

    """
    python integrity_check_json_db.py ~/data/ena24.json --baseDir ~/data/ENA24 --bAllowNoLocation
    """

    # Integrity-check .json files for LILA
    json_files = [os.path.expanduser('~/data/ena24.json')]

    options = IntegrityCheckOptions()
    options.baseDir = os.path.expanduser('~/data/ENA24')
    options.bCheckImageSizes = False
    options.bFindUnusedImages = True
    options.bRequireLocation = False

    # options.iMaxNumImages = 10

    for json_file in json_files:

        sorted_categories,data,_ = integrity_check_json_db(json_file, options)