Source code for megadetector.postprocessing.separate_detections_into_folders

r"""

separate_detections_into_folders.py

**Overview**

Given a .json file with batch processing results, separate the files in that
set of results into folders that contain animals/people/vehicles/nothing,
according to per-class thresholds.

Image files are copied, not moved.

**Output structure**

Preserves relative paths within each of those folders; cannot be used with .json
files that have absolute paths in them.

For example, if your .json file has these images:

* a/b/c/1.jpg
* a/b/d/2.jpg
* a/b/e/3.jpg
* a/b/f/4.jpg
* a/x/y/5.jpg

And let's say:

* The results say that the first three images are empty/person/vehicle, respectively
* The fourth image is above threshold for "animal" and "person"
* The fifth image contains an animal

* You specify an output base folder of c:/out

You will get the following files:

* c:/out/empty/a/b/c/1.jpg
* c:/out/people/a/b/d/2.jpg
* c:/out/vehicles/a/b/e/3.jpg
* c:/out/animal_person/a/b/f/4.jpg
* c:/out/animals/a/x/y/5.jpg

**Rendering bounding boxes**

By default, images are just copied to the target output folder.  If you specify --render_boxes,
bounding boxes will be rendered on the output images.  Because this is no longer strictly
a copy operation, this may result in the loss of metadata.  More accurately, this *may*
result in the loss of some EXIF metadata; this *will* result in the loss of IPTC/XMP metadata.

Rendering boxes also makes this script a lot slower.

**Classification-based separation**

If you have a results file with classification data, you can also specify classes to put
in their own folders, within the "animals" folder, like this:

``--classification_thresholds "deer=0.75,cow=0.75"``

So, e.g., you might get:

c:/out/animals/deer/a/x/y/5.jpg

In this scenario, the folders within "animals" will be:

deer, cow, multiple, unclassified

"multiple" in this case only means "deer and cow"; if an image is classified as containing a
bird and a bear, that would end up in "unclassified", since the folder separation is based only
on the categories you provide at the command line.

No classification-based separation is done within the animal_person, animal_vehicle, or
animal_person_vehicle folders.

"""

#%% Constants and imports

import argparse
import json
import os
import shutil
import sys
import itertools

from multiprocessing.pool import ThreadPool
from functools import partial
from tqdm import tqdm

from megadetector.utils.ct_utils import args_to_object
from megadetector.utils.string_utils import is_float
from megadetector.utils.path_utils import remove_empty_folders
from megadetector.detection.run_detector import get_typical_confidence_threshold_from_results
from megadetector.visualization import visualization_utils as vis_utils
from megadetector.visualization.visualization_utils import blur_detections

friendly_folder_names = {'animal':'animals','person':'people','vehicle':'vehicles'}

# Occasionally we have near-zero confidence detections associated with COCO classes that
# didn't quite get squeezed out of the model in training.  As long as they're near zero
# confidence, we just ignore them.
invalid_category_epsilon = 0.00001

default_line_thickness = 8
default_box_expansion = 3


#%% Options class

[docs] class SeparateDetectionsIntoFoldersOptions: """ Options used to parameterize separate_detections_into_folders() """ def __init__(self,threshold=None): #: Default threshold for categories not specified in category_name_to_threshold self.threshold = None #: Dict mapping category names to thresholds; for example, an image with only a detection of class #: "animal" whose confidence is greater than or equal to category_name_to_threshold['animal'] #: will be put in the "animal" folder. self.category_name_to_threshold = { 'animal': self.threshold, 'person': self.threshold, 'vehicle': self.threshold } #: Number of workers to use, set to <= 1 to disable parallelization self.n_threads = 1 #: By default, this function errors if you try to output to an existing folder self.allow_existing_directory = False #: By default, this function errors if any of the images specified in the results file don't #: exist in the source folder. self.allow_missing_files = False #: Whether to overwrite images that already exist in the target folder; only relevant if #: [allow_existing_directory] is True self.overwrite = True #: Whether to skip empty images; if this is False, empty images (i.e., images with no detections #: above the corresponding threshold) will be copied to an "empty" folder. self.skip_empty_images = False #: The MD results .json file to process self.results_file = None #: The folder containing source images; filenames in [results_file] should be relative to this #: folder. self.base_input_folder = None #: The folder to which we should write output images; see the module header comment for information #: about how that folder will be structured. self.base_output_folder = None #: Should we move rather than copy? self.move_images = False #: Should we render boxes on the output images? Makes everything a lot slower. self.render_boxes = False #: Line thickness in pixels; only relevant if [render_boxes] is True self.line_thickness = default_line_thickness #: Box expansion in pixels; only relevant if [render_boxes] is True self.box_expansion = default_box_expansion #: Originally specified as a string that looks like this: #: #: deer=0.75,cow=0.75 #: #: String, converted internally to a dict mapping name:threshold self.classification_thresholds = None ## Debug or internal attributes #: Do not set explicitly; populated from data when using classification results self.classification_category_id_to_name = None #: Do not set explicitly; populated from data when using classification results self.classification_categories = None #: Used to test this script; sets a limit on the number of images to process. self.debug_max_images = None #: Do not set explicitly; this gets created based on [results_file] #: #:Dictionary mapping categories (plus combinations of categories, and 'empty') to output folders self.category_name_to_folder = None #: Do not set explicitly; this gets loaded from [results_file] self.category_id_to_category_name = None #: List of category names for which we should blur detections, most commonly ['person'] #: #: Can also be a comma-separated list. self.category_names_to_blur = None #: Remove all empty folders from the target folder at the end of the process, #: whether or not they were created by this script self.remove_empty_folders = False
# ...__init__() # ...class SeparateDetectionsIntoFoldersOptions #%% Support functions def _path_is_abs(p): return (len(p) > 1) and (p[0] == '/' or p[1] == ':') printed_missing_file_warning = False def _process_detections(im,options): """ Process all detections for a single image May modify *im*. """ global printed_missing_file_warning relative_filename = im['file'] detections = None if 'detections' in im: detections = im['detections'] categories_above_threshold = None if detections is None: assert im['failure'] is not None and len(im['failure']) > 0 target_folder = options.category_name_to_folder['failure'] else: category_name_to_max_confidence = {} category_names = options.category_id_to_category_name.values() for category_name in category_names: category_name_to_max_confidence[category_name] = 0.0 # Find the maximum confidence for each category # # det = detections[0] for det in detections: category_id = det['category'] # For zero-confidence detections, we occasionally have leftover goop # from COCO classes if category_id not in options.category_id_to_category_name: print('Warning: unrecognized category {} in file {}'.format( category_id,relative_filename)) # assert det['conf'] < invalid_category_epsilon continue category_name = options.category_id_to_category_name[category_id] if det['conf'] > category_name_to_max_confidence[category_name]: category_name_to_max_confidence[category_name] = det['conf'] # ...for each detection on this image # Count the number of thresholds exceeded categories_above_threshold = [] for category_name in category_names: threshold = options.category_name_to_threshold[category_name] assert threshold is not None max_confidence_this_category = category_name_to_max_confidence[category_name] if max_confidence_this_category >= threshold: categories_above_threshold.append(category_name) # ...for each category categories_above_threshold.sort() using_classification_folders = (options.classification_thresholds is not None and \ len(options.classification_thresholds) > 0) # If this is above multiple thresholds if len(categories_above_threshold) > 1: # Currently "animal_person" images get put into the "animal_person" folder, even if we're # doing species-based separation. Ideally, we would optionally put these in either the "deer" # folder or a "deer_person" folder, but this is pretty esoteric, so not worrying about this # for now. target_folder = options.category_name_to_folder['_'.join(categories_above_threshold)] elif len(categories_above_threshold) == 0: target_folder = options.category_name_to_folder['empty'] else: assert len(categories_above_threshold) == 1 target_folder = options.category_name_to_folder[categories_above_threshold[0]] # Are we making species classification folders, and is this an animal? if ('animal' in categories_above_threshold) and (using_classification_folders): # Do we need to put this into a specific species folder? # Find the animal-class detections that are above threshold category_name_to_id = {v: k for k, v in options.category_id_to_category_name.items()} animal_category_id = category_name_to_id['animal'] valid_animal_detections = [d for d in detections if \ (d['category'] == animal_category_id and \ d['conf'] >= options.category_name_to_threshold['animal'])] # Count the number of classification categories that are above threshold for at # least one detection classification_categories_above_threshold = set() # d = valid_animal_detections[0] for d in valid_animal_detections: if 'classifications' not in d or d['classifications'] is None: continue # classification = d['classifications'][0] for classification in d['classifications']: classification_category_id = classification[0] classification_confidence = classification[1] # Do we have a threshold for this category, and if so, is # this classification above threshold? assert options.classification_category_id_to_name is not None classification_category_name = \ options.classification_category_id_to_name[classification_category_id] if (classification_category_name in options.classification_thresholds) and \ (classification_confidence > \ options.classification_thresholds[classification_category_name]): classification_categories_above_threshold.add(classification_category_name) # ...for each classification # ...for each detection if len(classification_categories_above_threshold) == 0: classification_folder_name = 'unclassified' elif len(classification_categories_above_threshold) > 1: classification_folder_name = 'multiple' else: assert len(classification_categories_above_threshold) == 1 classification_folder_name = list(classification_categories_above_threshold)[0] target_folder = os.path.join(target_folder,classification_folder_name) # ...if we have to deal with classification subfolders # ...if we have 0/1/more categories above threshold # ...if this is/isn't a failure case source_path = os.path.join(options.base_input_folder,relative_filename) if not os.path.isfile(source_path): if not options.allow_missing_files: raise ValueError('Cannot find file {}'.format(source_path)) else: if not printed_missing_file_warning: print('Warning: cannot find at least one file ({})'.format(source_path)) printed_missing_file_warning = True return target_path = os.path.join(target_folder,relative_filename) if (not options.overwrite) and (os.path.isfile(target_path)): return target_dir = os.path.dirname(target_path) os.makedirs(target_dir,exist_ok=True) # Skip this image if it's empty and we're not processing empty images if ((categories_above_threshold is None) or (len(categories_above_threshold) == 0)) and \ options.skip_empty_images: return # At this point, this image is getting copied; we may or may not also need to # draw bounding boxes or blur pixels. # Do a simple copy operation if we don't need to manipulate the images (render boxes, blur pixels) if (not options.render_boxes and (options.category_names_to_blur is None)) or \ (categories_above_threshold is None) or \ (len(categories_above_threshold) == 0): if options.move_images: shutil.move(source_path,target_path) else: shutil.copyfile(source_path,target_path) else: # Open the source image pil_image = vis_utils.load_image(source_path) # Blur regions in the image if necessary category_names_to_blur = options.category_names_to_blur if category_names_to_blur is not None: if isinstance(category_names_to_blur,str): category_names_to_blur = category_names_to_blur.split(',') category_names_to_blur = [s.strip() for s in category_names_to_blur] detections_to_blur = [] for d in detections: category_name = options.category_id_to_category_name[d['category']] category_threshold = options.category_name_to_threshold[category_name] if (d['conf'] >= category_threshold) and (category_name in category_names_to_blur): detections_to_blur.append(d) if len(detections_to_blur) > 0: blur_detections(pil_image,detections_to_blur) # Render bounding boxes for each category separately, because # we allow different thresholds for each category. category_name_to_id = {v: k for k, v in options.category_id_to_category_name.items()} assert len(category_name_to_id) == len(options.category_id_to_category_name) classification_label_map = None if using_classification_folders: classification_label_map = options.classification_categories for category_name in categories_above_threshold: category_id = category_name_to_id[category_name] category_threshold = options.category_name_to_threshold[category_name] assert category_threshold is not None category_detections = [d for d in detections if d['category'] == category_id] # When we're not using classification folders, remove classification # information to maintain standard detection colors. if not using_classification_folders: for d in category_detections: if 'classifications' in d: del d['classifications'] vis_utils.render_detection_bounding_boxes( category_detections, pil_image, label_map=options.detection_categories, classification_label_map=classification_label_map, confidence_threshold=category_threshold, thickness=options.line_thickness, expansion=options.box_expansion) # ...for each category # Try to preserve EXIF data and image quality when saving vis_utils.exif_preserving_save(pil_image,target_path) # ...if we don't/do need to render boxes # ...def _process_detections() #%% Main function
[docs] def separate_detections_into_folders(options): """ Given a .json file with batch processing results, separate the files in that set of results into folders that contain animals/people/vehicles/nothing, according to per-class thresholds. See the header comment of this module for more details about the output folder structure. Args: options (SeparateDetectionsIntoFoldersOptions): parameters guiding image separation, see the SeparateDetectionsIntoFoldersOptions documentation for specific options. """ # Input validation # Currently we don't support moving (instead of copying) when we're also rendering # bounding boxes or blurring humans. assert not (options.render_boxes and options.move_images), \ 'Cannot specify both render_boxes and move_images' assert not ((options.category_names_to_blur is not None) and options.move_images), \ 'Cannot specify both category_names_to_blur and move_images' # Create output folder if necessary if (os.path.isdir(options.base_output_folder)) and \ (len(os.listdir(options.base_output_folder) ) > 0): if options.allow_existing_directory: print('Warning: target folder exists and is not empty... did ' + \ 'you mean to delete an old version?') else: raise ValueError('Target folder exists and is not empty') os.makedirs(options.base_output_folder,exist_ok=True) # Load detection results print('Loading detection results') with open(options.results_file,'r') as f: results = json.load(f) images = results['images'] for im in images: fn = im['file'] assert not _path_is_abs(fn), 'Cannot process results with absolute image paths' print('Processing detections for {} images'.format(len(images))) default_threshold = options.threshold if default_threshold is None: default_threshold = get_typical_confidence_threshold_from_results(results) detection_categories = results['detection_categories'] options.detection_categories = detection_categories options.category_id_to_category_name = detection_categories # Map class names to output folders options.category_name_to_folder = {} options.category_name_to_folder['empty'] = os.path.join(options.base_output_folder,'empty') options.category_name_to_folder['failure'] =\ os.path.join(options.base_output_folder,'processing_failure') # Create all combinations of categories category_names = list(detection_categories.values()) category_names.sort() # category_name = category_names[0] for category_name in category_names: # Do we have a custom threshold for this category? if category_name not in options.category_name_to_threshold: print('Warning: category {} in detection file, but not in threshold mapping'.format( category_name)) options.category_name_to_threshold[category_name] = None if options.category_name_to_threshold[category_name] is None: options.category_name_to_threshold[category_name] = default_threshold category_threshold = options.category_name_to_threshold[category_name] print('Processing category {} at threshold {}'.format(category_name,category_threshold)) target_category_names = [] for c in category_names: target_category_names.append(c) for combination_length in range(2,len(category_names)+1): combined_category_names = list(itertools.combinations(category_names,combination_length)) for combination in combined_category_names: combined_name = '_'.join(combination) target_category_names.append(combined_name) # Create folder mappings for each category for category_name in target_category_names: folder_name = category_name if category_name in friendly_folder_names: folder_name = friendly_folder_names[category_name] options.category_name_to_folder[category_name] = \ os.path.join(options.base_output_folder,folder_name) # Create the actual folders for folder in options.category_name_to_folder.values(): os.makedirs(folder,exist_ok=True) # Handle species classification thresholds, if specified if options.classification_thresholds is not None: assert 'classification_categories' in results and \ results['classification_categories'] is not None, \ 'Classification thresholds specified, but no classification results available' classification_categories = results['classification_categories'] classification_category_name_to_id = {v: k for k, v in classification_categories.items()} classification_category_id_to_name = {k: v for k, v in classification_categories.items()} options.classification_category_id_to_name = classification_category_id_to_name options.classification_categories = classification_categories if isinstance(options.classification_thresholds,str): # E.g. deer=0.75,cow=0.75 tokens = options.classification_thresholds.split(',') classification_thresholds = {} # token = tokens[0] for token in tokens: subtokens = token.split('=') assert (len(subtokens) == 2) and (is_float(subtokens[1])), \ 'Illegal classification threshold {}'.format(token) classification_thresholds[subtokens[0]] = float(subtokens[1]) # ...for each token options.classification_thresholds = classification_thresholds # ...if classification thresholds are still in string format # Validate the classes in the threshold list for class_name in options.classification_thresholds.keys(): assert class_name in classification_category_name_to_id, \ 'Category {} specified at the command line, but is not available in the results file'.format( class_name) # ...if we need to deal with classification categories if options.n_threads <= 1 or options.debug_max_images is not None: # i_image = 14; im = images[i_image]; im for i_image,im in enumerate(tqdm(images)): if options.debug_max_images is not None and i_image > options.debug_max_images: break _process_detections(im,options) # ...for each image else: print('Starting a pool with {} threads'.format(options.n_threads)) pool = ThreadPool(options.n_threads) try: process_detections_with_options = partial(_process_detections, options=options) _ = list(tqdm(pool.imap(process_detections_with_options, images), total=len(images))) finally: pool.close() pool.join() print('Pool closed and joined for folder separation') if options.remove_empty_folders: print('Removing empty folders from {}'.format(options.base_output_folder)) remove_empty_folders(options.base_output_folder)
# ...def separate_detections_into_folders #%% Interactive driver if False: pass #%% options = SeparateDetectionsIntoFoldersOptions() options.results_file = os.path.expanduser( '~/data/snapshot-safari-2022-08-16-KRU-v5a.0.0_detections.json') options.base_input_folder = os.path.expanduser('~/data/KRU/KRU_public') options.base_output_folder = os.path.expanduser('~/data/KRU-separated') options.n_threads = 100 options.render_boxes = True options.allow_existing_directory = True #%% options = SeparateDetectionsIntoFoldersOptions() options.results_file = os.path.expanduser('~/data/ena24-2022-06-15-v5a.0.0_megaclassifier.json') options.base_input_folder = os.path.expanduser('~/data/ENA24/images') options.base_output_folder = os.path.expanduser('~/data/ENA24-separated') options.n_threads = 100 options.classification_thresholds = 'deer=0.75,cow=0.75,bird=0.75' options.render_boxes = True options.allow_existing_directory = True #%% separate_detections_into_folders(options) #%% Testing various command-line invocations """ # With boxes, no classification python separate_detections_into_folders.py \ ~/data/ena24-2022-06-15-v5a.0.0_megaclassifier.json \ ~/data/ENA24/images ~/data/ENA24-separated \ --threshold 0.17 --animal_threshold 0.2 --n_threads 10 \ --allow_existing_directory --render_boxes --line_thickness 10 --box_expansion 10 # No boxes, no classification (default) python separate_detections_into_folders.py \ ~/data/ena24-2022-06-15-v5a.0.0_megaclassifier.json \ ~/data/ENA24/images ~/data/ENA24-separated \ --threshold 0.17 --animal_threshold 0.2 --n_threads 10 --allow_existing_directory # With boxes, with classification python separate_detections_into_folders.py \ ~/data/ena24-2022-06-15-v5a.0.0_megaclassifier.json ~/data/ENA24/images ~/data/ENA24-separated \ --threshold 0.17 --animal_threshold 0.2 --n_threads 10 --allow_existing_directory \ --render_boxes --line_thickness 10 --box_expansion 10 \ --classification_thresholds "deer=0.75,cow=0.75,bird=0.75" # No boxes, with classification python separate_detections_into_folders.py \ ~/data/ena24-2022-06-15-v5a.0.0_megaclassifier.json ~/data/ENA24/images ~/data/ENA24-separated \ --threshold 0.17 --animal_threshold 0.2 --n_threads 10 --allow_existing_directory \ --classification_thresholds "deer=0.75,cow=0.75,bird=0.75" """ #%% Command-line driver def main(): # noqa parser = argparse.ArgumentParser() parser.add_argument('results_file', type=str, help='Input .json filename') parser.add_argument('base_input_folder', type=str, help='Input image folder') parser.add_argument('base_output_folder', type=str, help='Output image folder') parser.add_argument('--threshold', type=float, default=None, help='Default confidence threshold for all categories (defaults to ' + \ 'selection based on model version, other options may override this ' + \ 'for specific categories)') parser.add_argument('--animal_threshold', type=float, default=None, help='Confidence threshold for the animal category') parser.add_argument('--human_threshold', type=float, default=None, help='Confidence threshold for the human category') parser.add_argument('--vehicle_threshold', type=float, default=None, help='Confidence threshold for vehicle category') parser.add_argument('--classification_thresholds', type=str, default=None, help='List of classification thresholds to use for species-based folder ' + \ 'separation, formatted as, e.g., "deer=0.75,cow=0.75"') parser.add_argument('--n_threads', type=int, default=1, help='Number of threads to use for parallel operation (default=1)') parser.add_argument('--allow_existing_directory', action='store_true', help='Proceed even if the target directory exists and is not empty') parser.add_argument('--no_overwrite', action='store_true', help='Skip images that already exist in the target folder, must also ' + \ 'specify --allow_existing_directory') parser.add_argument('--skip_empty_images', action='store_true', help='Do not copy empty images to the output folder') parser.add_argument('--move_images', action='store_true', help='Move images (rather than copying) (not recommended this if you have not ' + \ 'backed up your data!)') parser.add_argument('--render_boxes', action='store_true', help='Render bounding boxes on output images; may result in some ' + \ 'metadata not being transferred') parser.add_argument('--line_thickness', type=int, default=default_line_thickness, help='Line thickness (in pixels) for rendering, only meaningful if ' + \ 'using render_boxes (defaults to {})'.format( default_line_thickness)) parser.add_argument('--box_expansion', type=int, default=default_box_expansion, help='Box expansion (in pixels) for rendering, only meaningful if ' + \ 'using render_boxes (defaults to {})'.format( default_box_expansion)) parser.add_argument('--category_names_to_blur', type=str, default=None, help='Comma-separated list of category names to blur ' + \ '(or a single category name, e.g. "person")') parser.add_argument('--remove_empty_folders', action='store_true', help='Remove all empty folders from the target folder at the end of the process, ' + \ 'whether or not they were created by this script') if len(sys.argv[1:])==0: parser.print_help() parser.exit() args = parser.parse_args() # Convert to an options object options = SeparateDetectionsIntoFoldersOptions() args_to_object(args, options) def validate_threshold(v,name): # print('{} {}'.format(v,name)) if v is not None: assert v >= 0.0 and v <= 1.0, \ 'Illegal {} threshold {}'.format(name,v) validate_threshold(args.threshold,'default') validate_threshold(args.animal_threshold,'animal') validate_threshold(args.vehicle_threshold,'vehicle') validate_threshold(args.human_threshold,'human') if args.threshold is not None: if args.animal_threshold is not None \ and args.human_threshold is not None \ and args.vehicle_threshold is not None: raise ValueError('Default threshold specified, but all category thresholds ' + \ 'also specified... not exactly wrong, but it\'s likely that you ' + \ 'meant something else.') options.category_name_to_threshold['animal'] = args.animal_threshold options.category_name_to_threshold['person'] = args.human_threshold options.category_name_to_threshold['vehicle'] = args.vehicle_threshold options.overwrite = (not args.no_overwrite) separate_detections_into_folders(options) if __name__ == '__main__': main()