Source code for megadetector.data_management.remove_exif

"""

remove_exif.py

Removes all EXIF/IPTC/XMP metadata from a folder of images, without making
backup copies, using pyexiv2.  Ignores non-jpeg images.

This module is rarely used, and pyexiv2 is not thread-safe, so pyexiv2 is not
included in package-level dependency lists.  YMMV.

"""

#%% Imports and constants

import os
import argparse

from megadetector.utils.path_utils import recursive_file_list

from multiprocessing.pool import Pool as Pool
from tqdm import tqdm


#%% Support functions

def remove_exif_from_image(fn):
    """
    Remove EXIF, IPTC, and XMP metadata from a single image, in place.

    pyexiv2 is not thread safe, do not call this function in parallel within a
    process.  Parallelizing across processes is fine.

    Args:
        fn (str): image file from which we should remove EXIF information

    Returns:
        bool: whether EXIF removal succeeded
    """

    # Imported lazily because pyexiv2 is an optional dependency of this package
    import pyexiv2 # type: ignore

    try:
        img = pyexiv2.Image(fn)
        try:
            img.clear_exif()
            img.clear_iptc()
            img.clear_xmp()
        finally:
            # Always release the underlying image handle, even if one of the
            # clear_* calls failed; otherwise the handle leaks on error.
            img.close()
    except Exception as e:
        # Best-effort: report the failure and let the caller move on to the next file
        print('EXIF error on {}: {}'.format(fn,str(e)))
        return False

    return True
#%% Remove EXIF data
def remove_exif(image_base_folder,recursive=True,n_processes=1):
    """
    Removes all EXIF/IPTC/XMP metadata from a folder of images, without making
    backup copies, using pyexiv2.  Ignores non-jpeg images.

    Per-file results are not collected; a file that fails is reported on stdout
    by remove_exif_from_image and processing continues with the next file.

    Args:
        image_base_folder (str): the folder from which we should remove EXIF data
        recursive (bool, optional): whether to process [image_base_folder] recursively
        n_processes (int, optional): number of concurrent workers.  Because pyexiv2 is not
            thread-safe, only process-based parallelism is supported.

    Raises:
        AssertionError: if [image_base_folder] is not an existing folder
    """

    # Fail fast, with an install hint, if the optional dependency is missing
    try:
        import pyexiv2 # type: ignore #noqa
    except Exception:
        print('pyexiv2 not available; try "pip install pyexiv2"')
        raise


    ##%% List files

    assert os.path.isdir(image_base_folder), \
        'Could not find folder {}'.format(image_base_folder)

    # Honor the caller's [recursive] choice; this was previously hard-coded to
    # True, which made recursive=False (and the CLI's --nonrecursive) a no-op.
    all_files = recursive_file_list(image_base_folder,
                                    recursive=recursive,
                                    return_relative_paths=False,
                                    convert_slashes=True)

    # Only JPEG files are processed; the extension match is case-insensitive
    image_files = [s for s in all_files if s.lower().endswith(('.jpg','.jpeg'))]


    ##%% Remove EXIF data (execution)

    if n_processes == 1:

        # fn = image_files[0]
        for fn in tqdm(image_files):
            remove_exif_from_image(fn)

    else:

        # pyexiv2 is not thread-safe, so we need to use processes
        pool = None
        try:
            print('Starting parallel process pool with {} workers'.format(n_processes))
            pool = Pool(n_processes)
            # list(...) drains the lazy imap iterator so every file is processed
            # and the progress bar advances
            _ = list(tqdm(pool.imap(remove_exif_from_image,image_files),total=len(image_files)))
        finally:
            if pool is not None:
                pool.close()
                pool.join()
                print('Pool closed and joined for EXIF removal')
# ...remove_exif(...)


#%% Command-line driver

def main():
    """
    Command-line entry point: parse arguments, then strip EXIF/IPTC/XMP
    metadata from the images in the requested folder.
    """

    # (flags, options) pairs, registered in the order they should appear in --help
    argument_specs = (
        (('image_base_folder',),
         {'type': str,
          'help': 'Folder to process for EXIF removal'}),
        (('--nonrecursive',),
         {'action': 'store_true',
          'help': "Don't recurse into [image_base_folder] (default is recursive)"}),
        (('--n_processes',),
         {'type': int,
          'default': 1,
          'help': 'Number of concurrent processes for EXIF removal (default: 1)'}),
    )

    cli = argparse.ArgumentParser(
        description='Removes EXIF/IPTC/XMP metadata from images in a folder'
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    parsed = cli.parse_args()
    target_folder = parsed.image_base_folder

    print('Processing folder: {}'.format(target_folder))

    # Reject a missing folder here, before any processing starts
    if not os.path.isdir(target_folder):
        raise ValueError('Folder not found at {}'.format(target_folder))

    # The CLI exposes the negated flag, so invert it for the library call
    remove_exif(
        image_base_folder=target_folder,
        recursive=not parsed.nonrecursive,
        n_processes=parsed.n_processes
    )

    print('Finished removing EXIF data')


if __name__ == '__main__':
    main()