Subsample directories of files

This script trims one or more folders of files down to a fixed number of files each, choosing at random which files to delete. My use case was selecting a subsample of training images for CV research, but it can be used for whatever you like!

#!/usr/bin/python
# Programmed by Alex Knaust 2013-03-12

import os, sys, random


def sampleFiles(k, dirs):
	'''Pick a random sample of files to delete so that k remain per directory.

	Nothing is deleted here; the caller removes the returned paths. Each
	directory listing is sorted and the sorted listings are zipped together,
	so files are grouped by their position in sorted order. The directories
	are therefore expected to hold identical filenames and identical file
	counts.

	k    -- number of files to keep in each directory
	dirs -- sequence of directory paths

	Returns a list of tuples, one path per directory in each tuple, holding
	(file count - k) tuples in total.

	Prints a message and exits with status 1 if k is not smaller than the
	number of files, or if the directories hold different numbers of files.
	'''
	listings = [sorted(os.path.join(d, p) for p in os.listdir(d)) for d in dirs]
	totalFiles = len(listings[0])
	if k >= totalFiles:
		print('Not enough files to sample from')
		sys.exit(1)

	# zip() silently truncates to the shortest listing, which would pair up
	# unrelated files; refuse to continue rather than delete the wrong ones.
	if any(len(listing) != totalFiles for listing in listings):
		print('Directories do not contain the same number of files')
		sys.exit(1)

	# list() is needed because random.sample requires a sequence and zip()
	# returns an iterator on Python 3.
	groups = list(zip(*listings))
	return random.sample(groups, totalFiles - k)
		

if __name__ == '__main__':
	# Usage: <script> K DIR [DIR ...]
	# Keeps K files in each DIR and deletes the rest after confirmation.
	if len(sys.argv) < 3:
		print('Pass number of files to sample, and directories to sample from')
		sys.exit(1)

	try:
		k = int(sys.argv[1])
	except ValueError:
		# A non-numeric count gets the usage message instead of a traceback.
		print('Pass number of files to sample, and directories to sample from')
		sys.exit(1)
	dirs = sys.argv[2:]
	destroy = sampleFiles(k, dirs)

	# raw_input only exists on Python 2; input is its Python 3 equivalent.
	try:
		ask = raw_input
	except NameError:
		ask = input

	# Each entry of destroy holds one path per directory, hence the product.
	ans = ask('Will delete %d files, OK? y/n : ' % (len(destroy) * len(dirs)))
	# Deletion is irreversible, so only an explicit "y..." answer proceeds;
	# an empty or unrecognised answer aborts instead of crashing or deleting.
	if not ans.strip().lower().startswith('y'):
		print('Goodbye')
		sys.exit(1)

	# do the actual deletion
	for files in destroy:
		for f in files:
			os.remove(f)

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>