Subsample directories of files

This script lets you trim one or more folders of files down to a fixed number of files each, chosen by random sampling. My use case was selecting a subsample of training images for CV research, but it could be used for whatever you like!

# Programmed by Alex Knaust 2013-03-12

import os, sys, random

def sampleFiles(k, dirs):
	'''Choose a random sample of files to delete, leaving k in each directory.

	Nothing is deleted here; the function only returns the selection so the
	caller can confirm before removing anything.

	The directories must have identical filenames and identical filecounts:
	files are paired across directories by their position in the sorted
	listing, so the i-th file of every directory is kept or dropped together.

	Args:
		k: number of files that should remain in each directory.
		dirs: list of directory paths to sample from.

	Returns:
		A list of tuples. Each tuple holds one path per directory (in the
		order of dirs) and the list has totalFiles - k entries. An empty list
		is returned when dirs is empty or when k is not smaller than the
		number of files available.

	Raises:
		ValueError: if the directories do not all hold the same number of
			files, or if k is negative (random.sample cannot draw more
			groups than exist).
	'''
	listings = [sorted(os.path.join(d, p) for p in os.listdir(d)) for d in dirs]
	if not listings:
		return []

	totalFiles = len(listings[0])
	# zip() would silently truncate to the shortest listing and pair up the
	# wrong files, so refuse to continue when the counts disagree.
	if any(len(files) != totalFiles for files in listings):
		raise ValueError('Directories do not contain the same number of files')

	if k >= totalFiles:
		print('Not enough files to sample from')
		# Nothing can be removed while still leaving k files behind.
		return []

	# zip(*listings) groups the i-th path of every directory into one tuple;
	# list() is required because random.sample needs a real sequence.
	groups = list(zip(*listings))
	return random.sample(groups, totalFiles - k)

if __name__ == '__main__':
	# Expected arguments: the number of files to keep, then one or more
	# directories to trim.
	if len(sys.argv) < 3:
		print('Pass number of files to sample, and directories to sample from')
		# Stop here: without both arguments sys.argv[1] below would fail.
		sys.exit(1)
	k = int(sys.argv[1])
	dirs = sys.argv[2:]
	destroy = sampleFiles(k, dirs)

	# raw_input only exists on Python 2; fall back to input on Python 3.
	try:
		ask = raw_input
	except NameError:
		ask = input
	ans = ask('Will delete %d files, OK? y/n : ' % (len(destroy) * len(dirs))).strip()
	# An empty answer counts as "no" so that nothing is deleted by accident.
	if not ans or ans[0].lower() == 'n':
		print('Goodbye')
		# Leave without touching any file.
		sys.exit(0)

	#do the actual deletion
	for files in destroy:
		# Each entry holds the matching file from every directory.
		for f in files:
			os.remove(f)

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>