This script trims one or more folders down to a fixed number of files by deleting a random selection of the rest. My use case was selecting a subsample of training images for CV research, but it can be used for whatever you like!
#!/usr/bin/python
# Programmed by Alex Knaust 2013-03-12
import os, sys, random
def sampleFiles(k, dirs):
    '''Pick the files to delete so that k remain in each directory.

    Nothing is deleted here; the caller removes the returned paths.

    The directories are expected to hold identically named files. Files
    are paired across directories by their position in each sorted
    listing, so one random draw selects the same entry in every
    directory.

    k    -- number of files to keep in each directory
    dirs -- sequence of directory paths to sample from

    Returns a list of tuples; each tuple holds one path per directory,
    in the order of dirs. An empty dirs gives an empty list.

    Prints a message and exits with status 1 when the directories do not
    all contain the same number of files, when k is negative, or when k
    is not smaller than the number of files available.
    '''
    if not dirs:
        return []

    # List every directory exactly once so the count check and the pairing
    # below work from the same snapshot.
    listings = [sorted(os.path.join(d, p) for p in os.listdir(d)) for d in dirs]
    totalFiles = len(listings[0])

    if any(len(listing) != totalFiles for listing in listings):
        # zip() would silently drop the unmatched tail and leave more than
        # k files behind in the larger directories.
        print('Directories do not contain the same number of files')
        sys.exit(1)
    if k < 0:
        print('Number of files to keep must not be negative')
        sys.exit(1)
    if k >= totalFiles:
        print('Not enough files to sample from')
        sys.exit(1)

    # random.sample needs a real sequence; zip() is lazy on Python 3.
    groups = list(zip(*listings))
    return random.sample(groups, totalFiles - k)
if __name__ == '__main__':
    # Command line: <number of files to keep> <dir> [<dir> ...]
    if len(sys.argv) < 3:
        print('Pass number of files to sample, and directories to sample from')
        sys.exit(1)

    try:
        k = int(sys.argv[1])
    except ValueError:
        print('Number of files to sample must be an integer')
        sys.exit(1)
    dirs = sys.argv[2:]

    # Each entry of destroy is a tuple with one path per directory.
    destroy = sampleFiles(k, dirs)

    # raw_input only exists on Python 2; Python 3 names the same call input.
    try:
        ask = raw_input
    except NameError:
        ask = input
    ans = ask('Will delete %d files, OK? y/n : ' % (len(destroy) * len(dirs)))

    # Deletion cannot be undone, so only an explicit "y..." goes ahead.
    # An empty reply (just Enter) or anything else counts as "no".
    if not ans.strip().lower().startswith('y'):
        print('Goodbye')
        sys.exit(1)

    # do the actual deletion
    for files in destroy:
        for f in files:
            os.remove(f)