python parallel-rm-rf.py <dir> [thread_count]

· 参数说明
··· dir：要删除的目录
··· thread_count：并行进程数，可选，默认是 4

parallel-rm-rf.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

#
# copyright (c) 2015   Ben England, Red Hat, under Apache License
# see http://www.apache.org/licenses/LICENSE-2.0 for license details
#
# this script does equivalent of "rm -rf " command but in parallel on
# subdirectories

import os
import errno
import multiprocessing
import sys
import time

# verbose tracing is enabled by defining DEBUG (with any value) in the
# environment
debug = 'DEBUG' in os.environ

NOTOK = 1   # process exit status meaning failure


def usage(msg):
    """Print an error message plus the command syntax, then exit.

    msg -- description of what was wrong with the command line.

    Never returns: terminates the process with exit status NOTOK
    (by raising SystemExit).
    """
    print('ERROR: ' + msg)
    # the syntax line used to describe parallel-untar.py, a different script
    print('usage: parallel-rm-rf.py top-directory [ max-threads ]')
    sys.exit(NOTOK)


# python generator to recursively walk directory tree
# looking for all subdirectories,
# returning child directories before their parents
# this allows us to construct list of directories to delete
# in parallel with threads that delete them.

def find_subdirs(d):
    """Generate d and every directory beneath it, deepest first.

    Each directory is produced only after all of the directories below
    it, and d itself is always the final value.  Entries that are
    symbolic links are skipped, so links to directories are neither
    descended into nor reported.

    d -- pathname of the directory to scan.

    Any OSError raised by os.listdir() propagates to the caller.
    """
    for name in os.listdir(d):
        child = os.path.join(d, name)
        # only real (non-symlink) directories are walked
        if os.path.islink(child) or not os.path.isdir(child):
            continue
        for descendant in find_subdirs(child):
            yield descendant
    yield d


# parse command line inputs and display them

# Command line: parallel-rm-rf.py top-directory [ max-threads ]
# Results are left in the module-level names topdir, thread_count and
# start_time, which the rest of the script reads.

thread_count = 4
start_time = time.time()

# the argument count has to be checked before sys.argv[1] is touched,
# otherwise a missing argument surfaces as an IndexError traceback
# instead of the usage message
if len(sys.argv) < 2:
    usage('must supply top-level directory to delete')

topdir = sys.argv[1]
# workers decide whether a parent directory is still inside the tree by
# comparing pathname lengths against topdir; os.path.dirname() never
# returns a trailing separator, so one on topdir would make topdir itself
# look like it is outside the tree.  An empty string is left alone so that
# it is rejected by the isdir() check below rather than turned into os.sep.
if len(topdir) > 1:
    topdir = topdir.rstrip(os.sep) or os.sep

if len(sys.argv) > 2:
    try:
        thread_count = int(sys.argv[2])
    except ValueError:
        usage('could not parse thread count %s' % sys.argv[2])
    # with no workers there is nobody to hand directories to
    if thread_count < 1:
        usage('thread count must be at least 1, got %d' % thread_count)

print('deleting directory tree %s with up to %d parallel threads' %
      (topdir, thread_count))
if not os.path.isdir(topdir):
    print('parallel-rm-rf.py does not work on anything other than a directory')
    sys.exit(NOTOK)


# we use the multiprocessing module to create subprocesses so that the
# python GIL (Global Interpreter Lock) cannot
# get in the way of parallel processing
# this class uses a pipe to receive pathnames of directories to delete
# the parent thread is responsible for partitioning directories amongst
# subprocesses
# each thread sends back a tuple at the end that contains counters
# so parent can print out how the work was divided

class rmThread(multiprocessing.Process):
    """Worker subprocess that empties and removes the directories it is sent.

    Directory pathnames are received one at a time on child_conn; the
    pathname os.sep is the signal that no more work is coming.  Before
    returning, run() sends one tuple back on child_conn:
    (file_count, dir_count, dir_remove_collisions, dir_remove_nonempty).

    Reads the module-level names topdir (directory at which upward
    removal stops) and debug (enables tracing output).
    """

    def __init__(self, parent_conn_in, child_conn_in, index_in):
        """Record the pipe ends and worker number; zero the counters.

        parent_conn_in -- pipe end the parent uses to talk to this worker
        child_conn_in  -- pipe end this worker reads pathnames from and
                          writes its counters to
        index_in       -- worker number, used in messages
        """
        multiprocessing.Process.__init__(self)
        self.index = index_in
        self.parent_conn = parent_conn_in
        self.child_conn = child_conn_in
        self.file_count = 0             # non-directory entries unlinked
        self.dir_count = 0              # directories removed
        self.dir_remove_collisions = 0  # ENOENT: entry was already gone
        self.dir_remove_nonempty = 0    # rmdir refused: directory not empty

    def __str__(self):
        """Return a short description: worker index and removal counters."""
        # only attributes assigned in __init__ may be used here; this used
        # to read self.thread_count and self.dirlist, which are never set
        return 'rmThread index=%d files=%d directories=%d' % (
            self.index, self.file_count, self.dir_count)

    def run(self):
        """Process directories from the pipe until os.sep is received.

        For each directory: unlink every entry that is not a real
        subdirectory, then remove the directory and as many of its now
        empty ancestors as possible, never going above topdir.

        ENOENT is counted as a collision and a non-empty rmdir is counted
        and left for later.  Any other OSError propagates out of run(),
        in which case the counters are never sent.
        """
        while True:
            d = self.child_conn.recv()
            if d == os.sep:
                break
            if debug:
                print('thread %d dir %s' % (self.index, d))
            try:
                dir_contents = os.listdir(d)
            except OSError as e:
                if e.errno == errno.ENOENT:
                    self.dir_remove_collisions += 1
                    # not a problem, someone else might have removed
                    continue
                raise

            # delete contents of directory
            # rather than have competing threads lock directories,
            # we rely on the filesystem to handle cases
            # where two threads attempt to delete at same time
            # one of the threads will get ENOENT in this case,
            # but that's ok, doesn't matter

            for dentry in dir_contents:
                de_path = os.path.join(d, dentry)
                # real subdirectories are handled as their own work items;
                # symlinks (even to directories) are unlinked like files
                if (not os.path.islink(de_path)) and os.path.isdir(de_path):
                    continue
                if debug:
                    print('%d deleting entry %s' % (self.index, de_path))
                try:
                    os.unlink(de_path)
                    self.file_count += 1
                except OSError as e:
                    if e.errno == errno.ENOENT:
                        self.dir_remove_collisions += 1
                        continue
                    raise

            # delete directory and empty parent directories up to topdir
            # we can't delete d if it contains a subdirectory
            # (that hasn't been deleted yet)
            # that's ok, we'll get ENOTEMPTY and stop
            # other threads could be doing this same thing
            # (e.g. thread that deleted child of d)
            # again, rely on filesystem to deal with this,
            # one thread gets an ENOENT exception
            # that's ok, just stop

            # every pathname handled here lies under topdir, so one that
            # is shorter than topdir must be above it
            while len(d) >= len(topdir):
                try:
                    os.rmdir(d)
                    self.dir_count += 1
                    if debug:
                        print('thread %d deleted directory %s' %
                              (self.index, d))
                except OSError as e:
                    # POSIX lets rmdir() report a non-empty directory as
                    # either ENOTEMPTY or EEXIST
                    if e.errno in (errno.ENOTEMPTY, errno.EEXIST):
                        self.dir_remove_nonempty += 1  # ok, will delete later
                        break
                    if e.errno == errno.ENOENT:
                        self.dir_remove_collisions += 1  # other thread did it
                        break
                    raise
                d = os.path.dirname(d)

        self.child_conn.send((self.file_count, self.dir_count,
                             self.dir_remove_collisions,
                             self.dir_remove_nonempty))
        if debug:
            print('child exiting: ' + str(worker_pool))


# MAIN PROGRAM -- create & start worker threads, wait for them to finish

worker_pool = []
for n in range(thread_count):
    (parent_conn, child_conn) = multiprocessing.Pipe()
    t = rmThread(parent_conn, child_conn, n)
    worker_pool.append(t)
    t.daemon = True
    t.start()
    # the worker has its own copy of child_conn now; close ours so that a
    # worker that dies shows up here as EOFError / a broken pipe instead
    # of leaving this process blocked on the pipe forever
    child_conn.close()
if debug:
    print('thread pool: ' + str(worker_pool))

# round-robin schedule child threads to process directories
# FIXME: we could do something much more intelligent later on
# like scheduling based on total file count assigned to each thread

index = 0
for d in find_subdirs(topdir):
    worker_pool[index].parent_conn.send(d)
    index = (index + 1) % thread_count

elapsed_time = time.time() - start_time
print('constructed directory list and awaiting thread completions ' +
      'after %9.2f sec' % elapsed_time)

total_dirs = 0
total_files = 0
failed_workers = 0
for worker in worker_pool:
    try:
        worker.parent_conn.send(os.sep)  # tell child that we're done
        counters = worker.parent_conn.recv()
    except (EOFError, IOError, OSError):
        # the worker went away without sending its counters
        counters = None
    worker.join()  # wait for child to exit
    if counters is None:
        failed_workers += 1
        print('thread %d exited with status %s before reporting results' %
              (worker.index, worker.exitcode))
        continue
    (w_file_count, w_dir_count, w_dir_remove_collisions,
     w_dir_remove_nonempty) = counters
    print(('after %7.2f sec thread %d removed %d files and %d dirs ' +
           'with %d collisions and %d non-empty dirs') % (
          time.time() - start_time,
          worker.index,
          w_file_count,
          w_dir_count,
          w_dir_remove_collisions,
          w_dir_remove_nonempty))
    total_dirs += w_dir_count
    total_files += w_file_count

elapsed_time = time.time() - start_time
print('elapsed time = %7.2f sec' % elapsed_time)
# a coarse clock can report zero elapsed time for a tiny tree
fps = total_files / elapsed_time if elapsed_time > 0 else 0.0
print('files per second = %8.2f' % fps)
dps = total_dirs / elapsed_time if elapsed_time > 0 else 0.0
print('directories per second = %8.2f' % dps)

# make the failure visible to whoever invoked the script
if failed_workers:
    print('%d of %d threads failed' % (failed_workers, thread_count))
    sys.exit(NOTOK)

脚本传送门