python – parallel processing of files from a directory

This code processes the files read from a directory in parallel. It divides the directory's files into as many chunks as there are CPU cores and processes those chunks in parallel; the number of cores is that of the Linux system the script runs on.

from multiprocessing import Process
import os
from time import sleep
import multiprocessing

# Handles of the worker processes that have been started, kept so they can be
# joined later. This must be a list (not a tuple) because handles are appended.
pros = []

def getFiles(directory):
    '''Lazily yield the absolute path of every file found under directory.

    The tree is traversed with os.walk, so files located in nested
    sub-directories are yielded as well.
    '''
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name)) for name in names)

def countFiles(directory):
    '''Return the number of regular files located directly in directory.

    Only the immediate entries of directory are inspected; files inside
    sub-directories are not counted. Symbolic links that point to regular
    files are counted, as os.path.isfile would do.

    Raises:
        OSError: if directory does not exist or cannot be listed.
    '''
    # len() cannot be applied to a generator expression, so count by summing
    # instead of building a throwaway list.
    with os.scandir(directory) as entries:
        return sum(1 for entry in entries if entry.is_file())

def foo(fileList):
    '''Worker body: print the chunk of files it was handed to standard output.'''
    print(fileList)

def isCommon(a, b):
    '''Return True when a and b share at least one element, otherwise False.

    Both arguments are converted to sets first, so their elements must be
    hashable.
    '''
    return bool(set(a) & set(b))

def splitIntoChunks(files, noChunks):
    '''Split files into at most noChunks contiguous, non-overlapping lists.

    Chunk sizes differ by at most one: when the files cannot be divided
    evenly, the first len(files) % noChunks chunks receive one extra file
    each, so no single worker ends up with all the leftovers. Empty chunks
    are never produced, hence fewer than noChunks lists are returned when
    there are fewer files than chunks.

    Args:
        files: iterable of items (file paths) to distribute.
        noChunks: maximum number of chunks to produce; must be >= 1.

    Returns:
        A list of lists that together contain every item exactly once, in
        the original order.

    Raises:
        ValueError: if noChunks is smaller than 1.
    '''
    if noChunks < 1:
        raise ValueError("noChunks must be at least 1")

    files = list(files)
    # Integer arithmetic only: a float chunk size would never compare equal
    # to an item counter.
    baseSize, leftover = divmod(len(files), noChunks)

    chunks = []
    start = 0
    for index in range(noChunks):
        size = baseSize + (1 if index < leftover else 0)
        if size == 0:
            # Fewer files than chunks: every remaining chunk would be empty.
            break
        chunks.append(files[start:start + size])
        start += size
    return chunks


def main():
    '''Process the files under directory in parallel, one process per chunk.'''
    # Directory whose files are processed; empty placeholder, set before use.
    directory = ""
    noCores = multiprocessing.cpu_count()

    # Materialise the listing once so the total and the chunks are derived
    # from the very same set of files (getFiles also walks sub-directories).
    files = list(getFiles(directory))
    listOFFiles = splitIntoChunks(files, noCores)
    print("total files", len(files), "total chunks", len(listOFFiles), "chunk size", len(files) // noCores)

    workers = []
    for filesList in listOFFiles:
        p = Process(target=foo, args=(filesList,))
        workers.append(p)
        p.start()

    # Wait for every worker before running the sanity check below.
    for worker in workers:
        worker.join()

    # Sanity check: neighbouring chunks must not share any file.
    for a, b in zip(listOFFiles, listOFFiles[1:]):
        assert not isCommon(a, b)


if __name__ == "__main__":
    main()