I have a python3 script to detect the use of mhtml in xml tags. However, it’s not as optimized as I would like and would like some suggestions, if possible.
This is in an effort to help mitigate and detect CVE-2021-40444. Right now, the script extracts the xml files from the document and I was wondering if there is a way to do this without the extraction process. I have to run the extraction process twice to deal with a None type error when using
xmlfiles = (os.walk(output_file))
from zipfile import ZipFile import os import subprocess import codecs import sys import time print('Current Python version information:'+sys.version) print('Purpose:CVE 2021-40444 scanning documents for use of mhtmlnAuthor:Patrick GraynThis was made for python version 3.9.7nn') mhtml_links=() input_file=input('Enter Path to document to process:') outfile=input('Enter path to export xml files to:') def scan(): try: output_file = os.mkdir(outfile, 755) except FileExistsError: output_file = outfile pass with ZipFile(input_file, 'r') as zipObj: zipObj.extractall(outfile) print('nPath Created successfullyn') try: xmlfiles = (os.walk(output_file)) except TypeError: print('No files detected checking output file again') try: output_file = os.mkdir(outfile, 755) except FileExistsError: output_file = outfile pass with ZipFile(input_file, 'r') as zipObj: zipObj.extractall(outfile) for (root, dirs, files) in os.walk(outfile, topdown=True): for name in files: with codecs.open(os.path.join(root,name), 'r', encoding='utf-8', errors='ignore') as read_file: lines = read_file.readlines() for line in lines: if 'mhtml' in line: mhtml_links.append(str(os.path.join(root,name))) print('ATTACK DETECTEDn'+line) print('nnn') print('files containing exploit:n') for item in mhtml_links: print(item) scan()