python – Is there a way to run this without extracting the archive while still reading all the XML files to check for mhtml use in Office applications (CVE-2021-40444)?

I have a Python 3 script to detect the use of mhtml in XML tags. However, it's not as optimized as I would like, and I would like some suggestions, if possible.

This is in an effort to help mitigate and detect CVE-2021-40444. Right now, the script extracts the XML files from the document, and I was wondering if there is a way to do this without the extraction process. I have to run the extraction process twice to deal with a NoneType error when using `xmlfiles = (os.walk(output_file))`.

from zipfile import ZipFile
import os
import subprocess
import codecs
import sys
import time
# Banner: report the interpreter version and what this script is for.
print('Current Python version information:' + sys.version)
print('Purpose:CVE 2021-40444 scanning documents for use of mhtml\nAuthor:Patrick Gray\nThis was made for python version 3.9.7\n\n')

# Paths of files found to reference mhtml. This is a list (not a tuple) so
# that results can be appended to it.
mhtml_links = []
# Path of the Office document (a zip archive) to inspect.
input_file = input('Enter Path to document to process:')
# Directory the archive members are extracted into before scanning.
outfile = input('Enter path to export xml files to:')

def scan(document=None, export_dir=None):
    """Extract an Office document and report every file that mentions mhtml.

    The document is opened as a zip archive, extracted once into
    ``export_dir``, and every file found under that directory is read as
    text and searched for the substring ``mhtml`` (case-insensitively).
    Each matching line is printed, followed by a summary of the files
    that matched.

    Args:
        document: Path to the document to inspect. Defaults to the
            module-level ``input_file`` entered at the prompt.
        export_dir: Directory to extract into; created (including parents)
            if it does not exist. Defaults to the module-level ``outfile``.

    Returns:
        A list with the path of each file containing ``mhtml``, one entry
        per file, in the order ``os.walk`` visited them.

    Raises:
        zipfile.BadZipFile: If ``document`` is not a valid zip archive.
        OSError: If the document cannot be read or the directory cannot
            be created.
    """
    if document is None:
        document = input_file
    if export_dir is None:
        export_dir = outfile

    # The mode must be octal (0o755, not decimal 755). makedirs returns None,
    # so its result is never used as a path; exist_ok makes re-runs harmless.
    os.makedirs(export_dir, mode=0o755, exist_ok=True)

    # NOTE(review): the document is untrusted input. extractall() sanitises
    # member paths, but reading members in place with ZipFile.namelist() /
    # ZipFile.open() would avoid writing attacker-controlled files to disk.
    with ZipFile(document, 'r') as archive:
        archive.extractall(export_dir)

    print('\nPath Created successfully\n')

    # Matches are collected locally and returned, so the function does not
    # depend on module-level mutable state.
    flagged = []
    for root, _dirs, files in os.walk(export_dir, topdown=True):
        for name in files:
            path = os.path.join(root, name)
            matched = False
            # errors='ignore' lets binary members (images, OLE blobs) be
            # scanned without raising on undecodable bytes.
            with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
                for line in handle:
                    # URI schemes are case-insensitive, so "MHTML:" counts.
                    if 'mhtml' in line.lower():
                        matched = True
                        print('ATTACK DETECTED\n' + line)
            if matched:
                # Record the file once, however many lines matched.
                flagged.append(path)

    print('\n\n\n')
    print('files containing exploit:\n')
    for item in flagged:
        print(item)
    return flagged


if __name__ == '__main__':
    scan()