Downloader in Python – Code Review Stack Exchange


I wrote this code in Python and it works fine for me, but I know it is not optimised and needs a lot of refactoring, so I would like a review of how it can be improved. I started writing it when the wget library was not working for me and I wanted a lightweight script for my other projects. I am also thinking of replacing the requests library with aiohttp. As a beginner, I look forward to your reviews.

Thank You.

import requests
import os
from uuid import uuid4
from urllib.parse import urlparse, unquote
import re
from datetime import datetime
from requests.exceptions import HTTPError, ReadTimeout,InvalidSchema
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm


class Rget:
    """Download one URL into a local directory.

    The response body is streamed into a uniquely named temporary file inside
    ``dest`` and renamed to its final name only after the transfer finished,
    so a partially downloaded file never appears under the final name.

    Args:
        url: Address of the resource to download.
        dest: Target directory. ``None`` (the default) means the current
            working directory at the time the instance is created. The
            directory is created if it does not exist.
        filename: Name to store the file under. When ``None`` the name is
            taken from the ``Content-Disposition`` header or, failing that,
            from the last segment of the final URL path.
        progress_bar: Whether to render a tqdm progress bar while downloading.
        headers: Optional mapping of extra HTTP request headers.
    """

    # Used when neither the caller, the Content-Disposition header nor the
    # URL path yields a usable file name.
    DEFAULT_FILENAME = 'download'

    def __init__(self, url, dest=None, filename=None, progress_bar=True, headers=None):
        self.url = url
        # Resolve the working directory per call: a ``dest=os.getcwd()``
        # default would be evaluated once, when the class is defined.
        self.dest = self.check_if_dir_exist(os.getcwd() if dest is None else dest)
        self.filename = filename
        self.progress_bar = progress_bar
        self.headers = headers
        # Filled in by download(); None when the server sends no usable
        # Content-Length header.
        self.file_size = None

    def check_if_dir_exist(self, dest):
        """Create ``dest`` (including parents) if it is missing and return it."""
        if not os.path.exists(dest):
            # exist_ok guards against another process creating the directory
            # between the check above and this call.
            os.makedirs(dest, exist_ok=True)
        return dest

    def detect_filename(self, url, response):
        """Set ``self.filename`` to a sanitised name for the download.

        Precedence: the name given to the constructor, then the ``filename=``
        parameter of the ``Content-Disposition`` header, then the last path
        segment of ``response.url``. If sanitising leaves nothing usable,
        ``DEFAULT_FILENAME`` is used.

        Args:
            url: Unused; kept so existing callers keep working.
            response: Object exposing ``headers`` (with ``.get``) and ``url``.
        """
        if self.filename is not None:
            candidate = self.filename
        else:
            disposition = response.headers.get('Content-Disposition') or ''
            # Matches ``filename=name`` and ``filename="name"``; the extended
            # ``filename*=`` form does not match and falls through to the URL.
            match = re.search(r'filename="?([^";]+)', disposition, re.IGNORECASE)
            if match:
                # Keep only the last component so a server-supplied path
                # cannot influence where the file ends up.
                candidate = os.path.basename(match.group(1).replace('\\', '/'))
            else:
                # Parse first, unquote second: an encoded '?' or '#' must not
                # be mistaken for the start of the query or fragment.
                candidate = os.path.basename(unquote(urlparse(response.url).path))

        cleaned = self.get_valid_filename(candidate)
        if cleaned in ('', '.', '..'):
            cleaned = self.DEFAULT_FILENAME
        self.filename = cleaned

    def get_valid_filename(self, filename):
        """Return ``filename`` converted to a string usable as a clean file name.

        Leading and trailing whitespace is removed, remaining spaces become
        underscores, and every character that is not alphanumeric, a dash, an
        underscore or a dot is dropped.

        Modelled on ``get_valid_filename`` from
        https://github.com/django/django/blob/master/django/utils/text.py
        """
        underscored = str(filename).strip().replace(' ', '_')
        return re.sub(r'[^-\w.]', '', underscored)

    def fix_existing_filename(self, filename, dest):
        """Return ``filename`` with a timestamp inserted before its extension.

        Meant for the case where ``filename`` already exists in the target
        directory; the caller performs that existence check. Names without an
        extension simply get the timestamp appended.

        Args:
            filename: The file name that is already taken.
            dest: Unused; kept so existing callers keep working.
        """
        stem, extension = os.path.splitext(filename)
        stamp = datetime.now().strftime('%m-%d-%Y_%I.%M.%S%p')
        return '{}_{}{}'.format(stem, stamp, extension)

    def requests_retry_session(self,
                               retries=3,
                               backoff_factor=0.3,
                               status_forcelist=(500, 502, 504),
                               session=None,
                               ):
        """Return a ``requests`` session that retries failed requests.

        NOTE: adapted from a third-party recipe whose origin was not recorded.

        Args:
            retries: Used as the total, read and connect retry limit.
            backoff_factor: Passed to ``Retry`` to space out the attempts.
            status_forcelist: HTTP status codes passed to ``Retry`` as the
                statuses that should be retried.
            session: Existing session to configure; a new one is created
                when omitted.
        """
        session = session or requests.Session()
        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session

    @staticmethod
    def _content_length(response):
        """Return the Content-Length of ``response`` as an int, or None."""
        raw = response.headers.get('Content-Length')
        if raw is None:
            return None
        try:
            return int(raw.strip())
        except ValueError:
            return None

    def download(self):
        """Download ``self.url`` into ``self.dest``.

        The body is streamed into a temporary file which is renamed to the
        detected (or user supplied) file name once complete. If a file with
        that name already exists, a timestamp is added to the new name. The
        temporary file is removed when the download fails.

        Returns:
            The final file name on success. On failure a human readable error
            message is returned instead of raising, as existing callers expect.
        """
        temp_path = os.path.join(self.dest, 'rget_' + str(uuid4()) + '.tmp')
        try:
            with self.requests_retry_session() as session:
                with session.get(self.url, stream=True, timeout=3,
                                 headers=self.headers) as response:
                    response.raise_for_status()
                    self.detect_filename(self.url, response)
                    # None when the server does not announce a size.
                    self.file_size = self._content_length(response)
                    with open(temp_path, 'wb') as temp:
                        with tqdm(
                            total=self.file_size,
                            initial=0,
                            unit='B',
                            desc=self.filename,
                            ascii=True,
                            unit_scale=True,
                            unit_divisor=1024,
                            disable=not self.progress_bar,
                        ) as progress:
                            for chunk in response.iter_content(chunk_size=8192):
                                temp.write(chunk)
                                progress.update(len(chunk))

            if os.path.exists(os.path.join(self.dest, self.filename)):
                self.filename = self.fix_existing_filename(self.filename, self.dest)
            os.rename(temp_path, os.path.join(self.dest, self.filename))
            return self.filename

        except ReadTimeout:
            return 'Maximum Retries reached, Check your internet connection and try again'

        except (requests.exceptions.RequestException, OSError):
            return 'Please check the url and try again'

        finally:
            # After a successful rename the temporary path no longer exists;
            # otherwise remove the partial file. Cleanup is best effort and
            # must not mask the result computed above.
            if os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError:
                    pass