PDF parser in Python – Code Review Stack Exchange

Self-taught and inspired by David Beazley's code (I am trying to improve my Python skills), I would like to get your feedback on this parser code.

NB: The lazy property lets the value be computed only once, the first time it is used (https://github.com/dabeaz/python-cookbook/blob/master/src/8/lazily_computed_attributes/example1.py)



class lazyproperty:
    """Descriptor that computes an attribute on first access and caches it.

    The wrapped function is called with the instance the first time the
    attribute is read; its result is then stored on the instance under the
    function's own name.
    """

    def __init__(self, func):
        self.func = func

    def __get__(self, instance, cls):
        if instance is not None:
            result = self.func(instance)
            # The stored value shadows this descriptor (no __set__ is
            # defined), so later reads never come back here.
            setattr(instance, self.func.__name__, result)
            return result
        # Accessed on the class itself: hand back the descriptor.
        return self


class PDFParser():
    """
    Access the text and the images of one page of a PDF file.

    The file is opened with ``fitz`` and the requested page is loaded in the
    constructor. Text, pixmaps and image arrays are ``lazyproperty``
    attributes, so each of them is computed the first time it is read and
    cached on the instance afterwards.

    param filepath: path of the PDF file to open
    param page_num: index of the page to work on (0 by default)
    """
    def __init__(self,filepath,page_num=0):
        self.filepath = filepath
        self.page_num = page_num
        try:
            self._doc = fitz.open(filepath)
            # A document is indexed (not called) to obtain one of its pages.
            self._page = self._doc[page_num]
        except Exception as e:
            # Best effort: the failure is only reported. ``_doc`` / ``_page``
            # may be missing afterwards, so later attribute reads will fail.
            print("Lecture PDF impossible. {}".format(e))

    @lazyproperty
    def text(self):
        """Text of the page as returned by the page's ``getText()``."""
        return self._page.getText()

    @lazyproperty
    def _pixs(self):
        """List with one ``fitz.Pixmap`` per entry of the page's image list."""
        imgs = self._doc.getPageImageList(self.page_num)
        # The first item of every image entry is the xref used to build
        # the pixmap.
        return [fitz.Pixmap(self._doc, img[0]) for img in imgs]

    @lazyproperty
    def _pixpage(self):
        """Pixmap of the whole page, rendered with the gray colorspace."""
        return self._page.getPixmap(colorspace=fitz.csGRAY)

    @property
    def img(self):
        """First image of the page (raises IndexError when there is none)."""
        return self.imgs[0]

    @lazyproperty
    def imgs(self):
        """List of the page images, each converted with ``pix2np``."""
        return [self.pix2np(pix) for pix in self._pixs]

    def write(self,outputdir,fullpage=False):
        """
        Write the page, or the images it contains, as PNG files.

        param outputdir: directory receiving the PNG files
        param fullpage: when True, write only the gray rendering of the whole
            page as ``<name>_p<page>.png``; otherwise write every image of
            the page as ``<name>_p<page>_i<index>.png``
        """
        filename = os.path.basename(self.filepath).split('.pdf')[0]

        def writePNG(pix,filepath):
            try:
                # This is GRAY or RGB
                pix.writePNG(filepath)
            except Exception:
                # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(filepath)

        if fullpage:
            filepath = os.path.join(outputdir,'{}_p{}.png'.format(filename,self.page_num))
            writePNG(self._pixpage,filepath)
            return
        for i,pix in enumerate(self._pixs):
            filepath = os.path.join(outputdir,'{}_p{}_i{}.png'.format(filename,self.page_num,i))
            writePNG(pix,filepath)

    @staticmethod
    def pix2np(pix):
        """
        Convert pixmap to image np.ndarray
        https://stackoverflow.com/questions/53059007/python-opencv
        param pix: pixmap (its ``samples``, ``h``, ``w`` and ``n`` are read)
        return: contiguous uint8 array of shape (h, w, 3) whose last axis is
            reordered with the indices (2, 1, 0)
        """
        import numpy as np
        #https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        try:
            im = np.ascontiguousarray(im[..., [2, 1, 0]])  # rgb to bgr
        except IndexError:
            # Fewer than three channels: expand gray to three channels first,
            # then apply the same channel reordering.
            logger.warning("Shape of image array is {}".format(im.shape))
            im = cv2.cvtColor(im,cv2.COLOR_GRAY2RGB)
            im = np.ascontiguousarray(im[..., [2, 1, 0]])
        return im

node.js – Rating controller – Code Review Stack Exchange

Rating controller using Express.js and Mongoose to control the rating of a product. I create the rating and then update the product's rating, but I do both the create and the update in the same route endpoint. Is this a bad idea?

const express = require('express');
const {body,validationResult} = require('express-validator');

const Rating = require('../../models/rating/rating');
const Product = require('../../models/product/product');


exports.createRating = async (req,res) => {
    const errors = validationResult(req);
    if(!errors.isEmpty()){
        return res.status(422).json({errors: errors.array()})
    }
  
    const {idProduct,rating} = req.body;
    
    try {
        const newRating = await new Rating({
            idProduct,
            rating
        });
        await newRating.save();

        const product = await updateRating(idProduct);

        return res.status(200).json(product);
        
    } catch (error) {
        res.json(error.message)
    }
}
/**
 * Recomputes the rating of a product and stores it on the product.
 *
 * Errors are deliberately not caught here: they propagate to the caller,
 * which decides how to report them.
 *
 * @param {*} idProduct - Identifier of the product to update.
 * @returns {Promise<object>} Result of `Product.updateOne`.
 */
const updateRating = async (idProduct) => {
    const avgRating = await getRatingByProduct(idProduct);

    return Product.updateOne(
        {'_id' : idProduct},
        {$set : { rating : avgRating }},
    );
}
/**
 * Computes the average of the `rating` field over all ratings of a product.
 *
 * Errors raised by the aggregation propagate to the caller instead of being
 * returned as if they were a rating.
 *
 * @param {*} idProduct - Identifier matched against the `idProduct` field.
 * @returns {Promise<number|null>} The average, or `null` when no rating matched.
 */
const getRatingByProduct = async idProduct => {
    // NOTE(review): aggregation pipelines are not cast to the schema types.
    // If `idProduct` is stored as an ObjectId, a string id coming from the
    // request body will not match -- confirm against the Rating schema.
    const result = await Rating.aggregate([
        {$match : {idProduct: idProduct}},
        {$group : {_id: null, amount: {$avg: "$rating"}}}
    ]);

    // $group emits no document at all when $match selected nothing.
    return result.length > 0 ? result[0].amount : null;
}
exports.validator = () => {
    return (
        body('rating')
            .not().isEmpty()
            .custom(value => {
                if(value < 0 || value > 5) return Promise.reject('Rating must be gr than 0');
                return true;
            })
    )
}
 

Downloader in Python – Code Review Stack Exchange

I wrote this code in Python and it works fine for me, but I know that it is not optimised and that a lot of refactoring needs to be done, so I would like a review of how it can be improved. I started writing it when the wget library was not working for me and I wanted a lightweight script for my other projects. I am also thinking of replacing the requests library with aiohttp. Being a beginner in this, I look forward to your reviews.

Thank You.

import requests
import os
from uuid import uuid4
from urllib.parse import urlparse, unquote
import re
from datetime import datetime
from requests.exceptions import HTTPError, ReadTimeout,InvalidSchema
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm


class Rget:
  """
  Small file downloader built on top of ``requests``.

  url: address of the file to download.
  dest: directory receiving the file; it is created when missing. The
    default is the working directory at the time the class was defined.
  filename: name to save the file under; detected from the response when
    left to None.
  progress_bar: show a tqdm progress bar while downloading.
  headers: optional extra HTTP headers sent with the request.
  """
  def __init__(self, url, dest=os.getcwd(), filename=None, progress_bar=True, headers=None):
    self.url = url
    self.dest = self.check_if_dir_exist(dest)
    self.filename = filename
    self.progress_bar = progress_bar
    self.headers = headers

  def check_if_dir_exist(self, dest):
    """
    Function to check whether the directory exist.
    If Directory is not present it creates one and returns the path.
    """
    if not os.path.exists(dest):
      # exist_ok covers the directory being created by someone else between
      # the check above and this call.
      os.makedirs(dest, exist_ok=True)
    return dest

  def detect_filename(self, url, response):
    """
    Function to autodetect file name from url and content disposition
    headers. The result is sanitised and stored in ``self.filename``.

    A file name given to the constructor takes precedence; otherwise the
    ``filename=`` part of the Content-Disposition header is used, and the
    last path component of the response url as a fallback.
    """
    if self.filename is not None:
      self.filename = self.get_valid_filename(self.filename)
      return

    # The header is optional: treat a missing one as empty.
    disposition = response.headers.get('Content-Disposition') or ''
    if 'filename=' in disposition:
      filename = disposition.split('filename=')[1].split(';')[0].replace('"', '')
    else:
      filename = os.path.basename(urlparse(unquote(response.url)).path)

    self.filename = self.get_valid_filename(filename)

  def get_valid_filename(self, filename):
    """
    Return the given string converted to a string that can be used for a clean
    filename. Remove leading and trailing spaces; convert other spaces to
    underscores; and remove anything that is not an alphanumeric, dash,
    underscore, or dot.

    https://github.com/django/django/blob/master/django/utils/text.py
    """
    s = str(filename).strip().replace(' ', '_')
    # Dropping every other character also removes path separators, so a
    # name taken from the server cannot point outside ``dest``.
    return re.sub(r'(?u)[^-\w.]', '', s)

  def fix_existing_filename(self, filename, dest):
    """
    Function that builds an alternative name for a file that is already
    downloaded (exists): the current timestamp is inserted between the
    name and its extension. Names without an extension get the timestamp
    appended.
    """
    name, ext = os.path.splitext(filename)
    stamp = datetime.now().strftime('%m-%d-%Y_%I.%M.%S%p')
    return name + '_' + stamp + ext

  def requests_retry_session(self,
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
    ):
    """
    Return a requests session whose http:// and https:// adapters retry
    failed requests.

    retries: number of total, read and connect retries (3 by default).
    backoff_factor: backoff factor handed to ``Retry``.
    status_forcelist: HTTP status codes handed to ``Retry``.
    session: existing session to configure; a new one is created when None.
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

  def download(self):
    """
    Function to download file into a temporary file and rename
    it to user provided filename or autodetected filename.

    Returns the final file name on success and a human readable error
    message when the download fails.
    """
    temp_path = os.path.join(self.dest, 'rget_'+str(uuid4())+'.tmp')
    try:
      with self.requests_retry_session() as session, \
           session.get(self.url, stream=True, timeout=3, headers=self.headers) as response:
        response.raise_for_status()
        self.detect_filename(self.url, response)

        # Content-Length is optional; tqdm accepts total=None for an
        # unknown size.
        content_length = response.headers.get('Content-Length')
        self.file_size = int(content_length.strip()) if content_length else None

        with open(temp_path, 'wb') as temp:
          with tqdm(
            total=self.file_size,
            initial=0,
            unit='B',
            desc=self.filename,
            ascii=True,
            unit_scale=True,
            unit_divisor=1024,
            disable=not self.progress_bar,
          ) as progressBar:

            for chunk in response.iter_content(chunk_size=8192):
              temp.write(chunk)
              progressBar.update(len(chunk))

        if os.path.exists(os.path.join(self.dest, self.filename)):
          self.filename = self.fix_existing_filename(self.filename, self.dest)
        os.rename(temp_path, os.path.join(self.dest, self.filename))

      return self.filename

    except ReadTimeout:
        return('Maximum Retries reached, Check your internet connection and try again')

    except Exception:
      return 'Please check the url and try again'

    finally:
      # A successful download has been renamed away, so anything still
      # found here is the leftover of a failed one.
      if os.path.exists(temp_path):
        try:
          os.remove(temp_path)
        except OSError:
          # Best-effort cleanup: never mask the result of the download.
          pass