python – Text Normalizer on large text files


I am working on a text normalizer. It works just fine on small text files but takes a very long time on large ones (5 MB or more).

Is there anything I can change in the code to make it run faster on large text files? My guess is that the problem is somewhere in __preprocess(tmp) or __prenormalise(text).
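
If it helps, here is a rough sketch of how a run could be profiled to see where the time actually goes (the module name `normaliser.py` and the input file `big.txt` are placeholders for my actual setup):

```
import cProfile
import pstats

from normaliser import TextNormaliser  # assuming the class below is saved as normaliser.py

tn = TextNormaliser(False)

def run(path):
    # Normalise the file line by line, the same way the __main__ block does
    with open(path) as fd:
        for line in fd:
            tn.normalise(line.strip())

# Profile one run and show the 10 most expensive calls by cumulative time
cProfile.run("run('big.txt')", 'norm.prof')
pstats.Stats('norm.prof').sort_stats('cumulative').print_stats(10)
```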

Thank you for your time

```
# -*- coding: utf-8 -*-
import re
import sys
import json
import os
import codecs
import copy
from num2words import num2words
from text_unidecode import unidecode
import argparse

class TextNormaliser:
    def __init__(self, debug=False):
        """
        Args:
            debug (bool, optional): Debug mode
        """
        self.debug = debug
        self.abbreviations = {}
        self.acronyms = {}
        self.currencies = {}

        self.months = (
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december')

        self.number_scale = (
            'thousand', 'thousands', 'million', 'millions',
            'billion', 'billions', 'trillion', 'trillions')

        path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(path, 'resources', 'abbreviations.json')) as jf:
            self.abbreviations = json.load(jf)
        with open(os.path.join(path, 'resources', 'acronyms.json')) as jf:
            self.acronyms = json.load(jf)
        with open(os.path.join(path, 'resources', 'currencies.json')) as jf:
            self.currencies = json.load(jf)
        with open(os.path.join(path, 'resources', 'domains.json')) as jf:
            self.domains = json.load(jf)

    def normalise(self, text):
        """Normalise text.

        The function covers numbers, email addresses, ascii characters, etc.

        Args:
            text (str): Input string

        Returns:
            textn (str): Normalised text
            tokens (list of tuples): List of tuples to track back normalisation

        Examples:
            >>> textn, tokens = tn.normalise("My email is, a@b.com.")
            tokens: (Original, Normalised, Display)
            my email is a at b dot com
            [('My', ['my'], 'My'), ('email', ['email'], 'email'),
            ('is,', ['is'], 'is'),
            ('a@b.com.', ['a', 'at', 'b', 'dot', 'com'], 'a@b.com')]
        """
        return self.__normalise(text)

    def normalise_file(self, path):
        """Normalise text from a file.

        The function covers numbers, email addresses, ascii characters, etc.

        Args:
            path (str): Path to a file

        Returns:
            textn (str): Normalised text, or None if the file does not exist
            tokens (list of tuples): List of tuples to track back normalisation,
                or None if the file does not exist

        Raises:
            Exception: If file cannot be read

        Examples:
            >>> textn = tn.normalise_file('./trans.txt')
        """
        try:
            if os.path.isfile(path):
                with codecs.open(path, encoding='utf-8') as f:
                    return self.__normalise(f.readline())
            else:
                return None, None
        except Exception as e:
            raise Exception('ERR Normalise_file: {}'.format(e))

    def __normalise(self, text):
        text = self.__prenormalise(text)
        tmp = []
        for idx, t in enumerate(text.split()):
            tmp.append((t, idx))
        original = copy.deepcopy(tmp)

        # Preprocessing
        tokens = self.__preprocess(tmp)
        
        # Convert to result format
        ret_text, ret_tokens = self.__generate_results(original, tokens)
        return ret_text, ret_tokens

    def __prenormalise(self, text):
        text = text.replace('\n', '').replace('\r', '')
        text = re.sub(r'\b\?\b', ' ', text)
        text = re.sub(r'\b!\b', ' ', text)
        text = re.sub(r'\b"\b', ' ', text)
        text = re.sub(r'\b--\b', ' ', text)

        # Add a space after commas that are not part of a number grouping
        chars = list(text)
        for i, c in enumerate(chars):
            if i < 1 or i > len(chars) - 2:
                continue
            if c == ',':
                if not (chars[i - 1].isnumeric() and
                        chars[i + 1].isnumeric()):
                    chars[i] = ', '
            text = ''.join(chars)
        return text

    def __preprocess(self, tokens):
        # Remove spaces and some special encoding
        for idx, t in enumerate(tokens):
            i = t[1]
            t = t[0]
            t = t.replace('&amp;', '&')

            hints = ('(Music)', '(Laughter)', '(Applause)')
            for hint in hints:
                t = t.replace(hint, '')

            del tokens[idx]
            tokens.insert(idx, (t.strip(), i))

        # Remove last dot
        if len(tokens):
            if tokens[-1][0].endswith('.'):
                i = tokens[-1][1]
                t = tokens[-1][0]
                del tokens[-1]
                tokens.append((t[:-1], i))

        return tokens
    
   
    def __rstrip(self, token):
        # Strip up to five trailing punctuation characters
        for i in range(5):
            if len(token):
                if token[-1] in (',', '.', ';', '!', '?', ':', '"'):
                    token = token[:-1]
                else:
                    break
        return token

    def __lstrip(self, token):
        # Strip up to five leading punctuation characters
        for i in range(5):
            if len(token):
                if token[0] in (',', '.', ';', '!', '?', ':', '"', "'"):
                    token = token[1:]
                else:
                    break
        return token


    def __generate_results(self, original, normalised):
        words = []
        for t in normalised:
            if len(t[0]):
                words.append(t[0])
        text = ' '.join(words)

        tokens = []
        if len(original):
            for t in original:
                idx = t[1]
                words = []
                for t2 in normalised:
                    if idx == t2[1]:
                        words.append(t2[0])
                display_text = self.__rstrip(t[0])
                display_text = self.__lstrip(display_text)
                tokens.append((t[0], words, display_text))
        else:
            tokens.append(('', '', ''))

        return text, tokens

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--textfile', type=str, required=True, help='input directory or file')
    args = parser.parse_args()
    tn = TextNormaliser(False)
    with open(args.textfile) as fd:
        lines = fd.readlines()
        for line in lines:
            line = line.strip()
            normalised, tokens = tn.normalise(line)
            print(normalised)
```