python – Downloading and parsing research papers

I am trying to write a script that fetches research papers from a website by calling its API and then traverses each paper sentence by sentence, applying some conditions.
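Roughly, the fetch step looks like this (a simplified sketch; the endpoint is the same Europe PMC REST API used in the full code below, and the paper id is just a placeholder):

```python
import requests
import xml.etree.ElementTree as ET

# Fetch one paper's XML record from the Europe PMC REST API and pull out
# the abstract text. The paper id is a placeholder for illustration.
paper_id = "PPR123456"  # placeholder
url = ('https://www.ebi.ac.uk/europepmc/webservices/rest/search'
       '?query=ext_id:' + paper_id + '&resultType=core&format=xml')
root = ET.fromstring(requests.get(url).content)
for abstractText in root.iter('abstractText'):
    print(abstractText.text)
```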

The paper is accessible in XML format. I am directly using nltk.sent_tokenize() to break the relevant part of the XML document into a list of sentences, and then I search every sentence for "diseases" using a regex (my dataframe contains 12,000 diseases). If a match is found, I search the same sentence for "biomarkers" (this dataframe has 20,000 rows; each row is a list with an average of 2 values, i.e. there are more than 40,000 values to match). Finally, if a disease and a biomarker are found in the same sentence, the result is saved to the database.
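In essence, the per-sentence matching works like this (a simplified sketch with toy stand-ins for my two dataframes):

```python
import re
import nltk

# Toy stand-ins for the disease and biomarker dataframes described above.
diseases = ["breast cancer", "melanoma"]
markers = ["BRCA1", "CA 15-3"]

abstract = "BRCA1 mutations are linked to breast cancer. Melanoma was not assessed."
for sentence in nltk.sent_tokenize(abstract):  # one sentence at a time
    for disease in diseases:
        if re.search(rf"\b{re.escape(disease)}\b", sentence, re.IGNORECASE):
            for marker in markers:
                if re.search(rf"\b{re.escape(marker)}\b", sentence):
                    # in the real script this pair is written to the database
                    print(disease, "+", marker, "->", sentence)
```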

In the worst case, the number of steps taken to accomplish the task would be 900,000 (papers to be traversed) × 15 (sentences per paper) × 12,000 (diseases to be searched) × 40,000 (markers to be searched).
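Multiplied out, that is on the order of 6.5 × 10^15 regex searches:

```python
# Worst-case number of regex searches implied by the figures above.
papers, sentences, diseases, markers = 900_000, 15, 12_000, 40_000
print(papers * sentences * diseases * markers)  # 6480000000000000, i.e. ~6.5e15
```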

At the moment the code parses 60–70 papers per hour, which is far too slow: at that rate, 900,000 papers would take well over a year.

The following is the code I have tried. I am looking for the bottlenecks in it; any suggestions to optimise the code would be highly appreciated.

```python
import math
import re
import timeit
import xml.etree.ElementTree as ET

import mysql.connector
import nltk
import pandas as pd
import requests

start = timeit.default_timer()
print("start_time:", start)
mydb = mysql.connector.connect(host="localhost", user="root", password="", database="biomarker")
mycursor = mydb.cursor()
mycursor.execute("DROP TABLE IF EXISTS papers_sentence_com_ppr")
mycursor.execute("create table papers_sentence_com_ppr (id INT AUTO_INCREMENT PRIMARY KEY, paper_id VARCHAR(255), firstPublicationDate VARCHAR(255), marker VARCHAR(255), marker_id VARCHAR(255), disease_name VARCHAR(255), AUTHORS TEXT, sentence TEXT)")
mycursor.execute("ALTER TABLE biomarker.papers_sentence_com_ppr CONVERT TO CHARACTER SET utf8")
#mycursor.execute("create table papers (id INT AUTO_INCREMENT PRIMARY KEY, paper_id VARCHAR(255), count VARCHAR(255), disease_name VARCHAR(255))")
df1 = pd.read_sql_query("select name from biomarker.disease2", mydb)  # 12,000 disease names

df = pd.read_sql_query("select * from biomarker.table_35", mydb)  # 20,000 biomarker rows
print(df1)

biomarker_txt = df.at[2, 'CA']
biomarker = biomarker_txt.split(" ")
print(len(biomarker))

# Europe PMC search query; urlb keeps the bare query so that a fresh
# cursorMark can be appended for each page of results.
urla = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=(%22biomarker%22%20OR%20%22biomarkers%22%20OR%20%22biological%20marker%22%20OR%20%22biological%20markers%22)%20%20AND%20%20(LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22)%20AND%20%20(HAS_ABSTRACT%3Ay)%20%20AND%20%20(SRC%3A%22PPR%22)&resultType=idlist&pageSize=1000&format=xml'
urlb = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=(%22biomarker%22%20OR%20%22biomarkers%22%20OR%20%22biological%20marker%22%20OR%20%22biological%20markers%22)%20%20AND%20%20(LANG%3A%22eng%22%20OR%20%22biomarkers%22%20OR%20%22biological%20marker%22%20OR%20%22biological%20markers%22)%20%20AND%20%20(LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22)%20AND%20%20(HAS_ABSTRACT%3Ay)%20%20AND%20%20(SRC%3A%22PPR%22)&resultType=idlist&pageSize=1000&format=xml'


array1 = []  # collected paper ids
re1 = requests.get(urla)
root = ET.fromstring(re1.content)
for hitCount in root.iter('hitCount'):
    hit_count = int(hitCount.text)
result_value = hit_count                # total number of matching papers
hit_count1 = hit_count / 1000
hit_count = math.ceil(hit_count1)       # number of 1000-id pages to fetch

hit_count = 10  # hard-coded override of the computed page count

counter1 = 0
counter3 = 0


for x in range(hit_count):

    re1 = requests.get(urla)
    root1 = ET.fromstring(re1.content)

    for id in root1.iter('id'):
        array1.append(id.text)
    for nextCursorMark in root1.iter('nextCursorMark'):
        counter3 = counter3 + 1
        print(counter3)
        urla = urlb + "&cursorMark=" + nextCursorMark.text  # url for the next page

for i in range(len(array1)):  # ~900,000 iterations in a full run, one per paper
    print("paper no:", i)
    paper_id = array1[i]  # array1 contains the list of paper ids
    print("paper_id:", paper_id)
    make_url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=ext_id:' + paper_id + '&resultType=core&format=xml'
    re2 = requests.get(make_url)
    root2 = ET.fromstring(re2.content)
    for abstractText in root2.iter('abstractText'):

        abstract_text_without_tags = re.sub(r"<[^>]*>", " ", abstractText.text)  # extract the relevant xml part

        nltk_tokens = nltk.sent_tokenize(abstract_text_without_tags)  # break the text into sentences

        for text in range(len(nltk_tokens)):  # depends on the length of the text

            for zz in range(len(df)):  # df has 20,000 rows
                biomarker_txt = df.at[zz, 'CA']
                biomarker = biomarker_txt.split(" ")  # a cell can hold more than one value, separated by spaces
                for tt in range(len(biomarker)):
                    if len(biomarker[tt]) > 2:
                        matches_for_marker = re.findall(rf"\b{re.escape(biomarker[tt])}\b", nltk_tokens[text])
                        if len(matches_for_marker) != 0:
                            for y in range(len(df1)):
                                disease_name = df1.at[y, 'name']

                                regex_for_dis = rf"\b{re.escape(disease_name)}\b"

                                matches_for_dis = re.findall(regex_for_dis, nltk_tokens[text], re.IGNORECASE | re.MULTILINE)

                                if len(matches_for_dis) != 0:

                                    for firstPublicationDate in root2.iter('firstPublicationDate'):
                                        firstPublicationDate = firstPublicationDate.text
                                    for authorString in root2.iter('authorString'):
                                        counter1 = counter1 + 1
                                    # authorString holds the last element from the loop above

                                    mycursor.execute("insert into papers_sentence_com_ppr (id, paper_id, firstPublicationDate, marker, marker_id, disease_name, AUTHORS, sentence) values (%s, %s, %s, %s, %s, %s, %s, %s)", (counter1, paper_id, firstPublicationDate, biomarker[tt], df.at[zz, 'Entry'], df1.at[y, 'name'], authorString.text, nltk_tokens[text]))
                                    #print("***********************************************************DATABASE_ENTRY*************************************************************\n")
                                    mydb.commit()
```