Better way to extract html table to dictionary using beautifulsoup in python

I am scraping HTML pages. Part of each page has a table that lists acts and the sections of those acts. For another project I need to convert them into a dictionary. The keys are already fixed (in the other project); I want to use those same keys and fill in the corresponding sections for each new input. The code I have designed works, but I am looking for a better way to write it, since at present it is quite lengthy. The code:

from bs4 import BeautifulSoup as bs, NavigableString

openFile = open('/some path/samplePage.html')
soup = bs(openFile, 'html.parser')

acts = soup.select('#act_table td:nth-of-type(1)')
sections = soup.select('#act_table td:nth-of-type(2)')
dictionary = {}


ipc = 'indian penal code'
poa = 'prevention of atrocities'
pcso = 'protection of children from sexual'
pcr = 'protection of civil rights'


if len(acts) < 1:
    print('no act mentioned')
elif len(acts) < 2:
    act1 = tuple(acts[0].contents)
    sections1 = tuple(sections[0].contents)
elif len(acts) < 3:
    act1 = tuple(acts[0].contents)
    sections1 = tuple(sections[0].contents)
    act2 = tuple(acts[1].contents)
    sections2 = tuple(sections[1].contents)
elif len(acts) < 4:
    act1 = tuple(acts[0].contents)
    sections1 = tuple(sections[0].contents)
    act2 = tuple(acts[1].contents)
    sections2 = tuple(sections[1].contents)
    act3 = tuple(acts[2].contents)
    sections3 = tuple(sections[2].contents)
elif len(acts) < 5:
    act1 = tuple(acts[0].contents)
    sections1 = tuple(sections[0].contents)
    act2 = tuple(acts[1].contents)
    sections2 = tuple(sections[1].contents)
    act3 = tuple(acts[2].contents)
    sections3 = tuple(sections[2].contents)
    act4 = tuple(acts[3].contents)
    sections4 = tuple(sections[3].contents)
else:
    act1 = tuple(acts[0].contents)
    sections1 = tuple(sections[0].contents)
    act2 = tuple(acts[1].contents)
    sections2 = tuple(sections[1].contents)
    act3 = tuple(acts[2].contents)
    sections3 = tuple(sections[2].contents)
    act4 = tuple(acts[3].contents)
    sections4 = tuple(sections[3].contents)
    act5 = tuple(acts[4].contents)


if len(acts) == 0:
    pass
# for first act in list
elif len(acts) == 1:
    if ipc in str(act1).lower():
        dictionary['IPC'] = sections1
    elif poa in str(act1).lower():
        dictionary['PoA'] = sections1
    elif pcso in str(act1).lower():
        dictionary['PCSO'] = sections1
    elif pcr in str(act1).lower():
        dictionary['PCR'] = sections1
    else:
        dictionary['Any Other Act'] = str(act1).lower()
    print(dictionary)

# for 2nd act in list
elif len(acts) == 2:
    if ipc in str(act1).lower():
        dictionary['IPC'] = sections1
    elif poa in str(act1).lower():
        dictionary['PoA'] = sections1
    elif pcso in str(act1).lower():
        dictionary['PCSO'] = sections1
    else:
        dictionary['Any Other Act'] = str(act1).lower()
    if ipc in str(act2).lower():
        dictionary['IPC'] = sections2
    elif poa in str(act2).lower():
        dictionary['PoA'] = sections2
    elif pcso in str(act2).lower():
        dictionary['PCSO'] = sections2
    else:
        dictionary['Any Other Act'] = act2
    print(dictionary)
# for 3rd act in list
elif len(acts) == 3:
    if ipc in str(act1).lower():
        dictionary['IPC'] = sections1
    elif poa in str(act1).lower():
        dictionary['PoA'] = sections1
    elif pcso in str(act1).lower():
        dictionary['PCSO'] = sections1
    elif pcr in str(act1).lower():
        dictionary['PCR'] = sections1
    else:
        dictionary['Any Other Act'] = str(act1).lower()
    if ipc in str(act2).lower():
        dictionary['IPC'] = sections2
    elif poa in str(act2).lower():
        dictionary['PoA'] = sections2
    elif pcso in str(act2).lower():
        dictionary['PCSO'] = sections2
    elif pcr in str(act2).lower():
        dictionary['PCR'] = sections2
    else:
        dictionary['Any Other Act'] = act2
    # for 3rd option
    if ipc in str(act3).lower():
        dictionary['IPC'] = sections3
    elif poa in str(act3).lower():
        dictionary['PoA'] = sections3
    elif pcso in str(act3).lower():
        dictionary['PCSO'] = sections3
    elif pcr in str(act3).lower():
        dictionary['PCR'] = sections3
    else:
        dictionary['Any Other Act'] = act3
    print(dictionary)
    # for 4th act in list
elif len(acts) == 4:
    if ipc in str(act1).lower():
        dictionary['IPC'] = sections1
    elif poa in str(act1).lower():
        dictionary['PoA'] = sections1
    elif pcso in str(act1).lower():
        dictionary['PCSO'] = sections1
    elif pcr in str(act1).lower():
        dictionary['PCR'] = sections1
    else:
        dictionary['Any Other Act'] = str(act1).lower()
    if ipc in str(act2).lower():
        dictionary['IPC'] = sections2
    elif poa in str(act2).lower():
        dictionary['PoA'] = sections2
    elif pcso in str(act2).lower():
        dictionary['PCSO'] = sections2
    elif pcr in str(act2).lower():
        dictionary['PCR'] = sections2
    else:
        dictionary['Any Other Act'] = act2
    # for 3rd option
    if ipc in str(act3).lower():
        dictionary['IPC'] = sections3
    elif poa in str(act3).lower():
        dictionary['PoA'] = sections3
    elif pcso in str(act3).lower():
        dictionary['PCSO'] = sections3
    elif pcr in str(act3).lower():
        dictionary['PCR'] = sections3
    else:
        dictionary['Any Other Act'] = act3
    # 4th Option
    if ipc in str(act4).lower():
        dictionary['IPC'] = sections4
    elif poa in str(act4).lower():
        dictionary['PoA'] = sections4
    elif pcso in str(act4).lower():
        dictionary['PCSO'] = sections4
    elif pcr in str(act4).lower():
        dictionary['PCR'] = sections4
    else:
        dictionary['Any Other Act'] = act4
elif len(acts) == 5:
    if ipc in str(act1).lower():
        dictionary['IPC'] = sections1
    elif poa in str(act1).lower():
        dictionary['PoA'] = sections1
    elif pcso in str(act1).lower():
        dictionary['PCSO'] = sections1
    elif pcr in str(act1).lower():
        dictionary['PCR'] = sections1
    else:
        dictionary['Any Other Act'] = str(act1).lower()
    if ipc in str(act2).lower():
        dictionary['IPC'] = sections2
    elif poa in str(act2).lower():
        dictionary['PoA'] = sections2
    elif pcso in str(act2).lower():
        dictionary['PCSO'] = sections2
    elif pcr in str(act2).lower():
        dictionary['PCR'] = sections2
    else:
        dictionary['Any Other Act'] = act2
    # for 3rd option
    if ipc in str(act3).lower():
        dictionary['IPC'] = sections3
    elif poa in str(act3).lower():
        dictionary['PoA'] = sections3
    elif pcso in str(act3).lower():
        dictionary['PCSO'] = sections3
    elif pcr in str(act3).lower():
        dictionary['PCR'] = sections3
    else:
        dictionary['Any Other Act'] = act3
    # 4th Option
    if ipc in str(act4).lower():
        dictionary['IPC'] = sections4
    elif poa in str(act4).lower():
        dictionary['PoA'] = sections4
    elif pcso in str(act4).lower():
        dictionary['PCSO'] = sections4
    elif pcr in str(act4).lower():
        dictionary['PCR'] = sections4
    else:
        dictionary['Any Other Act'] = act4
print(dictionary)

The HTML code of one of the files is here:

link to the source code
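
One possible way to shorten this, sketched under the assumption that the act and section cells always pair up row by row; the keyword-to-key mapping simply reuses the strings defined in the question's code:

from bs4 import BeautifulSoup as bs

# keyword fragments mapped to the preset dictionary keys used above
KEYWORD_TO_KEY = {
    'indian penal code': 'IPC',
    'prevention of atrocities': 'PoA',
    'protection of children from sexual': 'PCSO',
    'protection of civil rights': 'PCR',
}

with open('/some path/samplePage.html') as f:
    soup = bs(f, 'html.parser')

acts = soup.select('#act_table td:nth-of-type(1)')
sections = soup.select('#act_table td:nth-of-type(2)')

dictionary = {}
for act, section in zip(acts, sections):
    act_text = act.get_text(strip=True).lower()
    # pick the first preset key whose keyword appears in the act name,
    # otherwise fall back to 'Any Other Act'
    key = next((k for phrase, k in KEYWORD_TO_KEY.items() if phrase in act_text),
               'Any Other Act')
    dictionary[key] = tuple(section.contents) if key != 'Any Other Act' else act_text

print(dictionary)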

Web scraping – I need to extract a data value from Google using BeautifulSoup in Python

If you go to this link:
"https://www.google.com/search?q=usd+to+clp&rlz=1C1CHBD_esVE816VE816&oq=usd+to+clp&aqs=chrome..69i57j35i39j0l2j69i60l4.1479j0j7&sourceid=chrome&"

You can see the exchange rate between USD and CLP (Chilean Peso).

I want to extract this number …

I've already tried the following (and many other things):

import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.google.com/search?q=usd+to+clp&rlz=1C1CHBD_esVE816VE816&oq=usd+to+clp&aqs=chrome..69i57j35i39j0l2j69i60l4.1479j0j7&sourceid=chrome&ie=UTF-8")

src = result.content
soup = BeautifulSoup(src, features="html.parser")
value = soup.find('span', class_="DFlfde.SwHCTb").get_text()
print(value)

It would be excellent if someone could help me. Thank you.

PS: Yes, I am a newcomer, sorry :P
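
For reference, class_="DFlfde.SwHCTb" asks BeautifulSoup for a class named exactly DFlfde.SwHCTb, while the element carries two separate classes, so a CSS selector is the easier match; Google also often returns simpler markup when the request sends no browser-like User-Agent. A minimal sketch, assuming those class names are still what the result page uses:

import requests
from bs4 import BeautifulSoup

# a browser-like User-Agent so Google returns the full result page
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
result = requests.get("https://www.google.com/search?q=usd+to+clp", headers=headers)

soup = BeautifulSoup(result.content, "html.parser")
# "DFlfde SwHCTb" is two classes, so match them with a CSS selector
value_tag = soup.select_one("span.DFlfde.SwHCTb")
print(value_tag.get_text() if value_tag else "rate element not found; the markup may have changed")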

html – Extract a post ID with BeautifulSoup

Here's the deal: I'm working on a college project whose goal is to pull news from different sections of a website. I chose the Globo site because it has a very consistent post structure. I use BeautifulSoup for that, which is wonderful, but I also have to extract the ID of each post, since I will store the posts in a database and, as each post has a distinct ID, I will use it to avoid saving duplicates.

Every post on the Globo site belongs to this class: (Feed-Post-BSTN-Article-Form-Type-Materia-ID = "68d33d6e4dd5a282175a68e80c833154"), and it is the Materia-ID that has to be extracted. However, if I search for this class with BeautifulSoup, the ID is not included in the result. I looked around, but everything I found assumed a class and a fixed parent, and this parent varies from post to post, so I cannot rely on it. Do you have an idea how to get this ID? I'll leave a more specific example so that you can get an idea of the structure of the website. An example of a page I use to locate the post ID is: https://g1.globo.com/educacao/index/feed/page-100.ghtml. The ID I want to extract is the one shown in the snippet above.
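
If the ID is stored in an attribute of the post element rather than in its text, BeautifulSoup exposes it through dictionary-style access on the tag. A small hypothetical sketch follows; the attribute name data-materia-id is only a placeholder, so inspect the real feed markup for the exact attribute that carries the ID:

import requests
from bs4 import BeautifulSoup

page = requests.get("https://g1.globo.com/educacao/index/feed/page-100.ghtml")
soup = BeautifulSoup(page.text, "html.parser")

ids = []
# match every tag that carries the (hypothetical) id-bearing attribute, whatever its class
for post in soup.find_all(attrs={"data-materia-id": True}):
    ids.append(post["data-materia-id"])  # tag attributes are read like dictionary keys

print(ids)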

Pull href tags with BeautifulSoup in Python

As if that were not extremely obvious, I'm a new programmer.

But my code is, in my opinion, far from the most ideal representation. I am also not sure whether there are unnecessary parts, because I worked on this on and off for a few days, so I started and stopped several times.

The goal is clean code with a # comment on each line (unless it is extremely basic) to improve my commenting habits. Please let me know what you would do overall to improve it. Many thanks!

import urllib.request as ur
from bs4 import BeautifulSoup


url = str(input('enter URL: '))            # converting the input to a string
html = ur.urlopen(url).read()              # read the html
soup = BeautifulSoup(html, "html.parser")  # get all anchor tags
count_ = int(input('Enter count: '))       # converts the input to an integer
pos_1 = int(input('Enter position: '))     # converts the input to an integer
tags = soup('a')
final = ''                                 # last url in the list before the break
curpos = ''
print('Retrieving:', url)                  # prints the start point / URL
count = int(count_) + 1
while count > 1:                           # loop that runs while count is greater than 1
    pos = 0
    for tag in tags:
        if pos == int(pos_1) - 1:          # conditional statement for position
            curpos = tag.get('href', None)
            break
        pos = pos + 1                      # increases the pos value for each tag
    final = curpos
    url = str(curpos)
    html = ur.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    tags = soup('a')
    count = count - 1                      # for each iteration of the loop, 1 is subtracted from count
    print('Retrieving:', final)
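
For comparison, a shorter sketch of the same loop; the helper name get_links is not from the original code, and there is no error handling for pages with fewer links than the requested position:

import urllib.request as ur
from bs4 import BeautifulSoup


def get_links(url):
    # return all anchor tags found on the page at url
    html = ur.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    return soup("a")


url = input('enter URL: ')
count = int(input('Enter count: '))
position = int(input('Enter position: '))

print('Retrieving:', url)
for _ in range(count):
    # follow the link at the requested (1-based) position on the current page
    url = get_links(url)[position - 1].get('href')
    print('Retrieving:', url)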

python – Cleaner method of appending data to the list in BeautifulSoup

So I have tried different methods of getting data from different websites, such as using JSON or BeautifulSoup. Currently I have written a scraper to store data such as [{Title, Description, Replies, Topic_Starter, Total_Views}], but it has pretty much no reusable code. I haven't figured out how to correct my approach of appending data to a single list for simplicity and reusability, and I have pretty much hit a wall with my current skills.

from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep


url = 'https://forum.lowyat.net/ReviewsandGuides'

list_topic = []
list_description = []
list_replies = []
list_topicStarted = []
list_totalViews = []


def getContentFromURL(_url):
    try:
        response = get(_url)
        html_soup = BeautifulSoup(response.text, 'lxml')
        return html_soup
    except Exception as e:
        print('Error.getContentFromURL:', e)
        return None


def iterateThroughPages(_lastindexpost, _postperpage, _url):
    indices = '/+'
    index = 0
    for i in range(index, _lastindexpost):
        print('retrieving data from ' + url)
        try:
            extractDataFromRow1(getContentFromURL(_url))
            extractDataFromRow2(getContentFromURL(_url))
            print('current page index is: ' + str(index))
            print(_url)
            while i <= _lastindexpost:
                for table in get(_url):
                    if table != None:
                        new_getPostPerPage = i + _postperpage
                        newlink = f'{url}{indices}{new_getPostPerPage}'
                        print(newlink)
                        bs_link = getContentFromURL(newlink)
                        extractDataFromRow1(bs_link)
                        extractDataFromRow2(bs_link)
                        # threading to prevent spam. Waits 0.5 secs before executing
                        sleep(0.5)
                    i += _postperpage
                    print('current page index is: ' + str(i))
                    if i > _lastindexpost:
                        # if i goes past the input page (e.g. 1770), it will stop
                        print('no more posts available to retrieve')
                        return
        except Exception as e:
            print('Error.iterateThroughPages:', e)
            return None


def extractDataFromRow1(_url):
    try:
        for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
            # get data from the topic title in the table cell
            topic = container.select_one(
                'a[href^="/topic/"]').text.replace('\n', '')
            description = container.select_one(
                'div.desc').text.replace('\n', '')
            if topic or description is not None:
                dict_topic = topic
                dict_description = description
                if dict_description is '':
                    dict_description = 'No data'
                    # list_description.append(dict_description)
                    # so no empty string
                list_topic.append(dict_topic)
                list_description.append(dict_description)
            else:
                None
    except Exception as e:
        print('Error.extractDataFromRow1:', e)
        return None


def extractDataFromRow2(_url):
    try:
        for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
            replies = container.select_one('td:nth-of-type(4)').text.strip()
            topic_started = container.select_one(
                'td:nth-of-type(5)').text.strip()
            total_views = container.select_one(
                'td:nth-of-type(6)').text.strip()
            if replies or topic_started or total_views is not None:
                dict_replies = replies
                dict_topicStarted = topic_started
                dict_totalViews = total_views
                if dict_replies is '':
                    dict_replies = 'No data'
                elif dict_topicStarted is '':
                    dict_topicStarted = 'No data'
                elif dict_totalViews is '':
                    dict_totalViews = 'No data'
                list_replies.append(dict_replies)
                list_topicStarted.append(dict_topicStarted)
                list_totalViews.append(dict_totalViews)
            else:
                print('no data')
                None
    except Exception as e:
        print('Error.extractDataFromRow2:', e)
        return None


# limit to 1740
print(iterateThroughPages(1740, 30, url))
new_panda = pd.DataFrame(
    {'Title': list_topic, 'Description': list_description,
     'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
print(new_panda)

I am sure that my use of try at this point is also superfluous, as are some of the variables I declared, and the while and for loops are most likely misused.
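
One common cleanup for the five parallel lists is to build one dictionary per row and collect them in a single list, which pandas can turn into a DataFrame directly. Below is a minimal sketch for a single page, assuming the question's CSS selectors still match the forum markup; the helper names cell_text and parse_rows are not from the original code:

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://forum.lowyat.net/ReviewsandGuides'


def cell_text(row, selector):
    # return the stripped text of the first match, or a placeholder when missing
    cell = row.select_one(selector)
    return cell.get_text(strip=True) if cell else 'No data'


def parse_rows(soup):
    # yield one record per topic row; selectors copied from the question
    for row in soup.select('table[cellspacing="1"] > tr')[2:32]:
        yield {
            'Title': cell_text(row, 'a[href^="/topic/"]'),
            'Description': cell_text(row, 'div.desc'),
            'Replies': cell_text(row, 'td:nth-of-type(4)'),
            'Topic Starter': cell_text(row, 'td:nth-of-type(5)'),
            'Total Views': cell_text(row, 'td:nth-of-type(6)'),
        }


records = []
soup = BeautifulSoup(get(URL).text, 'html.parser')
records.extend(parse_rows(soup))  # one list of dicts instead of five parallel lists

df = pd.DataFrame(records)
print(df)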