web scraping – How do I extract all the comments from an Instagram profile into an Excel file?

Hello!

I am running a scraper to extract the comments of public Instagram profiles into an Excel file. However, the code I am using writes each comment to a text file, organized into folders by post and account.

I need a final Excel file with the columns (account, commenting account, and comment).

The code I am using is this one: https://github.com/hehpollon/Instagram-Crawler

Thank you very much!

import time
import os
import requests
from utils.browser import Browser
from docopt import docopt
from tqdm import tqdm
import html

def downloadImage(imageUrl, imagePath):
    img_data = requests.get(imageUrl).content
    with open(imagePath, 'wb') as handler:
        handler.write(img_data)

def writeToFile(filePath, data):
    file = open(filePath, 'w', encoding='utf8')
    for i in data:
        if type(i) is list:
            i = "n".join(i)
        try:
            file.write(str(i)+"\n")
        except Exception as e:
            pass
    file.close()

def makeDir(dirPath):
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)
    else:
        if len(os.listdir(dirPath)) == 3:
            return False
    return True

def extractLang(data):
    result = ""
    try:
        result = data.split('lang="')[1].split('"')[0]
    except Exception as e:
        pass
    return result

def extractLikes(data, lang="en"):
    result = ""
    try:
        if lang == "en":
            result = data[0][1:]
        else:
            result = data[1][:-2]
    except Exception as e:
        pass
        result = ""
    return result

def extractComments(data, lang="en"):
    result = ""
    try:
        if lang == "en":
            result = data[2]
        else:
            result = data[3][:-1]
    except Exception as e:
        pass
        result = ""
    return result

def extractDateTime(data):
    result = ""
    try:
        result = data.split('datetime="')[1].split('"')[0]
    except Exception as e:
        pass
        result = ""
    return result

def extractCommentsMessage(data):
    results = []
    try:
        sp = data.split("sqdOP yWX7d     _8A5w5   ZIAjV")
        if len(sp) > 2:
            for i in range(len(sp)):
                if i > 1:
                    name = sp[i].split(">")[1].split("<")[0]
                    message = sp[i].split(">")[5].split("<")[0]
                    results.append(name+": "+message)
    except Exception as e:
        pass
        results = []
    return results

def extractCaption(data):
    result = ""
    try:
        splitData = data.split('<img alt="')
        if len(splitData) > 1:
            result = splitData[1].split('"')[0]
        else:
            # only english?
            result = data.split('{"node":{"text":"')(1).split('"}')(0)
            result = result.encode('utf-8').decode('unicode-escape')
    except Exception as e:
        pass
        result = ""
    return result

def runCrawl(limitNum = 0, queryList = [], is_all_comments=False):
    browser = Browser("driver/chromedriver")
    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/"+query)
        mUrl = ""
        if query(0) == "#":
            mUrl = "https://www.instagram.com/explore/tags/"+query(1:)+"/?hl=en"
        else:
            mUrl = "https://www.instagram.com/"+query+"/?hl=en"
        browser.goToPage(mUrl)
        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scoll collecting!")

        print("collecting data...")
        slist = list(set(browser.urlList))
        for url in tqdm(slist):
            dirName = url.split("/")[4]
            # skip if already crawled 
            if not makeDir("data/"+query+"/"+dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/"+query+"/"+dirName+"/raw.html", (cur))
            infoData = cur.split("<meta content=")[1].split(" ")
            # extract data
            lang = extractLang(cur)
            likes = extractLikes(infoData, lang)
            comments = extractComments(infoData, lang)
            caption = extractCaption(cur)
            dateTime = extractDateTime(cur)
            commentMessages = extractCommentsMessage(cur)
            # print("likes:",likes," comments:", comments," caption:", caption, 
            #     "commentMessages:", commentMessages, "dateTime:", dateTime)
            writeToFile(
                "data/"+query+"/"+dirName+"/info.txt", 
                [
                    "likes: ", likes, "",
                    "comments: ", comments, "",
                    "caption: ", caption, "",
                    "commentMessages: ", commentMessages, "",
                    "dateTime: ", dateTime, ""
                ]
            )
            # download image
            imageUrl = html.unescape(cur.split('meta property="og:image" content="')[1].split('"')[0])
            downloadImage(imageUrl,"data/"+query+"/"+dirName+"/image.jpg")
            time.sleep(1)
        print("query " + query + " collecting finish")

    time.sleep(2)
    browser.driver.quit()
    print("FINISH!")

def main():
    args = docopt("""
    Usage:
        crawl.py [-q QUERY] [-n NUMBER] [--a] [-h HELP]
    
    Options:
        -q QUERY  username, add '#' to search for hashtags, e.g. 'username' or '#hashtag'
                  For multiple queries, separate them with commas, e.g. 'username1, username2, #hashtag'
        -n NUM    number of returned posts (default: 1000)
        --a       collect all comments
        -h HELP   show this help message and exit
    """)
    hasChromeDriver = False
    for i in os.listdir("./driver"):
        if "chromedriver" in i:
            hasChromeDriver = True
            break
    if not hasChromeDriver:
        print("ERROR! NO 'chromedriver' Found")
        print("Please install chromedriver at https://sites.google.com/a/chromium.org/chromedriver/")
        return
    limitNum = int(args.get('-n') or 1000)
    query = args.get('-q', "")
    is_all_comments = args.get('--a', False)
    if not query:
        print('Please input query!')
    else:
        queryList = query.replace(" ","").split(",")
        runCrawl(limitNum=limitNum, queryList=queryList, is_all_comments=is_all_comments)

main()
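
For the Excel part, this is a rough sketch of the aggregation step I have in mind (my own assumption about how to do it, not part of the original crawler). It walks the data/<account>/<post>/info.txt files written by the script above, pulls the "commenter: comment" lines that follow the "commentMessages:" marker, and writes one row per comment with pandas (openpyxl must be installed for .xlsx output):

import os
import pandas as pd


def aggregate_comments(data_dir="data", output_file="comments.xlsx"):
    rows = []
    for account in os.listdir(data_dir):
        account_dir = os.path.join(data_dir, account)
        if not os.path.isdir(account_dir):
            continue
        for post in os.listdir(account_dir):
            info_path = os.path.join(account_dir, post, "info.txt")
            if not os.path.isfile(info_path):
                continue
            with open(info_path, encoding="utf8") as fh:
                lines = fh.read().splitlines()
            # locate the block written after the "commentMessages:" marker
            start = next((i + 1 for i, line in enumerate(lines)
                          if line.startswith("commentMessages:")), None)
            if start is None:
                continue
            for line in lines[start:]:
                if not line.strip() or line.startswith("dateTime:"):
                    break  # a blank line or the next field ends the comment block
                commenter, _, comment = line.partition(": ")
                rows.append({"account": account,
                             "commenter": commenter,
                             "comment": comment})
    pd.DataFrame(rows, columns=["account", "commenter", "comment"]).to_excel(
        output_file, index=False)


aggregate_comments()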

node.js – Web Scraping in Node.js – The processes pause and do not resume

I need help!

I built a web scraper that collects information and stores it in a database. I run the same scraper in several processes (each one collects different information).

In the middle of a run the system seems to pause; sometimes it starts working again, other times it never comes back…

Does anyone know how to work around this problem?

Note: it was developed in Node.js.

Thank you!

python – Scraping webelements if found

I'm currently working on a web scrape, which in my case is Stack Overflow, but over time it will become a template for other sites.

This is what I currently have:

from typing import Optional

import attr
import requests
from bs4 import BeautifulSoup
from loguru import logger
from requests import RequestException
from requests.exceptions import (
    ConnectionError,
    ConnectTimeout,
    ProxyError,
    ReadTimeout,
    ChunkedEncodingError,
    SSLError,
    Timeout
)


@attr.dataclass
class Data:
    """Scraped info about product Considering to have dataclass on a seperated py file for easier maintaining"""
    store: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    price: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)


class Product:
    def from_page(url: str) -> Data:
        with requests.get(url) as response:
            if response.status_code == 404:
                return Data(store="Shelta")
            if response.ok:
                doc = BeautifulSoup(response.text, 'html.parser')
            else:
                response.raise_for_status()

        name = doc.select_one('h1.product-page-header')
        price = doc.select_one('span.price')
        image = doc.select_one('meta[property="og:image"]')

        return Data(
            store="Shelta",
            name=name.text.strip() if name else '',
            price=price.text.strip() if price else '',
            image=image['content'] if image else '',
        )


def main():
    while True:
        try:
            product = Product.from_page(url="https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")
            if product is None:
                logger.info('No new payload')
            else:
                logger.info(f'New payload: {product}')
                break
        except (
                ReadTimeout,
                Timeout,
                ConnectTimeout,
                ConnectionError,
                ChunkedEncodingError,
                SSLError,
                ProxyError
        ) as err:
            logger.info(
                f"{type(err).__name__} at line {err.__traceback__.tb_lineno} of {__file__}: {err}")
            continue

        except RequestException as err:
            logger.exception(
                f"{type(err).__name__} at line {err.__traceback__.tb_lineno} of {__file__},: {err}")
            raise RequestException from err

        except Exception as err:
            logger.exception(
                f"{type(err).__name__} at line {err.__traceback__.tb_lineno} of {__file__},: {err}")
            raise Exception from err


if __name__ == '__main__':
    main()

Later on I will return the result and do some things with it, but that is not the point of the review I am looking for. The review I am looking for concerns whether there are any improvements I can make when it comes to scraping data. Basically, if we find the web element and the text/href/info that needs to be scraped, we want to apply that to the dataclass and later on return it.

Is there anything I can improve when it comes to scraping the correct data, so that if something is not found it simply falls back to the default value from the dataclass?
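
One pattern I have been considering is a pair of small helpers around select_one that keep the fallback logic in one place — only a sketch, with the selectors from from_page repeated in the usage comment:

from bs4 import BeautifulSoup


def select_text(doc: BeautifulSoup, selector: str, default: str = '') -> str:
    """Stripped text of the first match, or the default when nothing matches."""
    tag = doc.select_one(selector)
    return tag.text.strip() if tag else default


def select_attr(doc: BeautifulSoup, selector: str, attr: str,
                default: str = '') -> str:
    """An attribute of the first match, or the default when nothing matches."""
    tag = doc.select_one(selector)
    return tag.get(attr, default) if tag else default


# Usage with the selectors from from_page above:
# name  = select_text(doc, 'h1.product-page-header')
# price = select_text(doc, 'span.price')
# image = select_attr(doc, 'meta[property="og:image"]', 'content')

With these, the ternaries in from_page collapse into plain keyword arguments, and a missing element falls back to the empty-string default in a single place.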

python – Scraping OddsPortal with requests only

This is a scraper written to do most of what had been attempted by another user in this question:

How can I optimise this webscraping code

I did the rewrite because I felt bad that the new user didn’t get an opportunity to talk about their code (even though the off-topic closure was correct). Also the site, as with so many others, is badly designed enough to be an interesting challenge. The design objective of this code is to scrape an arbitrary number of years and pages from OddsPortal without needing to involve a browser (i.e. Selenium).

On balance, there are advantages and disadvantages to using Requests: by losing the layer of abstraction provided by Selenium I gain speed, but the code is probably also more fragile, given some of the hoops I have to jump through to decode their timestamp and odds formats. Any feedback is welcome.

import json
import re
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pprint import pprint
from typing import Dict, Any, Tuple, Optional, Iterable
from urllib.parse import urljoin

from requests import Session
from bs4 import BeautifulSoup, Tag


BASE = 'https://www.oddsportal.com'
TERRIBLE_TIMESTAMP_PAT = re.compile(r'^t(\d+)-')


def decode_terrible_timestamp_class(tag: Tag) -> datetime:
    for cls in tag['class']:
        match = TERRIBLE_TIMESTAMP_PAT.search(cls)
        if match:
            break
    else:
        raise ValueError(f'{tag} does not seem to contain a valid timestamp')

    stamp = int(match[1])
    return datetime.fromtimestamp(stamp, tz=timezone.utc)


# Refer to https://www.oddsportal.com/res/x/global-210609145530.js
# this.d = function (str)
ODDS_TABLE = str.maketrans(
    'axcteopzf',
    '1234567.|',
)


def decode_terrible_odds(decorated: Tag) -> Tuple[float, Optional[float]]:
    odds = decorated['xodd'].translate(ODDS_TABLE).split('|')

    if len(odds) == 1:
        return float(odds[0]), None

    preferred, normal = odds
    return float(normal), float(preferred)


@dataclass
class GameData:
    sport: str
    sport_path: str
    country: str
    country_path: str
    season: str
    season_path: str

    when: datetime
    path: str

    home_team: str
    away_team: str
    home_team_won: bool
    away_team_won: bool
    home_score: int
    away_score: int

    home_odds: float
    home_odds_preferred: float
    draw_odds: float
    draw_odds_preferred: float
    away_odds: float
    away_odds_preferred: float

    bookmakers: int

    @classmethod
    def from_tags(
        cls,
        sport: Tag,
        country: Tag,
        season: Tag,
        date_span: Tag,
        time: Tag,
        teams: Tag,
        score: Tag,
        home_odds: Tag,
        draw_odds: Tag,
        away_odds: Tag,
        bookmakers: Tag,
    ):
        home_score, away_score = score.text.split(':')

        when = datetime.combine(
            decode_terrible_timestamp_class(date_span).date(),
            decode_terrible_timestamp_class(time).time(),
            tzinfo=timezone.utc,
        )

        team_anchor = teams.find('a')
        any_span = team_anchor.find('span')
        if any_span:
            home_team, away_team = (
                t.text if isinstance(t, Tag) else t.strip('- ')
                for t in team_anchor.children
            )
            home_team_won, away_team_won = (
                isinstance(t, Tag)
                for t in team_anchor.children
            )
        else:
            home_team, away_team = team_anchor.text.split('-')
            home_team_won, away_team_won = False, False

        home_odds_norm, home_odds_pref = decode_terrible_odds(home_odds)
        draw_odds_norm, draw_odds_pref = decode_terrible_odds(draw_odds)
        away_odds_norm, away_odds_pref = decode_terrible_odds(away_odds)

        game = cls(
            sport=sport.text.strip(),
            sport_path=sport['href'],
            country=country.text.strip(),
            country_path=country['href'],
            season=season.text.strip(),
            season_path=season['href'],
            when=when,
            path=team_anchor['href'],
            home_team=home_team,
            away_team=away_team,
            home_team_won=home_team_won,
            away_team_won=away_team_won,
            home_score=int(home_score),
            away_score=int(away_score),
            home_odds=home_odds_norm,
            home_odds_preferred=home_odds_pref,
            draw_odds=draw_odds_norm,
            draw_odds_preferred=draw_odds_pref,
            away_odds=away_odds_norm,
            away_odds_preferred=away_odds_pref,
            bookmakers=int(bookmakers.text),
        )
        return game


def start_search(session: Session, year: int) -> Dict[str, Any]:
    url = urljoin(BASE, f'/soccer/england/premier-league-{year}-{year+1}/results/')
    with session.get(url) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'lxml')

    script_pat = re.compile(r'new PageTournament\(')
    for script in doc.select('body > script'):
        match = script_pat.search(script.string)
        if match:
            break
    else:
        raise ValueError('ID script not found')

    start = match.end()
    end = script.string.find('}', start) + 1
    json_str = script.string[start:end]
    ids = json.loads(json_str)
    return ids


def get_page(session: Session, sid: int, id: str, page: int) -> Iterable[GameData]:
    bookie_hash = 'X0'
    use_premium = 1
    timezone_offset = 0

    archive_url = urljoin(
        BASE,
        f'/ajax-sport-country-tournament-archive'
        f'/{sid}'
        f'/{id}'
        f'/{bookie_hash}'
        f'/{use_premium}'
        f'/{timezone_offset}'
        f'/{page}'
        f'/'
    )

    with session.get(archive_url) as resp:
        resp.raise_for_status()
        start = resp.text.find('{')
        end = resp.text.rfind('}') + 1
        json_str = resp.text[start:end]

    html = json.loads(json_str)['d']['html']
    doc = BeautifulSoup(html, 'lxml')

    head = doc.find('th')
    sport, country, season = head.find_all('a')

    for tr in doc.find_all('tr'):
        date_header = tr.select_one('th[colspan="3"]')
        if date_header:
            date_span = date_header.find('span')
            continue

        if tr.select_one('td.table-time'):
            yield GameData.from_tags(
                sport, country, season, date_span,
                *tr.find_all('td'),
            )


def main():
    with Session() as session:
        session.headers = {'User-Agent': 'Mozilla/5.0'}

        ids = start_search(session, year=2020)

        for page in range(1, 3):
            for game in get_page(session, ids['sid'], ids['id'], page):
                pprint(asdict(game))


if __name__ == '__main__':
    main()
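
As a quick sanity check of the two decoders, here is a small example with fabricated markup (the class and xodd values are made up to match the format the functions expect, not copied from OddsPortal); it is meant to be run in the same module as the code above:

from bs4 import BeautifulSoup

# Fabricated markup in the shape the decoders expect: an epoch timestamp
# hidden in a "t<seconds>-..." class, and odds written with the substitution
# alphabet that ODDS_TABLE reverses.
sample = BeautifulSoup(
    '<span class="t1623247200-1-1-0"></span>'
    '<table><tr><td xodd="azoc"></td></tr></table>',
    'lxml',
)

print(decode_terrible_timestamp_class(sample.find('span')))
# 2021-06-09 14:00:00+00:00
print(decode_terrible_odds(sample.find('td')))
# (1.63, None)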

r – Web scraping with rvest – Problem scraping the data (mainly the links)

I am trying to scrape the Web of Science site, but I am running into problems scraping links from it.

My intention is to scrape the article titles and the links that lead to each article's page inside Web of Science, so that I can then scrape other data such as the abstract, keywords, and so on. Finally, I want to loop over the result pages so as to scrape this information up to the last page of the search.

I started with the following code:

library(rvest)
library(dplyr)

link <- "https://apps.webofknowledge.com/Search.do?product=WOS&SID=5Bzr6AeuFKanEWXAFWh&search_mode=GeneralSearch&prID=45752f34-12a3-474b-8fcf-6b21a2196ed7"

page <- read_html(link)

titulo_artigo <- page %>%
  html_nodes(".snowplow-full-record value") %>%
  html_text()

links_dos_artigos <- page %>%
  html_nodes(".snowplow-full-record value") %>%
  html_attr("href")

However, links_dos_artigos returns only NA values and not the links I need.

I would appreciate it if someone could help.

python – Web scraping of election results

I am trying to scrape a web page with Python, this one specifically: http://resultadoshistorico.onpe.gob.pe/EG2021/EleccionesPresidenciales/RePres/P/020000/021300.
However, when you open the link you will notice that it redirects you to the home page (http://resultadoshistorico.onpe.gob.pe/EG2021/), where you have to click on 'Elecciones Presidenciales', choose 'Perú' under 'Ámbito', and so on, in order to pick the specific province whose information you want to scrape (I am only interested in the table that contains the total votes for each candidate).

The libraries I am using are BeautifulSoup, Pandas and Requests. However, because of what I described in the previous paragraph, I cannot extract the table I am interested in. It is as if the page were protected or something like that.

I would appreciate it if you could tell me what procedure to follow.
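
This is the minimal check I can run with the same libraries (requests and pandas; pandas.read_html also needs lxml or html5lib installed) to see whether the table is present in the static HTML at all:

import requests
import pandas as pd

url = ("http://resultadoshistorico.onpe.gob.pe/EG2021/"
       "EleccionesPresidenciales/RePres/P/020000/021300")

response = requests.get(url)

try:
    # read_html finds every <table> in the document, if any is there
    tables = pd.read_html(response.text)
    print(f"{len(tables)} table(s) found in the static HTML")
except ValueError:
    # No <table> in the response: the page assembles its content with
    # JavaScript after redirecting to the home page, so requests and
    # BeautifulSoup never see the table. A browser-driven tool such as
    # Selenium, or the JSON requests the page itself makes (visible in
    # the browser's network tab), would be needed instead.
    print("No table in the static HTML")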

Web Scraping in PHP – Getting the link

I wrote this web scraping code in PHP, but I am not managing to capture the link address; it is returning the link's title instead. How can I fix this?

<?php

require_once "simple_html_dom.php";

  $url = 'https://www.sold.com.br/';
  $html = file_get_html($url);
  $hold = array();

  libxml_use_internal_errors(true);

  if(!empty($html)) {
   //$element = $data = "";
   $i = 0;

  foreach ($html->find(".bloco-leilao") as $element) {
    

      foreach($element -> find(".data")as $data){
        $hold($i)("Data") = trim($data->plaintext); 
        
        }
      foreach($element -> find(".leilao-tipo-bem")as $tipoBem){
        $hold($i)("Bem") = trim($tipoBem->plaintext);
       
        }
      foreach($element -> find(".tipo")as $tipo){      
        $hold($i)("Tipo") = trim($tipo->plaintext);
        
        }
      foreach($element -> find(".leilao-descricao")as $descricao){      
        $hold($i)("Descricao") = trim($descricao->plaintext);
        
        }
        foreach($element -> find("(href)")as $link){      
            $hold($i)("Lik") = $link->plaintext;
            
            }
        $i++;
        }
 
  
}

  print_r($hold); exit;

?>

web scraping – Bs4 and requests Python

I was trying to make a Pokédex (https://replit.com/@Yoplayer1py/Gui-Pokedex) and I wanted to get the Pokémon's description from https://www.pokemon.com/us/pokedex/{pokemon name}. There is a p tag that contains the description, and that tag's class is: version-xactive.

When I print the description I get nothing, or sometimes I get None.

Here is the code:

import requests
from bs4 import BeautifulSoup

# Assign URL
url = "https://www.pokemon.com/us/pokedex/"+text_id_name.get(1.0, "end-1c")
    
# Fetch raw HTML content
html_content = requests.get(url).text
    
# Now that the content is ready, iterate 
# through the content using BeautifulSoup:
soup = BeautifulSoup(html_content, "html.parser")
    
# similarly to get all the occurences of a given tag
print(soup.find('p', attrs={'class': 'version-xactive'}).text)

The text_id_name.get(1.0, "end-1c") part comes from a tkinter text input.

It shows this error:

Exception in Tkinter callback
Traceback (most recent call last):
  File "/usr/lib/python3.8/tkinter/__init__.py", line 1883, in __call__
    return self.func(*args)
  File "main.py", line 57, in load_pokemon
    print(soup.find('p', attrs={'class': 'version-xactive'}).text)
AttributeError: 'NoneType' object has no attribute 'text'

Thanks in advance!
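
The traceback means find() returned None and .text was called on it. This is a defensive version of the lookup I am considering (the function name and the fallback string are made up for illustration); it at least avoids the AttributeError and makes it obvious when nothing matched:

import requests
from bs4 import BeautifulSoup


def get_description(pokemon_name: str) -> str:
    """Return the description paragraph, or a fallback if the tag is missing."""
    url = "https://www.pokemon.com/us/pokedex/" + pokemon_name.strip().lower()
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "html.parser")
    # Guard against find() returning None before touching .text
    tag = soup.find("p", attrs={"class": "version-xactive"})
    if tag is None:
        # Nothing matched: either the class in the served HTML differs from
        # 'version-xactive', or the description is injected by JavaScript and
        # is not in the static page at all. Inspecting html_content (or the
        # browser's "view source") tells which case applies.
        return "description not found"
    return tag.get_text(strip=True)


print(get_description("pikachu"))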

I will do any type of data entry, web scraping, data mining and web research work for $1

I will do any type of data entry, web scraping, data mining and web research work

Are you looking for excellent service for your or your company's DATA-related work? Yes, you are at the right place.

I will do for you :

  1. Data entry (online/offline)
  2. Data scraping
  3. Data entry in Excel
  4. Retype data in MS Word
  5. Web research, scraping
  6. Copy-paste
  7. Data collection from social media (LinkedIn, FB, Insta, Twitter, etc.)
  8. PDF conversion
  9. Excel or Word data entry
  10. Market research

and So on.

What you get before and after service:

◈ 100% Accuracy and Validation.

◈ Delivery on time.

◈ Manual typing.

◈ Unlimited revisions.

◈ Perfect communication system.

◈ Corrections whenever you want or need them.

I am always ready to take on different kinds of data entry related projects, so do not worry, just message me and talk with me.


functions – Disable shortlinks like ?p=1234 to prevent scraping

I have a site with several thousand public pages that use randomly generated slugs for their pretty URLs. The content is public in the sense that if someone knows the URL, they can see the content. I’m looking to prevent someone from just hammering the server with /?p=XXXX incrementally to view all the pages.

I’ve added remove_action('wp_head', 'wp_shortlink_wp_head', 10, 0); to prevent the site from publishing the shortlinks in the page’s head. But it doesn’t take a genius to see the site is run by WordPress and thus try incrementally increasing a post number to scrape all the content.

Is there a way to use something like if( isset( $_GET("p") ) ) { ... } to disable accessing shortlinks while still allowing the pretty urls to resolve?