2 years ago

#71529

test-img

Berci Vagyok

Pagination on a website where the URL stays the same python, scrapy, selenium

I am scraping a website that contains companies, each page contains 26 companies, I am able to login and successfully scrape the first page of the website, but have trouble figuring out how to click to the next pages and scrape those as well.

The XPath of the 'next page button' is the following: /html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a I tried using a better XPath many times but this is the only one that worked. (I managed to make pagination work but then scraping was not working with another code)

Unfortunately, I am not able to provide credentials to the website.

I am using scrapy and selenium in python.

# -*- coding: utf-8 -*-
from typing_extensions import Self
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which 

#login info
username = "xxx"
password = "xxx"

class HtSpiderSelenium(scrapy.Spider):
    name = 'ht_selenium1'
    allowed_domains = ['https://app.xxx.bootstart.tech']
    start_urls = ['https://app.xxx.bootstart.tech']
    
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        driver = webdriver.Chrome(executable_path="./chromedriver")

        driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        driver.get("https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https%3A%2F%2Fapp.xxx.bootstart.tech%2F%3Fredirect_fragment%3D%xxx&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code")

        driver.find_element_by_id("username").send_keys(username)
        driver.find_element_by_id("password").send_keys(password)
        driver.find_element_by_name("login").click()
        sleep(10)


        self.html = driver.page_source
        driver.close()

    def parse(self, response):
        resp = Selector(text=self.html)
        for startup in resp.xpath("//div[contains(@class, 'col-sm-12')]"):
            yield {
                'startup name': startup.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get(),
                'startup descript': ''.join(startup.xpath('//div//p//div//text()').getall()),
                'startup location': startup.xpath(".//h4//small[@class='ng-binding']//text()").get(),
                'startup industry': startup.xpath(".//h4//span[@class='ng-binding']/text()").get(),
            }

python

selenium

web-scraping

scrapy

0 Answers

Your Answer

Accepted video resources