python – Getting xpath indices to count forward to preserve table structure

Scraping tables from the web gets complicated when a cell holds two or more values. To preserve the table structure, I have devised a way to step through the row-number index of the cell's xpath, building a nested list whenever several values share the same row number.

    def get_structured_elements(self, name):
        """Collect the link texts of the second table column, one entry per row.

        For target data that is nested and structured, such as a table with
        multiple values in a single cell.

        Args:
            name: Name of the data field. This fixed-xpath version does not
                use it.

        Returns:
            A list with one entry per table row: ``None`` when the cell has
            no link, the link text when it has exactly one link, and a list
            of link texts when it has several.
        """
        driver = self.driver
        number_of_items = self.number_of_items_found()
        elements = [None] * number_of_items

        # XPath positions are 1-based and the scan starts at tr[2], so the
        # row index 'i' runs two ahead of the position in 'elements'.
        for i in range(2, number_of_items + 2):
            # The lookup has to be repeated with the current row index on
            # every pass, otherwise every row would see the same elements.
            # "xpath" is the locator strategy string that By.XPATH stands for.
            target_data = driver.find_elements(
                "xpath", "//table/tbody/tr[" + str(i) + "]/td[2]/a")

            for item in target_data:
                slot = elements[i - 2]
                if slot is None:
                    # First value of this row: store it as a plain string.
                    elements[i - 2] = item.text
                elif isinstance(slot, list):
                    # Third and later values: extend the existing nested list.
                    slot.append(item.text)
                else:
                    # Second value: turn the stored string into a nested list.
                    elements[i - 2] = [slot, item.text]

        return elements

This simple logic was working fine until I tried to manage all locator variables in one place to make the code more reusable. How do I store the expression "//table/tbody/tr[" + i + "]/td[2]/a" in a list or dictionary so that it still works when plugged in?

The solution (i.e. hack) I came up with is a function that takes the front and back halves of the iterating xpath as arguments and returns front_half + str(i) + back_half if i is one of the local variables of the parent (iterator) function.

def split_xpath_at_i(front_half, back_half):
    """Join the two halves of an xpath around its counter index.

    When no local 'i' is visible, the literal text "SPLIT_i" is used in
    place of the index, so the function can also be called outside an
    indexed environment without raising an error.
    """
    # 'i' is never assigned inside this function, so it cannot appear in
    # locals() and the placeholder branch is the one that runs.
    if 'i' not in locals():
        return front_half + "SPLIT_i" + back_half

    return front_half + str(i) + back_half

# Locator collection. split_xpath_at_i is called once, right here, so the
# first entry holds the string it returned, not the call itself.
xpath = (split_xpath_at_i("//table/tbody/tr(",")/td(2)/a"), 
         "//table/tbody/tr/td(3)/a(1)"
        )

def xpath_index_iterator():
    """Print the xpath built by split_xpath_at_i on each of ten passes.

    The counter 'i' is a local variable of this function only.
    """
    i = 0
    while i < 10:
        print(split_xpath_at_i("//table/tbody/tr(", ")/td(2)/a"))
        i += 1


xpath_index_iterator()

# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a
# //table/tbody/tr(SPLIT_i)/td(2)/a

Problem is, split_xpath_at_i is blind to the local variables of the function that calls it. What I eventually came up with is to store the counter i as an attribute of the iterator function, so that its value is available to split_xpath_at_i like so:

def split_xpath_at_i(front_half, back_half):
    """Join the two halves of an xpath around the current counter index.

    The index is read from the 'i' attribute that xpath_index_iterator
    keeps on itself. When that attribute is not available, the literal
    text "SPLIT_i" is inserted instead, so the function can be called
    outside an indexed environment without raising an error.

    Args:
        front_half: Part of the xpath that precedes the index.
        back_half: Part of the xpath that follows the index.

    Returns:
        The joined xpath string.
    """
    try:
        i = xpath_index_iterator.i
    except (NameError, AttributeError):
        # NameError: the iterator function has not been defined yet.
        # AttributeError: it exists but has not set its counter yet.
        # Any other exception is unexpected and is allowed to propagate.
        return front_half + "SPLIT_i" + back_half

    return front_half + str(i) + back_half

# Locator collection. The first entry is the string split_xpath_at_i returns
# at the moment this statement runs; it is not re-evaluated afterwards.
xpath = (split_xpath_at_i("//table/tbody/tr(",")/td(2)/a"), 
         "//table/tbody/tr/td(3)/a(1)"
        )
    
def xpath_index_iterator():
    """Print the xpath for each counter value from 0 to 9.

    The counter is published as the function attribute
    'xpath_index_iterator.i', which is where split_xpath_at_i reads it.
    """
    xpath_index_iterator.i = 0
    for step in range(10):
        xpath_index_iterator.i = step
        print(split_xpath_at_i("//table/tbody/tr(", ")/td(2)/a"))


xpath_index_iterator()

# //table/tbody/tr(0)/td(2)/a
# //table/tbody/tr(1)/td(2)/a
# //table/tbody/tr(2)/td(2)/a
# //table/tbody/tr(3)/td(2)/a
# //table/tbody/tr(4)/td(2)/a
# //table/tbody/tr(5)/td(2)/a
# //table/tbody/tr(6)/td(2)/a
# //table/tbody/tr(7)/td(2)/a
# //table/tbody/tr(8)/td(2)/a
# //table/tbody/tr(9)/td(2)/a

The problem gets more complicated when I try to invoke split_xpath_at_i via a locator list:

def split_xpath_at_i(front_half, back_half):
    """Join the two halves of an xpath around the current counter index.

    The index comes from the 'i' attribute of xpath_index_iterator. If the
    attribute cannot be read, the literal text "SPLIT_i" takes the place of
    the index, which keeps the function usable outside an indexed
    environment.

    Args:
        front_half: Part of the xpath that precedes the index.
        back_half: Part of the xpath that follows the index.

    Returns:
        The joined xpath string.
    """
    try:
        index_text = str(xpath_index_iterator.i)
    except (NameError, AttributeError):
        # Only the two expected failures are absorbed: the iterator function
        # is not defined yet (NameError) or has not set 'i' (AttributeError).
        index_text = "SPLIT_i"

    return front_half + index_text + back_half

# Locator collection. split_xpath_at_i runs once, when this statement is
# executed, and the resulting string is what gets stored. Reading the first
# entry later therefore yields that same string each time, whatever the
# counter is by then.
xpath = (split_xpath_at_i("//table/tbody/tr(",")/td(2)/a"), 
         "//table/tbody/tr/td(3)/a(1)"
        )
    
def xpath_index_iterator():
    """Collect the first locator in 'xpath' once for each of ten counter values.

    The counter is published as the function attribute
    'xpath_index_iterator.i' and is left at 9 when the loop ends.

    Returns:
        A list of ten strings.
    """
    xpath_index_iterator.i = 0
    lst = []
    for xpath_index_iterator.i in range(10):
        # xpath[0] is a string that was computed when 'xpath' was defined;
        # indexing it does not call split_xpath_at_i again.
        lst.append(xpath[0])
    return lst

# Run the demo; the result the author reports is listed in the comment below.
xpath_index_iterator()

# ('//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a',
#  '//table/tbody/tr(9)/td(2)/a')

What would a professional approach to this problem look like?

-----

The code below was modified from the Selenium manual.

test.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from query import Input
import page

# Start a browser through Input and load the CNKI search page.
cnki = Input()
driver = cnki.webpage('http://big5.oversea.cnki.net/kns55/')

# Submit the search keyword, then switch into the frame that holds the
# results before creating the results page object.
current_page = page.MainPage(driver)
current_page.submit_search('禮學')
current_page.switch_to_frame()
result = page.SearchResults(driver)

# Two ways of reading the result rows: via the locator table, and via a
# fixed xpath written directly in the method.
structured = result.get_structured_elements('titles') # I couldn't get this to work.
simple = result.simple_get_structured_elements() # but this works fine.

query.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from selenium import webdriver

class Input:
    """Deals with the fields on the website in which queries are made,
    and handles user input."""

    # CONSTANTS

    URL = None

    # One xpath slot per data field; all of them are still empty strings.
    XPATH = dict.fromkeys(
        ("titles", "authors", "dates", "journals", "year_issues", "urls"), ''
    )

    def __init__(self):
        # The driver class itself is stored; webpage() instantiates it.
        self.driver = webdriver.Chrome

    def webpage(self, url):
        """Start a browser from the stored driver class, load url and
        return the browser object."""
        browser = self.driver()
        browser.get(url)
        return browser

page.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from element import BasePageElement
from locators import InputLocators, OutputLocators
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait


class SearchTextElement(BasePageElement):
    """This class gets the search text from the specified locator"""

    # The locator for the search box where the search string is entered.
    # It is None here; MainPage.submit_search assigns
    # InputLocators.SEARCH_FIELD to it before the element is used.
    locator = None


class BasePage:
    """Base class to initialize the base page that will be called from all
    pages"""

    def __init__(self, driver):
        # driver: the browser object that the page's methods operate on
        # (test.py passes the value returned by Input.webpage).
        self.driver = driver

class MainPage(BasePage):
    """Action methods for the search (home) page."""

    # Descriptor: assigning to this attribute types the value into the
    # located text field and submits it (see BasePageElement.__set__).
    search_keyword = SearchTextElement()

    def submit_search(self, keyword):
        """Submits keyword and triggers the search"""
        SearchTextElement.locator = InputLocators.SEARCH_FIELD
        self.search_keyword = keyword

    def select_dropdown_item(self, item):
        """Click the option of the search-attribute dropdown whose text is item."""
        by, val = InputLocators.SEARCH_ATTR
        # An XPath predicate is written in square brackets.
        option = self.driver.find_element(
            by, val + "/option[text()='" + item + "']")
        option.click()

    def click_search_button(self):
        """Click the search button."""
        self.driver.find_element(*InputLocators.SEARCH_BUTTON).click()

    def switch_to_frame(self):
        """Use this function to get access to hidden elements.

        Returns to the top-level document first, then enters the frame
        named 'iframeResult'.
        """
        driver = self.driver
        driver.switch_to.default_content()
        driver.switch_to.frame('iframeResult')

    def max_content(self):
        """Maximize the number of items on display in the search results."""
        # Imported locally because this module does not import By at the top.
        from selenium.webdriver.common.by import By

        # find_element(by, value) is the form the other methods already use;
        # the find_element_by_* helpers are gone from current Selenium.
        self.driver.find_element(
            By.CSS_SELECTOR, '#id_grid_display_num > a:nth-child(3)').click()

    def stop_loading_page_when_element_is_present(self, locator):
        """Wait up to 30 s for locator to be present, then stop page loading.

        Args:
            locator: A (by, value) pair.

        Raises:
            TimeoutException: If the element does not appear in time.
        """
        driver = self.driver

        ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
        wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)

        wait.until(EC.presence_of_element_located(locator))
        driver.execute_script("window.stop();")

    def next_page(self):
        """Click the next-page link, or report that the last page was reached."""
        try:
            # The wait is what raises TimeoutException when there is no
            # next-page link, so it has to sit inside the try block. It
            # also stops the page load, so no second window.stop() is needed.
            self.stop_loading_page_when_element_is_present(InputLocators.NEXT_PAGE)
            self.driver.find_element(*InputLocators.NEXT_PAGE).click()
        except (TimeoutException, WebDriverException):
            print("Last page reached")
        else:
            print("Navigating to Next Page")
        
        
  
        
class SearchResults(BasePage):
    """Search results page action methods come here"""

    # Text that OutputLocators.split_xpath_at_i puts where the row index
    # belongs in a row-indexed xpath.
    ROW_PLACEHOLDER = "SPLIT_i"
    # Index of the first table row that is scanned.
    FIRST_ROW = 2

    def __init__(self, driver):
        super().__init__(driver)

    def wait_for_page_to_load(self):
        """Block for up to 100 s until the main result body is present."""
        wait = WebDriverWait(self.driver, 100)
        # presence_of_element_located takes ONE (by, value) tuple, so the
        # locator must not be unpacked here.
        wait.until(EC.presence_of_element_located(InputLocators.MAIN_BODY))

    def get_single_element(self, name):
        """Return the first element found for the named data field.

        Args:
            name: Key of OutputLocators.CNKI, in any letter case.
        """
        return self.driver.find_element(*OutputLocators.CNKI[name.upper()])

    def number_of_items_found(self):
        """Return the number of items found on a single page."""
        return len(self.driver.find_elements(*OutputLocators.CNKI['INDEX']))

    def get_elements(self, name):
        """Returns simple list of values in specific data field in a table."""
        found = self.driver.find_elements(*OutputLocators.CNKI[name.upper()])
        return [item.text for item in found]

    def get_structured_elements(self, name):
        """For target data that is nested and structured,
        such as a table with multiple values in a single cell.

        Args:
            name: Key of OutputLocators.CNKI, in any letter case.

        Returns:
            One entry per result row: None, a single text, or a list of
            texts (see _collect_rows).
        """
        by, template = OutputLocators.CNKI[name.upper()]

        def find_row(row):
            # Put the actual row number where the locator has its
            # placeholder; a locator without one is used unchanged.
            value = template.replace(self.ROW_PLACEHOLDER, str(row))
            return self.driver.find_elements(by, value)

        return self._collect_rows(find_row)

    def simple_get_structured_elements(self):
        """Simple structured elements code with fixed xpath."""
        # Imported locally because this module does not import By at the top.
        from selenium.webdriver.common.by import By

        template = ('//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr[{}]'
                    '/td[2]/a')

        def find_row(row):
            return self.driver.find_elements(By.XPATH, template.format(row))

        return self._collect_rows(find_row)

    def _collect_rows(self, find_row):
        """Group element texts by table row to retain the document structure.

        Args:
            find_row: Callable that takes a row index and returns the
                elements located in that row.

        Returns:
            A list with one entry per item reported by
            number_of_items_found(): None when the row yields no element,
            the text when it yields exactly one, and a list of texts when
            it yields several.
        """
        elements = []
        end_row = self.FIRST_ROW + self.number_of_items_found()

        for row in range(self.FIRST_ROW, end_row):
            texts = []
            for item in find_row(row):
                print(item.text, row - 1)
                texts.append(item.text)

            if not texts:
                elements.append(None)
            elif len(texts) == 1:
                elements.append(texts[0])
            else:
                elements.append(texts)

        return elements

element.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from selenium.webdriver.support.ui import WebDriverWait


class BasePageElement():
    """Descriptor base class for a text field on a page object.

    Subclasses supply 'locator', a (by, value) pair. The page object that
    owns the descriptor must expose the browser as its 'driver' attribute.
    """

    def __set__(self, obj, value):
        """Sets the text to the value supplied, then submits the field."""
        text_field = self._wait_for_field(obj.driver)
        text_field.clear()
        text_field.send_keys(value)
        text_field.submit()

    def __get__(self, obj, owner):
        """Gets the text of the specified object.

        Returns the descriptor itself when it is read from the class
        rather than from an instance.
        """
        if obj is None:
            # Class-level access: there is no page instance and therefore
            # no driver to query.
            return self

        return self._wait_for_field(obj.driver).get_attribute("value")

    def _wait_for_field(self, driver):
        """Wait up to 100 s for the located element and return it."""
        # until() returns the value produced by the callable, so the
        # element does not have to be looked up a second time.
        return WebDriverWait(driver, 100).until(
            lambda drv: drv.find_element(*self.locator))

locators.py

This is where split_xpath_at_i sits.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from selenium.webdriver.common.by import By
# import page

class InputLocators():
    """A class for main page locators. All main page locators should come here"""

    def dropdown_list_xpath(attribute, value):
        """Return the xpath of the <select> whose attribute equals value.

        Called as a plain function while the class body is executed, which
        is why it takes no 'self'.
        """
        # An XPath predicate is written in square brackets,
        # e.g. //select[@name='txt_1_sel'].
        return "//select[@" + attribute + "='" + value + "']"

    MAIN_BODY = (By.XPATH, '//GridTableContent/tbody')
    SEARCH_FIELD = (By.NAME, 'txt_1_value1') # (By.ID, 'search-content-box')
    SEARCH_ATTR = (By.XPATH, dropdown_list_xpath('name', 'txt_1_sel'))
    SEARCH_BUTTON = (By.ID, 'btnSearch')
    NEXT_PAGE = (By.LINK_TEXT, "下頁")

class OutputLocators():
    """A class for search results locators. All search results locators should
    come here"""

    def split_xpath_at_i(front_half, back_half):
        """Join two xpath halves around the row-index placeholder "SPLIT_i".

        Called as a plain function while the class body is executed, when
        no row counter exists, so it always inserts the placeholder. Code
        that walks the table row by row has to substitute the row number
        itself, e.g. value.replace("SPLIT_i", str(i)).
        """
        return front_half + "SPLIT_i" + back_half

    # Common prefix of every locator below: the rows of the result table.
    _RESULT_ROWS = '//*[@id="Form1"]/table/tbody/tr[2]/td/table/tbody/tr'

    # Every value is a (by, value) pair so that callers can unpack it
    # straight into find_element / find_elements.
    CNKI = {
        "TITLES": (By.XPATH, split_xpath_at_i(_RESULT_ROWS + '[', ']/td[2]/a')),
        "AUTHORS": (By.XPATH, split_xpath_at_i(_RESULT_ROWS + '[', ']/td[3]/a')),
        "JOURNALS": (By.XPATH, _RESULT_ROWS + '/td[4]/a'),
        "YEAR_ISSUE": (By.XPATH, _RESULT_ROWS + '/td[5]/a'),
        "DOWNLOAD_PATHS": (By.XPATH, _RESULT_ROWS + '/td[1]/table/tbody/tr/td/a[1]'),
        "INDEX": (By.XPATH, _RESULT_ROWS + '/td[1]/table/tbody/tr/td/a[2]'),
    }


    # # Interim Data
    # CAPTIONS = 
    # LINKS = 
    
    # Target Data
    # TITLES = 
    # AUTHORS = 
    # JOURNALS = 
    # VOL = 
    # ISSUE = 
    # DATES = 
    # DOWNLOAD_PATHS =