First attempt at a Python web scrape


I wanted an easy way to import a large database of crystallographic values into a program I’m working on. I installed Selenium and put together a simple script to scrape space groups and coordinates. There are a total of 230 space groups, each with a varying number of coordinates and possible translations to consider. Each group is displayed like this on a separate page:

Creating a dictionary to organize all space groups and letters seemed like the best way to go about it, and so this is what I came up with:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re, os

class SetOptions:
    """Scrape Wyckoff positions of space groups from cryst.ehu.es via Chrome.

    The instance owns a Selenium Chrome driver.  Call :meth:`close` (or use
    the instance as a context manager) when finished so the browser and the
    chromedriver process are shut down.

    Attributes:
        wyckoff: Caller-supplied dict that :meth:`update_groups` fills in
            place, keyed by space-group number.
        driver_path: Filesystem path of the chromedriver executable.
        options: Chrome ``Options`` object handed to the driver.
        driver: The running ``webdriver.Chrome`` instance.
        html_indices: ``(start, stop)`` marker strings that delimit the
            coordinate listing inside the tag-stripped page text.
    """

    _URL = "https://www.cryst.ehu.es/cgi-bin/cryst/programs/nph-wp-list?gnum=%s"
    _INPUT_XPATH = ("/html/body/table[2]/tbody/tr/td[2]/form/table[2]"
                    "/tbody/tr/td[1]/input")
    # Compiled once; every pattern is reused for each scraped page.
    _TAG_RE = re.compile(r'<[^>]+>')
    _PAREN_RE = re.compile(r'\(.*?\)')
    _PAREN_CONTENT_RE = re.compile(r'.*?\((.*?)\)')
    _DIGITS_RE = re.compile(r'\d+')

    def __init__(self, wyckoff, view, window):
        """Start a Chrome driver.

        Args:
            wyckoff: Dict that receives the scraped data (mutated in place).
            view: Value assigned to ``options.headless``.
            window: Command-line argument passed to ``options.add_argument``,
                e.g. a ``--window-size=...`` switch.
        """
        self.wyckoff = wyckoff
        self.driver_path = '/usr/local/bin/chromedriver'
        # Driver options
        self.options = Options()
        self.options.headless = view
        self.options.add_argument(window)
        self.driver = webdriver.Chrome(options=self.options,
                                       executable_path=self.driver_path)
        # Strings found between the coordinate set on each page
        self.html_indices = ('Coordinates', 'specific')

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the driver on leaving the ``with`` block; never suppresses."""
        self.close()

    def close(self):
        """Quit the underlying Chrome driver."""
        self.driver.quit()

    def scrape_positions(self, group, start, stop):
        """Load the page for one space group and parse its positions.

        Args:
            group: Space-group number substituted into the page URL.
            start: Marker string that precedes the coordinate listing.
            stop: Marker string that follows the coordinate listing.

        Returns:
            Dict mapping each ``<multiplicity><letter>`` label (e.g.
            ``'18f'``) to a list of one-item lists of coordinate strings.
            When centring translations are present, an extra
            ``'translation'`` key maps to a list holding one list of the
            translation strings (in reverse of their order in the text).

        Raises:
            IndexError: If ``start`` does not occur in the page text.
        """
        self.driver.get(self._URL % (group,))
        # NOTE(review): presumably this input switches the page to the
        # listing parsed below -- confirm against the live form.
        self.driver.find_element_by_xpath(self._INPUT_XPATH).click()
        return self._parse_positions(self.driver.page_source, start, stop)

    @classmethod
    def _parse_positions(cls, raw_html, start, stop):
        """Turn raw page HTML into the label -> coordinates dict."""
        # Strip all html
        stripped_html = cls._TAG_RE.sub('', raw_html)
        if start not in stripped_html:
            raise IndexError("start marker %r not found in page text" % (start,))
        # Keep only the text between the identifier strings
        section = stripped_html.split(start)[1].split(stop)[0]
        # Drop blank lines: translations, coordinates, letters and
        # multiplicities remain, one record per line
        listing = "\n".join(line for line in section.splitlines() if line)
        # Text outside parentheses carries the labels and the '+' markers
        label_tokens = cls._PAREN_RE.sub('', listing).split()
        # Text inside parentheses carries translations, then coordinates
        groups = cls._PAREN_CONTENT_RE.findall(listing)

        # Every translation is followed by a free-standing '+'
        centring_count = label_tokens.count('+')
        # Translations always occur before the coordinates
        c_centring = groups[:centring_count][::-1]
        # Whatever parenthetical strings remain are coordinate sets
        pos = [[coords] for coords in groups[centring_count:][::-1]]
        # Join letter/multiplicity strings, discard sym strings
        labels = [cls._wyckoff_label(token) for token in label_tokens]
        letters = [label for label in labels if label is not None][::-1]

        # If translations are present they go last in both lists
        if c_centring:
            base = centring_count
            letters.append('translation')
            pos.append(c_centring)
        else:
            base = 1

        # Hand out coordinates label by label; each multiplicity counts
        # every centring translation, hence the division by ``base``.
        space_groups = {}
        last = len(letters) - 1
        for i, let in enumerate(letters):
            if i < last:
                count = int(cls._DIGITS_RE.findall(let)[0]) // base
            else:
                # The final entry takes everything that is left
                count = len(pos)
            space_groups[let] = pos[:count][::-1]
            pos = pos[count:]
        return space_groups

    @staticmethod
    def _wyckoff_label(token):
        """Return the ``<multiplicity><letter>`` prefix of *token*, or None.

        Tokens that do not start with a digit, or that contain no letter,
        are not labels and yield ``None``.
        """
        if not token[0].isnumeric():
            return None
        first_letter = next(filter(str.isalpha, token), None)
        if first_letter is None:
            return None
        return token[:token.find(first_letter) + 1]

    def update_groups(self, index_1, index_2):
        """Scrape groups ``index_1`` .. ``index_2 - 1`` into ``self.wyckoff``.

        Returns:
            ``str(self.wyckoff)`` with each ``['`` replaced by ``(`` and each
            ``']`` replaced by ``)``, so it is easily copied and pasted into
            a text editor.
        """
        for i in range(index_1, index_2):
            self.wyckoff[i] = self.scrape_positions(i, *self.html_indices)
        return str(self.wyckoff).replace("['", "(").replace("']", ")")
#--------------------------------------------------------------------------

# Filled in place by SetOptions.update_groups, keyed by space-group number.
wyckoff = {}

if __name__ == "__main__":
    # Only scrape when run as a script, so importing this module does not
    # launch a browser.
    scraper = SetOptions(wyckoff, True, "--window-size=1920, 1200")
    try:
        print(scraper.update_groups(166, 167))
    finally:
        # Always shut the browser down, even if scraping fails, so no
        # chromedriver process is left running.
        scraper.driver.quit()

Output:

> {166: {'3a': [(0,0,0)], '3b': [(0,0,1/2)], '6c': [(0,0,z), (0,0,-z)],
> '9d': [(1/2,0,1/2), (0,1/2,1/2), (1/2,1/2,1/2)], '9e':
> [(1/2,0,0), (0,1/2,0), (1/2,1/2,0)], '18f': [(x,0,0), (0,x,0),
> (-x,-x,0), (-x,0,0), (0,-x,0), (x,x,0)], '18g': [(x,0,1/2), (0,x,1/2),
> (-x,-x,1/2), (-x,0,1/2), (0,-x,1/2), (x,x,1/2)], '18h': [(x,-x,z),
> (x,2x,z), (-2x,-x,z), (-x,x,-z), (2x,x,-z), (-x,-2x,-z)], '36i':
> [(x,y,z), (-y,x-y,z), (-x+y,-x,z), (y,x,-z), (x-y,-y,-z),
> (-x,-x+y,-z), (-x,-y,-z), (y,-x+y,-z), (x-y,x,-z), (-y,-x,z),
> (-x+y,y,z), (x,x-y,z)], 'translation': [(1/3,2/3,2/3', '2/3,1/3,1/3',
> '0,0,0)]}}

Any feedback is welcome as I’m sure there are better approaches.