lec1
Demo on laptop (not VM)¶
You can’t quite follow along and see the GUI.
In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
options = Options()
# uncomment to run without a visible browser window:
#options.headless = True
# on the VM, point selenium at the system chromium driver instead:
#b = webdriver.Chrome(options=options, executable_path="chromium.chromedriver")
b = webdriver.Chrome(options=options)
In [2]:
import time

# Demonstrate that time.sleep blocks: "A" prints, then a 3-second
# pause, then "B" prints.
print("A")
time.sleep(3)
print("B")
A
B
Step 1: wait for the table to grab the HTML¶
In [3]:
b.get("https://tyler.caraza-harter.com/cs320/tricky/page1.html")
#print(b.page_source[:100])

# polling: the page adds the table with JavaScript after a delay, so we
# must repeatedly check until the element exists (or we give up).
def wait_for_element(elem_id, max_seconds=10):
    """Poll the browser until an element with id elem_id appears.

    Checks every 0.1 seconds for up to max_seconds seconds.  Returns the
    element if found, or None if it never appeared before the timeout
    (the original version raised NameError on timeout).
    """
    for i in range(max_seconds * 10):
        time.sleep(0.1)
        try:
            # raises NoSuchElementException until the element exists
            return b.find_element_by_id(elem_id)
        except NoSuchElementException:
            print("not ready yet")
    return None  # timed out — caller should handle a missing element

wait_for_element("coords")
html = b.page_source
not ready yet
not ready yet
not ready yet
not ready yet
not ready yet
not ready yet
not ready yet
not ready yet
not ready yet
Step 2: convert HTML to a table¶
In [4]:
from bs4 import BeautifulSoup # let us search for elements in HTML too
In [5]:
# build a bs4 tree from the HTML selenium gave us (b.page_source
# includes any elements added by JavaScript after the initial load)
page = BeautifulSoup(b.page_source, "html.parser")
type(page)
Out[5]:
bs4.BeautifulSoup
In [6]:
# page.find("TAG") gets the first instance of that TAG;
# find_all("TAG") returns a list of every match on the page
tbls = page.find_all("table")
assert len(tbls) == 2
In [7]:
import pandas as pd

tbl = tbls[-1]  # the coordinates table is the last <table> on the page

# each <tr> is one row; each <td> inside it is one cell value
rows = []
for tr in tbl.find_all("tr"):
    row = [cell.get_text() for cell in tr.find_all("td")]
    rows.append(row)

# first row holds the column names ("x", "y"); the rest are data
pd.DataFrame(rows[1:], columns=rows[0])
Out[7]:
x y
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
5 10 11
6 12 13
7 14 15
8 16 17
9 18 19
Demo 2: clicks¶
In [25]:
b.get("https://tyler.caraza-harter.com/cs320/tricky/page2.html")

# keep clicking the "more" button until it disappears from the page
# (each click loads more rows; the button is removed when done)
while True:
    try:
        btn = b.find_element_by_id("more")
    except NoSuchElementException:
        break
    btn.click()
In [28]:
# show the tail of the final HTML (the last rows loaded by clicking)
print(b.page_source[-300:])
d>08/16/1992