While working on Miser, I wanted programmatic access to my most recent credit-card transactions. Since Chase doesn’t have an API for this, I had to resort to Selenium, a package for programmatic browser control.
I was fairly impressed when I first ran the WebDriver demo — the browser was being automatically controlled by a Python script, and I could see the whole thing happening. The possible applications are numerous and exciting.
Anyway, I’ve come up with a working scraper, attached below and in this gist. Though it is specific to Chase accounts, it should give you a good idea of how to scrape your own data from the financial institution of your choice.
from selenium import webdriver
import time
def get_chase_amazon_driver(username, password):
    """Return a logged-in Chase Amazon card selenium driver instance.

    Opens a Firefox window, loads the Chase home page, types the
    credentials into the login form and submits it.

    Args:
        username: User name typed into the ``usr_name`` form field.
        password: Password typed into the ``usr_password`` form field.

    Returns:
        The Firefox WebDriver, right after the login form was submitted.
        No wait is performed after submitting, so the caller must allow
        time for the next page to load and must eventually call ``quit()``
        on the driver.

    Raises:
        Whatever selenium raises when the page cannot be loaded or a login
        field cannot be found. The browser is closed before the error
        propagates, so no window is left behind.
    """
    driver = webdriver.Firefox()
    try:
        # HTTPS so the page receiving the credentials is never fetched
        # over plain HTTP.
        driver.get("https://www.chase.com")
        # Fixed pause, presumably to let the login form finish rendering.
        time.sleep(2)
        user_field = driver.find_element_by_id("usr_name")
        user_field.send_keys(username)
        password_field = driver.find_element_by_id("usr_password")
        password_field.send_keys(password)
        password_field.submit()
    except BaseException:
        # The caller never receives the driver on failure, so this is the
        # only place the browser can be closed. Always re-raised.
        driver.quit()
        raise
    return driver
def _goto_link(driver, text):
    """Navigate *driver* to the target of a link whose text contains *text*.

    The link is looked up by partial link text and its ``href`` attribute
    is loaded with ``driver.get`` rather than by clicking the element.
    """
    anchor = driver.find_element_by_partial_link_text(text)
    target_url = anchor.get_attribute('href')
    driver.get(target_url)
def get_recent_activity_rows(chase_driver):
    """Scrape the transaction rows from the Chase activity page.

    Follows the "See activity" link, pauses ten seconds, then reads every
    ``tr.summary`` table row. Per the original author's description this
    is the 25 most recent credit-card transactions plus any pending ones;
    the code itself imposes no such limit.

    Args:
        chase_driver: A WebDriver currently showing a page that contains a
            "See activity" link (presumably the post-login account page).

    Returns:
        A list of lists: one inner list per row, holding the text of each
        ``td`` cell except the first.
    """
    _goto_link(chase_driver, "See activity")
    time.sleep(10)

    def _cell_texts(summary_row):
        cells = summary_row.find_elements_by_tag_name('td')
        # The first cell only holds a link, so it is dropped.
        return [cell.text for cell in cells[1:]]

    summary_rows = chase_driver.find_elements_by_css_selector("tr.summary")
    return [_cell_texts(summary_row) for summary_row in summary_rows]
def get_activity(username, password):
    """For a given username, retrieve recent account activity for a Chase CC.

    Logs in with ``get_chase_amazon_driver``, waits for the post-login page,
    scrapes the rows with ``get_recent_activity_rows`` and always closes the
    browser afterwards.

    Args:
        username: Chase user name used to log in.
        password: Chase password used to log in.

    Returns:
        The list of transaction rows, or ``None`` if scraping raised an
        exception (the exception is printed, not propagated).

    Raises:
        Whatever ``get_chase_amazon_driver`` raises; login failures are not
        caught here.
    """
    rows = None
    d = get_chase_amazon_driver(username, password)
    try:
        # The wait lives inside the try so that the browser is still closed
        # by the finally clause if the pause is interrupted (e.g. Ctrl-C).
        time.sleep(8)
        rows = get_recent_activity_rows(d)
    except Exception as e:
        # Best effort: report the problem and fall through to return None.
        print(e)
    finally:
        d.quit()
    return rows
if __name__ == '__main__':
    # Interactive entry point: prompt for credentials, then print the
    # scraped transaction rows (or None if scraping failed).
    import getpass

    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3: raw_input was renamed to input

    uname = read_line("Username: ")
    # getpass keeps the password from being echoed to the terminal.
    pwd = getpass.getpass()
    print(get_activity(uname, pwd))