Scraping your banking data with Selenium

While working on Miser, I wanted programmatic access to my most recent credit-card transactions. Since Chase doesn’t have an API for this, I had to resort to Selenium, a package for programmatic browser control.

I was fairly impressed when I first ran the WebDriver demo — the browser was being automatically controlled by a Python script, and I could see the whole thing happening. The possible applications are numerous and exciting.

Anyway, I’ve come up with a working scraper, included below and also available in this gist. Although it is specific to Chase accounts, it should give you a good idea of how to scrape your own data from the financial institution of your choice.

from selenium import webdriver
import time


def get_chase_amazon_driver(username, password):
    """Return a selenium driver that has submitted the Chase login form.

    Opens a Firefox window, loads the Chase home page, types the
    credentials into the login form and submits it. The function returns
    right after submitting; it does not check that the login succeeded.

    Args:
        username: Value typed into the ``usr_name`` field.
        password: Value typed into the ``usr_password`` field.

    Returns:
        The Firefox WebDriver instance. The caller owns it and is
        responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised while loading the page or filling the form;
        the browser is closed before the exception is re-raised.
    """
    driver = webdriver.Firefox()
    try:
        # Load the login form over HTTPS so the page that receives the
        # credentials cannot be tampered with in transit.
        driver.get("https://www.chase.com")

        # Crude fixed pause to give the login form time to render.
        time.sleep(2)

        user_field = driver.find_element_by_id("usr_name")
        user_field.send_keys(username)

        password_field = driver.find_element_by_id("usr_password")
        password_field.send_keys(password)

        password_field.submit()
    except BaseException:
        # The caller never receives the driver on failure, so close the
        # browser here instead of leaking the window, then propagate.
        driver.quit()
        raise
    return driver


def _goto_link(driver, text):
    """Load the href of the link whose text contains ``text``.

    The link element is not clicked; its ``href`` attribute is read and
    passed to ``driver.get``.
    """
    href = driver.find_element_by_partial_link_text(text).get_attribute('href')
    driver.get(href)


def get_recent_activity_rows(chase_driver):
    """Scrape the transaction rows from the card's activity page.

    Follows the "See activity" link, pauses for the page to load, then
    reads every ``tr.summary`` row. Per the original author's description
    this is the 25 most recent CC transactions, plus any pending
    transactions.

    Returns:
        A list of lists containing the cell text of each row of the Chase
        transaction list, with the first cell of every row left out.
    """
    _goto_link(chase_driver, "See activity")
    time.sleep(10)

    summary_rows = chase_driver.find_elements_by_css_selector("tr.summary")

    # The first cell only holds a link, so it is sliced off each row.
    return [
        [cell.text for cell in summary_row.find_elements_by_tag_name('td')[1:]]
        for summary_row in summary_rows
    ]


def get_activity(username, password):
    """For a given username, retrieve recent account activity for
    a Chase CC.

    Logs in via ``get_chase_amazon_driver``, scrapes the rows with
    ``get_recent_activity_rows`` and always closes the browser afterwards.

    Args:
        username: Login user name passed to ``get_chase_amazon_driver``.
        password: Login password passed to ``get_chase_amazon_driver``.

    Returns:
        The rows returned by ``get_recent_activity_rows``, or ``None`` if
        scraping raised an exception (the exception is printed rather
        than re-raised).
    """
    rows = None
    driver = get_chase_amazon_driver(username, password)

    try:
        # Pause inside the ``try`` so that an interrupt during the wait
        # still reaches ``finally`` and closes the browser.
        time.sleep(8)
        rows = get_recent_activity_rows(driver)
    except Exception as e:
        # Best-effort scrape: report the problem and return None.
        # ``as`` and ``print(...)`` parse on both Python 2.6+ and Python 3.
        print(e)
    finally:
        driver.quit()

    return rows

if __name__ == '__main__':
    # Script entry point: prompt for credentials and print the scraped rows.
    import getpass

    # Python 2 spells the line-reading prompt ``raw_input``; Python 3
    # renamed it to ``input``. Pick whichever one exists.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    uname = read_line("Username: ")
    # getpass reads the password without echoing it to the terminal.
    pwd = getpass.getpass()

    print(get_activity(uname, pwd))