"""Download newsletters from Intellicontact. """ import os import csv import time import traceback from os.path import isfile from BeautifulSoup import BeautifulSoup, Tag from pywinauto.application import Application BASE = r'c:\workbench\intellicontact' FIREFOX = r"c:\program files\Mozilla Firefox\Firefox.exe %s" URL = "http://app.intellicontact.com/icp/core/message/compose?message_id=%s" RAW = BASE + r'\raw\%s-%s.html' EXTRACTED = BASE + r'\extracted\%s-%s.htm' reader = csv.reader(open('newsletters.csv')) headers = reader.next() for mid, postdate in reader: year, month, day = tuple(postdate.split('-')) url = URL % mid save_as = RAW % (month, day) extracted = EXTRACTED % (month, day) if isfile(save_as) and isfile(extracted): continue print url # Save the file from Firefox. # =========================== if not isfile(save_as): print ' =>', save_as app = Application().start_(FIREFOX % url) time.sleep(3) if not app.windows_(): # Firefox wasn't already running; we launched it app = Application().connect_(title_re=".* - Mozilla Firefox") firefox = app.window_(title_re=".* - Mozilla Firefox") firefox.TypeKeys("%FA") # File -> Save As while app.SaveAs.FileNameEdit.TextBlock() != save_as: app.SaveAs.FileNameEdit.SetEditText(save_as) app.SaveAs.TypeKeys("%S") # "Save" button firefox.TypeKeys("%FC") # File -> Close Tab # Wait for the file to appear before doing the next one. # ====================================================== while 1: if isfile(save_as): break time.sleep(0.2) # Post-process # ============ if not isfile(extracted): print ' =>', extracted soup = BeautifulSoup(open(save_as)) content_type = Tag(soup, 'meta') content_type['http-equiv'] = 'Content-Type' content_type['content'] = 'text/html; charset=utf-8' try: text = soup.textarea.string except: print " => ERROR FINDING CONTENT" traceback.print_exc() continue text = text.replace('<', '<') text = text.replace('>', '>') text = text.replace('&', '&') text = text.replace('"', '"') soup = BeautifulSoup(text) try: soup.head.insert(0, content_type) except: print " => ERROR INSERTING Content-Type" traceback.print_exc() else: print " => SUCCESS!" open(extracted, 'w+').write(soup.prettify()) print