"""Given an HTML-barf listing, generate a CSV. """ import csv from datetime import date from BeautifulSoup import BeautifulSoup soup1 = BeautifulSoup(open('1-50.htm')) soup2 = BeautifulSoup(open('51-89.htm')) months = dict() months['Jan'] = 1 months['Feb'] = 2 months['Mar'] = 3 months['Apr'] = 4 months['May'] = 5 months['Jun'] = 6 months['Jul'] = 7 months['Aug'] = 8 months['Sep'] = 9 months['Oct'] = 10 months['Nov'] = 11 months['Dec'] = 12 want1 = [5, 9, 12, 16, 19, 26, 29, 34, 37, 41, 43] want2 = [1, 3, 10, 16, 20, 24, 27, 30, 32, 34, 35, 36, 37] out = csv.writer(open('newsletters.csv', 'w+')) out.writerow(['mid', 'postdate']) for (soup, want) in ((soup1, want1), (soup2, want2)): for row in soup('tr', bgcolor=['#E0E0E0', '#EAEAEA']): cols = row('td') rownum = int(cols[0].font.string) if rownum not in want: continue mid = row['id'][3:] # id="row9999999" month, day, year = [x.strip(',') for x in cols[4].font.string.split()] postdate = date(int(year), months[month], int(day)) out.writerow([mid, postdate])