Webtrends API extraction — pulls one day of report data at a time from the Webtrends REST API and writes each day's table to its own CSV file.
import csv
import datetime
import gzip
import pdb
import StringIO
import time
from datetime import date

from lxml import etree
from mechanize import Browser

extractingtoday = False
#extractdate = "2014m06d19"
oneday = datetime.timedelta(days=1)
d = date(2014, 10, 8)

br = Browser()
br.add_password("https://ws.webtrends.com", "[username]", "[password]")
br.addheaders = [
    ('Accept-Encoding', 'gzip'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'),
    ('Cache-Control', 'max-age=0'),
]

while not extractingtoday:
    print d
    dt = d.timetuple()
    extractdate = str(dt[0]) + "m" + str(dt[1]) + "d" + str(dt[2])
    QueryString = ("[querystring]" + extractdate + "&end_period=" + extractdate +
                   "&period_type=agg&measures=0*1&format=html&suppress_error_codes=true")
    #pdb.set_trace()
    ## start debug

    # Retry until the request succeeds, e.g. after a transient network error.
    query = None
    while query is None:
        try:
            query = br.open(QueryString)
        except Exception:
            pass

    # The response body is gzip-compressed (requested via the Accept-Encoding header).
    compressedstream = StringIO.StringIO(query.read())
    gzipper = gzip.GzipFile(fileobj=compressedstream)
    unzippeddata = gzipper.read()

    # Parse the HTML report; recover=True lets lxml tolerate malformed markup.
    parser = etree.XMLParser(recover=True)
    data = unzippeddata.decode('utf-8')
    tree = etree.fromstring(data.encode('utf-8'), parser=parser)

    # Collect the table cells, skipping the first two (header) rows.
    output_data = []
    rowcount = 0
    for elem_row in tree[1].findall('tr'):
        if rowcount > 1:
            row = []
            for elem_cell in elem_row.findall('td'):
                if elem_cell.text is None:
                    row.append(elem_cell.text)
                else:
                    row.append(elem_cell.text.encode('utf-8'))
            output_data.append(row)
        rowcount = rowcount + 1

    # Write one CSV file per extracted day, named after the extract date.
    with open(extractdate + '.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        for t in output_data:
            csvwriter.writerow(t)
    ## end debug

    d = d + oneday
    if d == date.today():
        extractingtoday = True
    #time.sleep(40)

br.close()
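For anyone running this under Python 3, where mechanize, StringIO, and the print statement above no longer apply, here is a minimal sketch of the same daily loop, assuming the requests library is installed (requests decompresses gzip responses itself, so the StringIO/GzipFile step disappears). The [querystring], [username], and [password] placeholders are the same as in the script above, and the .//tr selector is an assumption standing in for the tree[1] indexing used there:

import csv
from datetime import date, timedelta

import requests          # assumed available; handles gzip decompression itself
from lxml import html

d = date(2014, 10, 8)
while d < date.today():
    extractdate = "%dm%dd%d" % (d.year, d.month, d.day)
    url = ("[querystring]" + extractdate + "&end_period=" + extractdate +
           "&period_type=agg&measures=0*1&format=html&suppress_error_codes=true")

    # Same placeholder credentials as above, sent as HTTP basic auth.
    resp = requests.get(url, auth=("[username]", "[password]"))
    resp.raise_for_status()

    # Parse the HTML report and keep everything after the two header rows.
    tree = html.fromstring(resp.content)
    rows = [[cell.text_content() for cell in tr.findall("td")]
            for tr in tree.findall(".//tr")[2:]]

    with open(extractdate + ".csv", "w", newline="") as f:
        csv.writer(f, dialect="excel").writerows(rows)

    d += timedelta(days=1)

As in the original, this produces one file per day (e.g. 2014m10d8.csv); the retry loop around the request was left out for brevity.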