""" Extract PDF from IEEE Xplore embeds and write to given path Set PDF_DUMP_DIR to wherever PDFs should be saved (named as .pdf) to run script -> 'python ieexplore.py ' (ID can be determined from the 'arnumber' argument on the article's IEEE Xplore landing page) Please note - this script was only written because of borked PDF embeds in Ubuntu - (writing this was faster than fixing) - for most people it will be faster than nothing... Luke Hutton - 2011 """ import urllib2, cookielib, sys, shutil, os from xml.dom.minidom import parseString, parse PDF_DUMP_DIR = '%s/Documents/papers' % os.getenv('HOME') paper_id = sys.argv[1] embed_url = 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % paper_id cj = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) response = opener.open(embed_url) #print response.read() for line in response: if ".pdf" in line: stripline = line.rsplit('"') pdf_url = stripline[1] pdf_response = opener.open(pdf_url) local_filename = os.path.join(PDF_DUMP_DIR, "%s.pdf" % paper_id) local_pdf = open(local_filename, "w") local_pdf.write(pdf_response.read()) local_pdf.close() print "Successfully written %s" % local_filename sys.exit() print "Could not extract PDF - check ID was correct and URL format hasn't changed"