author     Fernando Borretti <eudoxiahp@gmail.com>    2015-01-15 19:16:56 -0200
committer  Fernando Borretti <eudoxiahp@gmail.com>    2015-01-15 19:16:56 -0200
commit     0aa1b19ad07a4098cf86badef931b4bee5a83756 (patch)
tree       cb8a7da51da5c83a5a3a0585d25915b28ed19507
parent     b90dc5b3e893fef1283fbba30d29d47adc38a100 (diff)
download   paperbot-0aa1b19ad07a4098cf86badef931b4bee5a83756.tar.gz
           paperbot-0aa1b19ad07a4098cf86badef931b4bee5a83756.zip
Partial PEP8 fixes to modules.papers
-rwxr-xr-x   modules/papers.py   176
1 file changed, 107 insertions, 69 deletions
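The change is almost entirely mechanical style cleanup rather than behavioural change. As a rough summary of the rewrites applied throughout the file (an illustrative sketch distilled from the hunks below, not an excerpt of them; the variable values are hypothetical):

    # Illustrative summary of the PEP8-style rewrites in this commit.
    item = {"DOI": "10.1000/example", "title": "Example paper"}  # hypothetical data
    line = "https://example.org/paper.pdf"
    proxies_left_to_try = 1

    # dict.has_key() is replaced with the `in` operator:
    #   if item.has_key("DOI"):
    if "DOI" in item:
        pass

    # `not x in y` becomes `x not in y`:
    #   if not "http://" in line:
    if "http://" not in line:
        pass

    # Operators get surrounding whitespace, comments get a space after '#':
    #   proxies_left_to_try-=1   #decrement
    proxies_left_to_try -= 1  # decrement

    # Long calls are wrapped onto continuation lines, and repeated literals are
    # hoisted into module-level constants (PROXY, USER_AGENT).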
diff --git a/modules/papers.py b/modules/papers.py
index f411cb8..84943c9 100755
--- a/modules/papers.py
+++ b/modules/papers.py
@@ -10,24 +10,39 @@ import lxml.etree
 from StringIO import StringIO
 import modules.scihub
 import urllib
-import traceback
+import traceback
 import pdfparanoia
 
 logchannel = os.environ.get("LOGGING", None)
-proxy_list = [ {'proxy_url':None,'proxy_type':'normal'},
-        {'proxy_url':'http://ec2-54-218-13-46.us-west-2.compute.amazonaws.com:8500/plsget', 'proxy_type':'custom_flask_json'} ]
+
+PROXY = 'http://ec2-54-218-13-46.us-west-2.compute.amazonaws.com:8500/plsget'
+USER_AGENT = 'Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
+
+proxy_list = [
+    {
+        'proxy_url': None,
+        'proxy_type': 'normal'},
+    {
+        'proxy_url': PROXY,
+        'proxy_type': 'custom_flask_json'
+    }
+]
+
+
 def nullLog(msg):
     pass
 
+
 class paperbot_download_request(object):
-    _log=nullLog
+    _log = nullLog
+
     def get(self, pdf_url, use_generator=False, **kwargs):
         proxies_left_to_try = len(proxy_list)
         extension = ".txt"
         request_iteration = 0
         proxy_url_index = 0
-        user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
+        user_agent = USER_AGENT
         headers = {
             "User-Agent": user_agent,
         }
@@ -38,32 +53,37 @@ class paperbot_download_request(object):
             proxy_type = proxy_list[proxy_url_index]['proxy_type']
             _log('proxies_left_to_try: %d proxy_url_index %d' % (proxies_left_to_try, proxy_url_index))
             _log('request_iteration: %d' % request_iteration)
-            #perform default behaviour if proxy is None
+            # perform default behaviour if proxy is None
            if proxy_list[proxy_url_index]['proxy_url'] is None:
                 if pdf_url.startswith("https://"):
                     response = requests.get(pdf_url, verify=False, **kwargs)
                 else:
                     response = requests.get(pdf_url, **kwargs)
             else:
-                #check type of proxy
+                # check type of proxy
                 if proxy_type == 'custom_flask_json':
-                    data = {'pdf_url' : pdf_url,
-                            'headers' : kwargs.get('headers', None),
-                            'request_iteration' : request_iteration
-                           }
-
+                    data = {
+                        'pdf_url': pdf_url,
+                        'headers': kwargs.get('headers', None),
+                        'request_iteration': request_iteration
+                    }
+
                     headers["Content-Type"] = "application/json"
-
+
                     _log('trying custom_flask_json, proxy_url %s' % proxy_url)
-                    response = requests.get(proxy_url, data=json.dumps(data), headers=headers)
+                    response = requests.get(proxy_url, data=json.dumps(data),
+                                            headers=headers)
                 elif proxy_type == 'normal':
-                    #i'm not even checking if http or https is in the pdf_url, since the default proxy of None is already being tried in this loop
-                    proxies = {
-                        "http": proxy_url,
-                        "https": proxy_url,
+                    # i'm not even checking if http or https is in the pdf_url,
+                    # since the default proxy of None is already being tried in
+                    # this loop
+                    proxies = {
+                        "http": proxy_url,
+                        "https": proxy_url,
                     }
                     headers = kwargs.get('headers', None)
-                    #I don't know if passing None or {} for headers is bad, so I put this if:
+                    # I don't know if passing None or {} for headers is bad, so
+                    # I put this if:
                     if headers is not None:
                         response = requests.get(pdf_url, headers=headers, proxies=proxies)
                     else:
@@ -75,32 +95,34 @@ class paperbot_download_request(object):
             if "pdf" in response.headers["content-type"]:
                 extension = ".pdf"
                 _log('yielding tuply with PDF in response')
-                #yield (response, extension)
-                proxies_left_to_try=0
+                # yield (response, extension)
+                proxies_left_to_try = 0
                 break
-                #return
 
             if 'proxies_remaining' in response.headers:
                 _log('proxies_remaining in headers: %s' % response.headers['proxies_remaining'])
-                #decrement the index if the custom proxy doesn't have any more internal proxies to try
+                # decrement the index if the custom proxy doesn't have any more
+                # internal proxies to try
                 if response.headers['proxies_remaining'] == 0 or response.headers['proxies_remaining'] == '0':
-                    proxies_left_to_try-=1
-                    request_iteration=0
-                    proxy_url_index+=1
+                    proxies_left_to_try -= 1
+                    request_iteration = 0
+                    proxy_url_index += 1
                 else:
                     _log('request_iteration+=1')
-                    request_iteration+=1
+                    request_iteration += 1
 
-            else:
-                #decrement the index to move on to the next proxy in our proxy_list
-                proxies_left_to_try-=1
-                request_iteration=0
-                proxy_url_index+=1
+            else:
+                # decrement the index to move on to the next proxy in our
+                # proxy_list
+                proxies_left_to_try -= 1
+                request_iteration = 0
+                proxy_url_index += 1
 
         if use_generator:
             return
         _log('last yield in paperbot_download_request')
         yield (response, extension)
-
+
 
 def download(phenny, input, verbose=True):
     """
@@ -136,7 +158,8 @@ def download(phenny, input, verbose=True):
     line = line.strip()
 
     # don't bother if there's nothing there
-    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
+    if len(line) < 5 or ("http://" not in line and "https://" not in line) or \
+            not line.startswith("http"):
         return
     for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
         # fix an UnboundLocalError problem
@@ -169,18 +192,19 @@ def download(phenny, input, verbose=True):
         item = content[0]
         title = item["title"]
 
-        if item.has_key("DOI"):
+        if "DOI" in item:
             _log("Translator DOI")
-            lgre = requests.post("http://libgen.org/scimag/librarian/form.php", data={"doi":item["DOI"]})
+            lgre = requests.post("http://libgen.org/scimag/librarian/form.php",
+                                 data={"doi": item["DOI"]})
             tree = parse_html(lgre.content)
             if tree.xpath("//h1")[0].text != "No file selected":
                 phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
                 return
 
-        if item.has_key("attachments"):
+        if "attachments" in item:
             pdf_url = None
             for attachment in item["attachments"]:
-                if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
+                if "mimeType" in attachment and "application/pdf" in attachment["mimeType"]:
                     pdf_url = attachment["url"]
                     break
@@ -209,17 +233,17 @@ def download(phenny, input, verbose=True):
                 #check type of proxy
                 if proxy_type == 'custom_flask_json':
-
+
                     headers["Content-Type"] = "application/json"
                     data = {'pdf_url' : pdf_url,
                             'request_iteration' : request_iteration
                            }
-
+
                     request_iteration+=1
                     response = requests.get(proxy_url, data=json.dumps(data), headers=headers)
                 elif proxy_type == 'normal':
                     #i'm not even checking if http or https is in the pdf_url, since the default proxy of None is already being tried in this loop
-                    proxies = {
+                    proxies = {
                         "http": proxy_url,
                         "https": proxy_url,
                     }
@@ -228,7 +252,7 @@ def download(phenny, input, verbose=True):
             paperbot_download_request_obj = paperbot_download_request()
             paperbot_download_request_obj._log = _log
             gen = paperbot_download_request_obj.get(pdf_url, use_generator=False, headers=headers)
-            #this is stupidly ugly
+            # this is stupidly ugly
             for genresponse in gen:
                 response, extension = genresponse
 
@@ -251,7 +275,8 @@ def download(phenny, input, verbose=True):
             data = pdfparanoia.scrub(StringIO(data))
             try:
                 _log('after pdfparanoia.scrub')
-                requests.get('http://localhost:8500/remoteprint', headers={'msg':'after pdfparanoia.scrub'})
+                requests.get('http://localhost:8500/remoteprint',
+                             headers={'msg': 'after pdfparanoia.scrub'})
             except:
                 pass
             break
@@ -264,7 +289,7 @@ def download(phenny, input, verbose=True):
                     proxies_left_to_try-=1
                     proxy_url_index+=1
                     request_iteration=0
-            else:
+            else:
                 #decrement the index to move on to the next proxy in our proxy_list
                 proxies_left_to_try-=1
                 proxy_url_index+=1
@@ -272,7 +297,7 @@ def download(phenny, input, verbose=True):
             # this is to avoid a PDFNotImplementedError
             pass
 
-        if item.has_key("DOI"):
+        if "DOI" in item:
             phenny.say(modules.scihub.libgen(data, item["DOI"]))
             return
@@ -311,9 +336,12 @@ def download(phenny, input, verbose=True):
         _log("Scihubber -> (%s, %s)" % (shurl, doi))
         if shurl:
             if "pdfcache" in shurl:
-                if doi: phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), doi))
-                else: phenny.say(download_url(shurl, _log, cookies=modules.scihub.shcookie))
-            else: phenny.say(shurl)
+                if doi:
+                    phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), doi))
+                else:
+                    phenny.say(download_url(shurl, _log, cookies=modules.scihub.shcookie))
+            else:
+                phenny.say(shurl)
         elif verbose and explicit:
             _log("All approaches failed")
             phenny.say(download_url(line, _log))
@@ -334,20 +362,21 @@ def download_ieee(url):
     # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
     raise NotImplementedError
 
+
 def download_url(url, _log=nullLog, **kwargs):
     paperbot_download_request_obj = paperbot_download_request()
     paperbot_download_request_obj._log = _log
     response_generator = paperbot_download_request_obj.get(url, use_generator=True, headers={"User-Agent": "origami-pdf"})
-    cc=0
+    cc = 0
     for response in response_generator:
         _log('using generator for %s time' % cc)
-        cc+=1
+        cc += 1
     paperbot_download_request_obj2 = paperbot_download_request()
     paperbot_download_request_obj2._log = _log
     content = response.content
-    #response = requests.get(url, headers={"User-Agent": "origami-pdf"}, **kwargs)
-    #content = response.content
-
+    # response = requests.get(url, headers={"User-Agent": "origami-pdf"}, **kwargs)
+    # content = response.content
+
     # just make up a default filename
     title = "%0.2x" % random.getrandbits(128)
@@ -374,7 +403,7 @@ def download_url(url, _log=nullLog, **kwargs):
     # citation_title = ...
 
     # wow, this seriously needs to be cleaned up
-    if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
+    if citation_pdf_url and citation_title and "ieeexplore.ieee.org" not in citation_pdf_url:
         citation_title = citation_title.encode("ascii", "ignore")
         response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
         content = response.content
@@ -382,24 +411,26 @@ def download_url(url, _log=nullLog, **kwargs):
         extension = ".pdf"
         title = citation_title
     else:
-        if "sciencedirect.com" in url and not "ShoppingCart" in url:
+        if "sciencedirect.com" in url and "ShoppingCart" not in url:
             _log('download_url got a sciencedirect URL')
             try:
                 try:
                     title = tree.xpath("//h1[@class='svTitle']")[0].text
                     pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
-                except IndexError:
+                except IndexError:
                     title = tree.xpath("//title")[0].text
                     pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
-
+
                 if 'http' not in pdf_url:
                     main_url_split = response.url.split('//')
                     http_prefix = main_url_split[0]
                     if 'http' in http_prefix:
                         domain_url = main_url_split[1].split('/')[0]
-                        pdf_url = http_prefix + '//' + domain_url + ('/' if pdf_url[0]!='/' else '') + pdf_url
-                gen = paperbot_download_request_obj2.get(pdf_url, use_generator=False, headers={"User-Agent": "sdf-macross"})
-                #this is stupidly ugly
+                        pdf_url = http_prefix + '//' + domain_url + ('/' if pdf_url[0] != '/' else '') + pdf_url
+                gen = paperbot_download_request_obj2.get(pdf_url,
+                                                         use_generator=False,
+                                                         headers={"User-Agent": "sdf-macross"})
+                # this is stupidly ugly
                 for genresponse in gen:
                     new_response, extension = genresponse
                     new_content = new_response.content
@@ -430,10 +461,10 @@ def download_url(url, _log=nullLog, **kwargs):
         # get the document id
         document_id = None
         if url[-1] != "/":
-            #if "stable/" in url:
-            #elif "discover/" in url:
-            #elif "action/showShelf?candidate=" in url:
-            #elif "pss/" in url:
+            # if "stable/" in url:
+            # elif "discover/" in url:
+            # elif "action/showShelf?candidate=" in url:
+            # elif "pss/" in url:
             document_id = url.split("/")[-1]
 
         if document_id.isdigit():
@@ -482,7 +513,7 @@ def download_url(url, _log=nullLog, **kwargs):
             pass
     else:
         if pdf_url.startswith("/"):
-            url_start = url[:url.find("/",8)]
+            url_start = url[:url.find("/", 8)]
             pdf_url = url_start + pdf_url
         response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
         content = response.content
@@ -517,6 +548,7 @@ def download_url(url, _log=nullLog, **kwargs):
 
     return url
 
+
 def parse_html(content):
     if not isinstance(content, StringIO):
         content = StringIO(content)
@@ -524,22 +556,25 @@ def parse_html(content):
     tree = lxml.etree.parse(content, parser)
     return tree
 
+
 def check_if_html(response):
     return "text/html" in response.headers["content-type"]
 
+
 def find_citation_pdf_url(tree, url):
     """
     Returns the <meta name="citation_pdf_url"> content attribute.
     """
     citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
-    if citation_pdf_url and not citation_pdf_url.startswith("http"):
+    if citation_pdf_url and not citation_pdf_url.startswith("http"):
         if citation_pdf_url.startswith("/"):
-            url_start = url[:url.find("/",8)]
+            url_start = url[:url.find("/", 8)]
             citation_pdf_url = url_start + citation_pdf_url
         else:
             raise Exception("unhandled situation (citation_pdf_url)")
     return citation_pdf_url
 
+
 def find_citation_title(tree):
     """
     Returns the <meta name="citation_title"> content attribute.
@@ -547,6 +582,7 @@ def find_citation_title(tree):
     citation_title = extract_meta_content(tree, "citation_title")
     return citation_title
 
+
 def extract_meta_content(tree, meta_name):
     try:
         content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
@@ -555,6 +591,7 @@ def extract_meta_content(tree, meta_name):
     else:
         return content
 
+
 def filter_fix(url):
     """
     Fixes some common problems in urls.
@@ -563,6 +600,7 @@ def filter_fix(url):
     url = url.replace(".proxy.lib.pdx.edu", "")
     return url
 
+
 def fix_ieee_login_urls(url):
     """
    Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the
@@ -588,12 +626,12 @@ def fix_ieee_login_urls(url):
     # default case when things go wrong
     return url
 
+
 def fix_jstor_pdf_urls(url):
     """
     Fixes urls pointing to jstor pdfs.
     """
     if "jstor.org/" in url:
-        if ".pdf" in url and not "?acceptTC=true" in url:
+        if ".pdf" in url and "?acceptTC=true" not in url:
             url += "?acceptTC=true"
 
     return url
-