Added a loop over the new module-level proxy_list, and added provisions for a custom_flask_json proxy type, which aims to give remote users a way to return PDFs.

author: Nathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu> 2014-08-24 14:32:08 -0700
committer: Nathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu> 2014-08-24 14:32:08 -0700
commit: ae35ea0895ae027902105ccfd9702f84e027879f (patch)
tree: 0e0a107fc56dc4aa5db1e224358609836fcce03e
parent: 6cb280fa7b31ddefe8844bf71ddb32823b435f9f (diff)
download: paperbot-ae35ea08.tar.gz
paperbot-ae35ea08.zip
1 files changed, 62 insertions, 29 deletions
diff --git a/modules/papers.py b/modules/papers.py
index 8117463..7a00bd0 100644..100755
--- a/modules/papers.py
+++ b/modules/papers.py
@@ -14,6 +14,8 @@ import urllib
 import pdfparanoia
 
 logchannel = os.environ.get("LOGGING", None)
+proxy_list = [  {'proxy_url':None,'proxy_type':'normal'},
+                {'proxy_url':'http://localhost:8500/plsget', 'proxy_type':'custom_flask_json'} ]
 
 def download(phenny, input, verbose=True):
     """
@@ -100,36 +102,67 @@ def download(phenny, input, verbose=True):
                 if pdf_url:
                     user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
 
-                    headers = {
-                        "User-Agent": user_agent,
-                    }
-
-                    response = None
-                    if pdf_url.startswith("https://"):
-                        response = requests.get(pdf_url, headers=headers, verify=False)
-                    else:
-                        response = requests.get(pdf_url, headers=headers)
-
-                    # detect failure
-                    if response.status_code != 200:
-                        shurl, _ = modules.scihub.scihubber(pdf_url)
-                        if shurl:
-                            if "libgen" in shurl:
-                                phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
-                            elif "pdfcache" not in shurl:
-                                phenny.say(shurl)
+                    proxies_left_to_try = len(proxy_list)
+                    request_iteration = 0
+                    while proxies_left_to_try:
+                        headers = {
+                            "User-Agent": user_agent,
+                        }
+                        response = None
+                        proxy_url = proxy_list[proxy_url_index]['proxy_url']
+                        proxy_type = proxy_list[proxy_url_index]['proxy_type']
+
+                        #perform default behaviour if proxy is None
+                        if proxy_list[proxy_url_index]['proxy_url'] is None:
+                            if pdf_url.startswith("https://"):
+                                response = requests.get(pdf_url, headers=headers, verify=False)
                             else:
-                                phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"]))
-                        return
-
-                    data = response.content
-
-                    if "pdf" in response.headers["content-type"]:
-                        try:
-                            data = pdfparanoia.scrub(StringIO(data))
-                        except:
-                            # this is to avoid a PDFNotImplementedError
-                            pass
+                                response = requests.get(pdf_url, headers=headers)
+                        else:
+
+                            #check type of proxy
+                            if proxy_type == 'custom_flask_json':
+                                headers['pdf_url'] = pdf_url
+                                headers['request_iteration'] = request_iteration
+                                response = requests.get(proxy_url, headers=headers)
+                            elif proxy_type == 'normal':
+                                #i'm not even checking if http or https is in the pdf_url, since the default proxy of None is already being tried in this loop
+                                proxies = { 
+                                  "http": proxy_url,
+                                  "https": proxy_url,
+                                }
+                                response = requests.get(pdf_url, headers=headers, proxies=proxies)
+
+                        # detect failure
+                        if response.status_code != 200:
+                            shurl, _ = modules.scihub.scihubber(pdf_url)
+                            if shurl:
+                                if "libgen" in shurl:
+                                    phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
+                                elif "pdfcache" not in shurl:
+                                    phenny.say(shurl)
+                                else:
+                                    phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"]))
+                            return
+
+                        data = response.content
+
+                        if "pdf" in response.headers["content-type"]:
+                            try:
+                                data = pdfparanoia.scrub(StringIO(data))
+                                break
+                            except:
+                                #check for custom_flask_json proxy response, which indicates if the given custom proxy has more internal proxies to try with
+                                if 'proxies_remaining' in response.headers:
+                                    #decrement the index if the custom proxy doesn't have any more internal proxies to try
+                                    if response.headers['proxies_remaining'] == 0:
+                                        proxies_left_to_try-=1    
+                                else:    
+                                    #decrement the index to move on to the next proxy in our proxy_list
+                                    proxies_left_to_try-=1
+
+                                # this is to avoid a PDFNotImplementedError
+                                pass
 
                     if item.has_key("DOI"):
                         phenny.say(modules.scihub.libgen(data, item["DOI"]))
author	Nathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu>	2014-08-24 14:32:08 -0700
committer	Nathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu>	2014-08-24 14:32:08 -0700
commit	ae35ea0895ae027902105ccfd9702f84e027879f (patch)
tree	0e0a107fc56dc4aa5db1e224358609836fcce03e
parent	6cb280fa7b31ddefe8844bf71ddb32823b435f9f (diff)
download	paperbot-ae35ea08.tar.gz paperbot-ae35ea08.zip