diff options
author | Nathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu> | 2014-08-24 14:32:08 -0700 |
---|---|---|
committer | Nathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu> | 2014-08-24 14:32:08 -0700 |
commit | ae35ea0895ae027902105ccfd9702f84e027879f (patch) | |
tree | 0e0a107fc56dc4aa5db1e224358609836fcce03e | |
parent | 6cb280fa7b31ddefe8844bf71ddb32823b435f9f (diff) | |
download | paperbot-ae35ea08.tar.gz paperbot-ae35ea08.zip |
Added looping through the new module-level variable proxy_list; added provisions for a custom_flask_json proxy, which aims to provide a way for remote users to return PDFs.
-rwxr-xr-x[-rw-r--r--] | modules/papers.py | 91 |
1 files changed, 62 insertions, 29 deletions
diff --git a/modules/papers.py b/modules/papers.py index 8117463..7a00bd0 100644..100755 --- a/modules/papers.py +++ b/modules/papers.py @@ -14,6 +14,8 @@ import urllib import pdfparanoia logchannel = os.environ.get("LOGGING", None) +proxy_list = [ {'proxy_url':None,'proxy_type':'normal'}, + {'proxy_url':'http://localhost:8500/plsget', 'proxy_type':'custom_flask_json'} ] def download(phenny, input, verbose=True): """ @@ -100,36 +102,67 @@ def download(phenny, input, verbose=True): if pdf_url: user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11" - headers = { - "User-Agent": user_agent, - } - - response = None - if pdf_url.startswith("https://"): - response = requests.get(pdf_url, headers=headers, verify=False) - else: - response = requests.get(pdf_url, headers=headers) - - # detect failure - if response.status_code != 200: - shurl, _ = modules.scihub.scihubber(pdf_url) - if shurl: - if "libgen" in shurl: - phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"])) - elif "pdfcache" not in shurl: - phenny.say(shurl) + proxies_left_to_try = len(proxy_list) + request_iteration = 0 + while proxies_left_to_try: + headers = { + "User-Agent": user_agent, + } + response = None + proxy_url = proxy_list[proxy_url_index]['proxy_url'] + proxy_type = proxy_list[proxy_url_index]['proxy_type'] + + #perform default behaviour if proxy is None + if proxy_list[proxy_url_index]['proxy_url'] is None: + if pdf_url.startswith("https://"): + response = requests.get(pdf_url, headers=headers, verify=False) else: - phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"])) - return - - data = response.content - - if "pdf" in response.headers["content-type"]: - try: - data = pdfparanoia.scrub(StringIO(data)) - except: - # this is to avoid a PDFNotImplementedError - pass + response = requests.get(pdf_url, headers=headers) + else: + + #check type of proxy + if proxy_type == 
'custom_flask_json': + headers['pdf_url'] = pdf_url + headers['request_iteration'] = request_iteration + response = requests.get(proxy_url, headers=headers) + elif proxy_type == 'normal': + #i'm not even checking if http or https is in the pdf_url, since the default proxy of None is already being tried in this loop + proxies = { + "http": proxy_url, + "https": proxy_url, + } + response = requests.get(pdf_url, headers=headers, proxies=proxies) + + # detect failure + if response.status_code != 200: + shurl, _ = modules.scihub.scihubber(pdf_url) + if shurl: + if "libgen" in shurl: + phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"])) + elif "pdfcache" not in shurl: + phenny.say(shurl) + else: + phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"])) + return + + data = response.content + + if "pdf" in response.headers["content-type"]: + try: + data = pdfparanoia.scrub(StringIO(data)) + break + except: + #check for custom_flask_json proxy response, which indicates if the given custom proxy has more internal proxies to try with + if 'proxies_remaining' in response.headers: + #decrement the index if the custom proxy doesn't have any more internal proxies to try + if response.headers['proxies_remaining'] == 0: + proxies_left_to_try-=1 + else: + #decrement the index to move on to the next proxy in our proxy_list + proxies_left_to_try-=1 + + # this is to avoid a PDFNotImplementedError + pass if item.has_key("DOI"): phenny.say(modules.scihub.libgen(data, item["DOI"])) |