summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu>2014-08-24 14:32:08 -0700
committerNathan McCorkle <nmz787@bryan.svcs.cs.pdx.edu>2014-08-24 14:32:08 -0700
commitae35ea0895ae027902105ccfd9702f84e027879f (patch)
tree0e0a107fc56dc4aa5db1e224358609836fcce03e
parent6cb280fa7b31ddefe8844bf71ddb32823b435f9f (diff)
downloadpaperbot-ae35ea08.tar.gz
paperbot-ae35ea08.zip
Added a loop over the new module-level variable proxy_list, and added provisions for a custom_flask_json proxy type, which aims to provide a way for remote users to return PDFs.
-rwxr-xr-x[-rw-r--r--]modules/papers.py91
1 files changed, 62 insertions, 29 deletions
diff --git a/modules/papers.py b/modules/papers.py
index 8117463..7a00bd0 100644..100755
--- a/modules/papers.py
+++ b/modules/papers.py
@@ -14,6 +14,8 @@ import urllib
import pdfparanoia
logchannel = os.environ.get("LOGGING", None)
+proxy_list = [ {'proxy_url':None,'proxy_type':'normal'},
+ {'proxy_url':'http://localhost:8500/plsget', 'proxy_type':'custom_flask_json'} ]
def download(phenny, input, verbose=True):
"""
@@ -100,36 +102,67 @@ def download(phenny, input, verbose=True):
if pdf_url:
user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
- headers = {
- "User-Agent": user_agent,
- }
-
- response = None
- if pdf_url.startswith("https://"):
- response = requests.get(pdf_url, headers=headers, verify=False)
- else:
- response = requests.get(pdf_url, headers=headers)
-
- # detect failure
- if response.status_code != 200:
- shurl, _ = modules.scihub.scihubber(pdf_url)
- if shurl:
- if "libgen" in shurl:
- phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
- elif "pdfcache" not in shurl:
- phenny.say(shurl)
+ proxies_left_to_try = len(proxy_list)
+ request_iteration = 0
+ while proxies_left_to_try:
+ headers = {
+ "User-Agent": user_agent,
+ }
+ response = None
+ proxy_url = proxy_list[proxy_url_index]['proxy_url']
+ proxy_type = proxy_list[proxy_url_index]['proxy_type']
+
+ #perform default behaviour if proxy is None
+ if proxy_list[proxy_url_index]['proxy_url'] is None:
+ if pdf_url.startswith("https://"):
+ response = requests.get(pdf_url, headers=headers, verify=False)
else:
- phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"]))
- return
-
- data = response.content
-
- if "pdf" in response.headers["content-type"]:
- try:
- data = pdfparanoia.scrub(StringIO(data))
- except:
- # this is to avoid a PDFNotImplementedError
- pass
+ response = requests.get(pdf_url, headers=headers)
+ else:
+
+ #check type of proxy
+ if proxy_type == 'custom_flask_json':
+ headers['pdf_url'] = pdf_url
+ headers['request_iteration'] = request_iteration
+ response = requests.get(proxy_url, headers=headers)
+ elif proxy_type == 'normal':
+ #i'm not even checking if http or https is in the pdf_url, since the default proxy of None is already being tried in this loop
+ proxies = {
+ "http": proxy_url,
+ "https": proxy_url,
+ }
+ response = requests.get(pdf_url, headers=headers, proxies=proxies)
+
+ # detect failure
+ if response.status_code != 200:
+ shurl, _ = modules.scihub.scihubber(pdf_url)
+ if shurl:
+ if "libgen" in shurl:
+ phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
+ elif "pdfcache" not in shurl:
+ phenny.say(shurl)
+ else:
+ phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"]))
+ return
+
+ data = response.content
+
+ if "pdf" in response.headers["content-type"]:
+ try:
+ data = pdfparanoia.scrub(StringIO(data))
+ break
+ except:
+ #check for custom_flask_json proxy response, which indicates if the given custom proxy has more internal proxies to try with
+ if 'proxies_remaining' in response.headers:
+ #decrement the index if the custom proxy doesn't have any more internal proxies to try
+ if response.headers['proxies_remaining'] == 0:
+ proxies_left_to_try-=1
+ else:
+ #decrement the index to move on to the next proxy in our proxy_list
+ proxies_left_to_try-=1
+
+ # this is to avoid a PDFNotImplementedError
+ pass
if item.has_key("DOI"):
phenny.say(modules.scihub.libgen(data, item["DOI"]))