diff options
author | U-ACORNSYS\pwang <pwang@pwang01.acornsys.com> | 2013-07-06 01:38:17 -0500 |
---|---|---|
committer | U-ACORNSYS\pwang <pwang@pwang01.acornsys.com> | 2013-07-06 01:38:17 -0500 |
commit | 27b25beeac0ec8d3642ab0854c4265ede6fc2383 (patch) | |
tree | 929e8cbb985edbc2f2c21df01fc6ab7964883760 | |
parent | 64a0ae876de8b8382d047af7ba52f519e139f18d (diff) | |
download | paperbot-27b25bee.tar.gz paperbot-27b25bee.zip |
lxml etree instead of silly things
-rw-r--r-- | modules/scihub.py | 55 |
1 files changed, 19 insertions, 36 deletions
```diff
diff --git a/modules/scihub.py b/modules/scihub.py
index 079dc64..3f9f10c 100644
--- a/modules/scihub.py
+++ b/modules/scihub.py
@@ -6,6 +6,8 @@ import requests
 from HTMLParser import HTMLParser
 from urlparse import urlparse
 import itertools
+from lxml import etree
+from StringIO import StringIO
 import urllib
 import base64
 import os
@@ -21,14 +23,11 @@ def libgen(url, doi, **kwargs):
     auth_ = requests.auth.HTTPBasicAuth("genesis", "upload")
     re = requests.get(url, **kwargs)
     payload = "data:application/pdf;base64," + base64.b64encode(re.content)
-    re = requests.get("http://libgen.org/scimag/librarian/form.php", auth = auth_,
-        files = {"uploadedfile":("derp.pdf", payload)}, data = {"doi": doi})
-    formp = []
-    class FormP(HTMLParser):
-        def handle_starttag(self, tag, attr):
-            if tag == "input":
-                d = dict(attr); form.append((d[name], d[value]))
-    re = requests.get("http://libgen.org/scimag/librarian/register.php", data = dict(formp), auth = auth_)
+    re = requests.post("http://libgen.org/scimag/librarian/form.php", auth = auth_,
+        files = {"uploadedfile":("derp.pdf", re.content)}, data = {"doi": doi})
+    shu = etree.parse(StringIO(re.text), etree.HTMLParser())
+    formp = dict(map(lambda x: (x.get("name"), x.get("value")), tr.xpath("//input[@name]")))
+    re = requests.get("http://libgen.org/scimag/librarian/register.php", data = formp, auth = auth_)
     return "http://libgen.org/scimag5/" + doi
 
 def scihubber(url, **kwargs):
@@ -43,26 +42,6 @@ def scihubber(url, **kwargs):
     a = urlparse(url)
     geturl = "http://%s.sci-hub.org/%s?%s" % (a.hostname, a.path, a.query)
     def _go(_url, _doi = None):
-        _as = []
-        _frames = []
-        just = []
-        justdoi = []
-
-        class MaybeDOI(HTMLParser):
-            def handle_starttag(self, tag, attrs):
-                if tag == "meta":
-                    d = dict(attrs)
-                    if str.find(d.get("name","").encode("utf8"), "doi") != -1:
-                        v = d.get("content","").encode("utf8")
-                        ix = str.find(v, "10.")
-                        if ix != -1: justdoi.append(v[ix:])
-                if tag == "a":
-                    d = dict(attrs)
-                    v = d.get("href","").encode("utf8")
-                    if str.find(v, "doi") != -1:
-                        ix = str.find(v, "10.")
-                        if ix != -1: justdoi.append(urllib.unquote(v[ix:]))
-
         class MaybeTail(HTMLParser):
             def handle_starttag(self, tag, attrs):
                 if tag == "frame":
@@ -75,17 +54,21 @@ def scihubber(url, **kwargs):
                 elif tag == "frame" or tag == "iframe":
                     _frames.append(dict(attrs))
         re = requests.get(_url, **kwargs).text.encode("utf8")
+        shu = etree.parse(StringIO(re.text),etree.HTMLParser())
         if not _doi:
-            MaybeDOI().feed(re)
-            if justdoi: _doi = justdoi[0]
-        MaybeTail().feed(re)
+            metas = map(lambda x:x.get("content"), shu.xpath("//meta[contains(@name,'doi')]"))
+            _as = map(lambda x:urllib.unquote(x.get("href")), shu.xpath("//a[contains(@href,'doi')]"))
+            maybedoi = filter(lambda x:str.find(x, "10.") != -1, metas + _as)
+            if maybedoi:
+                ix = str.find(maybedoi[0],"10.")
+                _doi = maybedoi[0][ix:]
+        just = map(lambda x:x.get("src"), shu.xpath("//frame[@name='_pdf']"))
         if just: return (just[0], _doi)
-        Derper().feed(re)
-        qq = filter(lambda x: str.find(x.get("href","").encode("utf8"), "pdf") != -1, _as)
-        qq += filter(lambda x: str.find(x.get("src","").encode("utf8"), "pdf") != -1, _frames)
-        qq = filter(None, map(lambda x: x.get("href", x.get("src", None)), qq))
+        derp = map(lambda x:x.get("src"), shu.xpath("//frame | //iframe"))
+        derp += map(lambda x:x.get("href"), shu.xpath("//a"))
+        derp = filter(None,derp)
         it = itertools.ifilter(None,
-            itertools.imap(lambda x: _go("http://%s.sci-hub.org/%s" % (a.hostname, x), _doi), qq))
+            itertools.imap(lambda x: _go("http://%s.sci-hub.org/%s" % (a.hostname, x), _doi), derp))
         try: return it.next()
         except StopIteration: return None
     ret = _go(geturl)
```