1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
#!/usr/bin/env python
import re, sys, urllib, urllib2, cookielib
from BeautifulSoup import BeautifulSoup, Tag
from urllib2 import HTTPError
import BaseHTTPServer
# Shared URL opener: keeps cookies in `cj` across requests and sends a
# "Mozilla/5.0" User-agent header with every request.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

# testing == 0: read a URL from stdin, fetch it, print the rewritten page.
# testing != 0: read the HTML itself from stdin and pretty-print the result.
testing = 0

# Host prepended to root-relative image paths.
SCHOLAR_BASE = "http://scholar.google.com"

# Sample query URLs that are handy for manual runs:
#   http://scholar.google.com/scholar?hl=en&lr=&q=author%3A%22FG+GALLAGHER%22&btnG=Search
#   http://scholar.google.com/scholar?hl=en&lr=&q=author%3A%22a+einstein%22+-pdf&btnG=Search
#   http://scholar.google.com/scholar?hl=en&lr=&q=author%3A%22feynman%22+-pdf&btnG=Search
#   http://scholar.google.com/scholar?hl=en&lr=&q=drosophila+-pdf&btnG=Search


def _fetch_page(url):
    """Return the body of *url* fetched through the shared ``opener``.

    An empty *url* falls back to the Google Scholar front page.

    On an HTTP error or a network-level failure, a one-line
    "Google Scholar error ..." message is printed to stdout and the process
    exits with status 0 (the exit status the script has always used on error).
    """
    if url == "":
        url = "http://scholar.google.com/scholar"
    try:
        f = opener.open(url)
        try:
            return f.read()
        finally:
            # Python 2 urllib2 responses are not context managers.
            f.close()
    except HTTPError as e:
        code = e.code
        # .get() rather than [] so that a status code missing from the table
        # reaches the "error (2)" branch instead of raising KeyError.
        msg = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(code)
        if msg:
            print("Google Scholar error: %s %s (%s)" % (code, msg[0], msg[1]))
        else:
            print("%s %s" % ("Google Scholar error (2): ", code))
    except urllib2.URLError as e:
        # Must come after HTTPError, which is a subclass of URLError.
        print("Google Scholar error (3): %s" % (e.reason,))
    sys.exit(0)


def _absolutize_images(soup):
    """Point every root-relative <img src="/..."> at scholar.google.com."""
    for img in soup.findAll("img"):
        src = img.get('src')
        if src and src.startswith('/'):
            img['src'] = SCHOLAR_BASE + src


def _add_citeulike_links(soup):
    """Append a CiteULike "post" icon link after each result's title link.

    Results are the <p class="g"> elements; the title link is the first <a>
    inside the result's <span class="w">.

    Might be more robust to trawl ALL <a href=".."> (as class="w" might
    break) and replace those that start with an absolute "http://" URL
    (filtering out any matching http://xxx.google.xxx/, just to be sure!).
    """
    for item in soup.findAll("p", {"class": "g"}):
        wspan = item.find("span", {"class": "w"})
        # This should never happen, but it does.
        if not wspan:
            continue
        a = wspan.find('a')
        if not a:
            continue
        # .get() so that an anchor without an href is skipped, not a KeyError.
        href = a.get('href')
        if not href:
            continue
        # Python 2's urllib.quote raises KeyError on non-ASCII unicode input,
        # so hand it UTF-8 bytes.
        if isinstance(href, unicode):
            href = href.encode('utf-8')

        cul = Tag(soup, "a")
        cul['href'] = "/posturl?url=" + urllib.quote(href)
        img = Tag(soup, "img")
        img['src'] = "http://static.citeulike.org/favicon.gif"
        img['style'] = "border:0"
        cul.insert(0, img)
        # Insert at the end of the span's children, i.e. append.
        wspan.insert(len(wspan.contents), cul)


def main():
    """Read the input from stdin, rewrite the page, and print it to stdout."""
    if testing == 0:
        data = _fetch_page(sys.stdin.readline().strip())
    else:
        data = sys.stdin.read()

    soup = BeautifulSoup(data)
    _absolutize_images(soup)
    _add_citeulike_links(soup)

    if testing == 0:
        print(soup)
    else:
        print(soup.prettify())


# Run unconditionally: this file is a stdin -> stdout filter script and has
# always executed at module level.
main()
|