"""
scHolar index
http://insitu.lri.fr/~roussel/moulinette/h/h.cgi
The following Python code makes it possible to compute H indices by
using Beautiful Soup to process Google Scholar results.
This code is in the public domain.
For more information, see:
http://en.wikipedia.org/wiki/Hirsch_number
http://scholar.google.com/
http://www.crummy.com/software/BeautifulSoup/
Comments and questions should be sent to Nicolas Roussel (roussel@lri.fr).
"""
import os, sys, socket, re, string
import BeautifulSoup
def debugInfo(text):
    """Print *text* on standard output and flush immediately.

    The explicit flush makes progress messages show up right away even when
    stdout is buffered (e.g. when it is a pipe rather than a terminal).
    """
    # Parenthesised single-argument form: identical output on Python 2,
    # and valid syntax on Python 3 (the bare print statement is not).
    print(text)
    sys.stdout.flush()
# ------------------------------------------------------------------------------
class GoogleScholarReference:
    """One entry of a Google Scholar result list.

    Instances are plain attribute bags with no constructor; _parseResults
    assigns ``title``, ``info``, ``citedby``, ``year`` and ``gid`` on them.
    """

    def getURL(self):
        """Return the "cluster" URL built from ``gid``, or None without a gid."""
        # Read through get() so an instance whose gid was never assigned
        # yields None instead of raising AttributeError.
        gid = self.get("gid")
        if gid is None:
            return None
        return "http://scholar.google.com/scholar?cluster=%s" % gid

    def getCitedByURL(self):
        """Return the "cites" URL built from ``gid``, or None without a gid."""
        gid = self.get("gid")
        if gid is None:
            return None
        return "http://scholar.google.com/scholar?cites=%s" % gid

    def get(self, key, defval=None):
        """Return the instance attribute named *key*, or *defval* if unset.

        Only attributes stored on the instance itself are considered; class
        attributes and methods are never returned.
        """
        # dict.get replaces has_key(), which no longer exists on Python 3.
        return self.__dict__.get(key, defval)
_citedby = re.compile('.*Cited by (\d+).*')
_pubyear = re.compile('.*\D(\d\d\d\d)\D.*')
_id_cites = re.compile('.*cites=(\d+).*')
_id_cluster = re.compile('.*cluster=(\d+).*')
def _deHTML(line, group):
text = re.sub(" "," ",line)
text = re.sub("<[^<]+>","",text)
if group:
#text = re.sub("- group of.+","",text)
text = re.sub("- all .+","",text)
return text.strip()
# Split the markup of one search-result node into its description pieces.
#
# `reference` is converted to unicode text, newlines are flattened to spaces,
# several patterns are deleted with re.sub, bracketed text ("[...]", matched
# greedily) is removed, and what remains is split on a separator string.
# _parseResults indexes the returned list as d[0], d[1] and d[-1].
#
# NOTE(review): the patterns of the four re.sub calls after the newline
# replacement, and the separator passed to split(), appear to have lost their
# HTML markup: the literals now span lines (not valid Python) or are empty.
# Presumably tags were stripped when this file was extracted -- TODO restore
# them from the upstream source before relying on this function.
def _splitDescription(reference):
reftext = unicode(reference)
# Work on a single line so the patterns below need not handle newlines.
text = reftext.replace("\n"," ")
text = re.sub("
]*>","",text)
text = re.sub("
","",text)
text = re.sub("]*>","",text)
text = re.sub("","",text)
#text = re.sub("\[.+\]","",text)
# Greedy: removes everything from the first "[" to the last "]".
text = re.sub("\[.+\]","",text)
return text.split("
")
def _tryExp(text,exp,defval):
m = exp.match(text)
if m: return m.group(1)
return defval
#dbgfile = open("/tmp/text.txt","w")
def _parseResults(soup):
    """Extract the references found in a parsed result page.

    soup -- BeautifulSoup document of a Google Scholar result page.

    Starts at the first ``<p class="g">`` element and walks its following
    siblings, building one GoogleScholarReference per node from the pieces
    returned by _splitDescription.  Nodes that cannot be turned into a
    reference are skipped.  Returns the list of references (possibly empty).
    """
    result = []
    reference = soup.first('p', {'class': 'g'})
    while reference is not None:
        try:
            r = GoogleScholarReference()
            d = _splitDescription(reference)
            r.title = _deHTML(d[0], True)
            r.info = _deHTML(d[1], False)
            # The last piece carries the "Cited by" link and the ids.
            r.citedby = int(_tryExp(d[-1], _citedby, 0))
            r.year = int(_tryExp(d[1], _pubyear, 0))
            r.gid = _tryExp(d[-1], _id_cites, None)
            if not r.gid:
                r.gid = _tryExp(d[-1], _id_cluster, None)
        except Exception:
            # Deliberately best-effort: siblings that are not result entries
            # (or entries with an unexpected layout) are ignored.  Unlike a
            # bare "except:", this lets KeyboardInterrupt/SystemExit through.
            pass
        else:
            result.append(r)
        reference = reference.nextSibling
    return result
# ------------------------------------------------------------------------------
_qServer,_qPort,_qTimeout = "scholar.google.com",80,5.0
_qTemplate = "/scholar?num=%d&"
def _fetchPage(path):
    """Send one GET request for *path* and return the raw response text.

    Returns None (after logging the reason) when the connection cannot be
    established.  The socket is always closed, including on early return or
    when reading raises.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(_qTimeout)
        try:
            err = sock.connect_ex((_qServer, _qPort))
        except socket.gaierror:
            debugInfo(u"unable to connect")
            return None
        if err != 0:
            debugInfo(u"connection timed out (%s seconds)..." % _qTimeout)
            return None
        request = "".join([
            "GET %s HTTP/1.1\r\n" % path,
            "Host: %s\r\n" % _qServer,
            # Ask the server to close the connection when done, since the
            # HTTP response is not parsed and end-of-stream marks the end.
            "Connection: close\r\n",
            "Referer: http://scholar.google.com/advanced_scholar_search?hl=en&lr=\r\n",
            "User-Agent: %s\r\n" % os.environ.get("HTTP_USER_AGENT", "Mozilla/5.0"),
            "\r\n",
        ])
        # sendall() retries until the whole request is written; send() may
        # stop after a partial write.
        sock.sendall(request)
        debugInfo("query =  %s" % path)
        debugInfo("reading...")
        chunks = []
        while True:
            moredata = sock.recv(4096)
            if not moredata:
                break
            chunks.append(moredata)
        return "".join(chunks)
    finally:
        sock.close()


def doQuery(query):
    """Run *query* against Google Scholar and return the parsed references.

    query -- URL-encoded query string appended to the request path.

    Result pages are requested 100 entries at a time until a page comes back
    with fewer entries than requested.  Each raw response is also written to
    /tmp/scholar-results.html (overwritten per page).  Returns a list of
    GoogleScholarReference; an empty list is returned if any connection
    attempt fails, even after earlier pages were fetched.
    """
    refs, offset, inc = [], 0, 100
    while True:
        q = _qTemplate % inc + query
        if offset == 0:
            # The original format string had no placeholders for its two
            # arguments and raised TypeError on every first request.
            # NOTE(review): the exact original wording around the URL is
            # unknown (markup was presumably lost) -- confirm.
            debugInfo(u'Sending query to Google Scholar (http://%s%s)...' % (_qServer, q))
        else:
            q = q + "&start=%d" % offset
        data = _fetchPage(q)
        if data is None:
            return []
        with open("/tmp/scholar-results.html", "w") as dump:
            dump.write(data)
        debugInfo("parsing...")
        soup = BeautifulSoup.BeautifulSoup(data)
        morerefs = _parseResults(soup)
        refs = refs + morerefs
        if len(morerefs) != inc:
            # Short page: this was the last one.  An empty page is reported
            # as a failure.
            if not morerefs:
                location = None
                for line in data.split("\n"):
                    if "Location: " in line:
                        # Last whitespace-separated token of the header line.
                        location = line.split()[-1]
                        break
                message = "query failed..."
                # Same hint whether the server redirected us or earlier
                # pages had already produced results.
                if location or refs:
                    message = message + ' (try again after a while)'
                debugInfo(message)
            break
        offset = offset + inc
    return refs
def fakeQuery(filename):
    """Parse a result page stored in *filename* instead of querying the server.

    Returns the list of references produced by _parseResults for the file's
    contents.
    """
    debugInfo(u'Opening %s...' % os.path.basename(filename))
    # Context manager so the file handle is closed even if read() fails.
    with open(filename) as page:
        data = page.read()
    soup = BeautifulSoup.BeautifulSoup(data)
    return _parseResults(soup)
# ------------------------------------------------------------------------------
def computeHindex(references):
h = 0
while True:
h = h + 1
n = 0
for r in references:
if r.citedby>=h: n = n+1
if n