# other-code/h-index.py

#!/usr/bin/python

import httplib, urllib, re, sys
from BeautifulSoup import BeautifulSoup

# Maximum number of results requested from the search page.
limit = 100

# Browser-like User-Agent sent with every request.
# NOTE(review): presumably required because the server rejects the default
# client User-Agent -- not verifiable from this file.
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; WindowsNT)'}

# Matches the "Cited by N" text of a result and captures only the digits, so
# trailing text before the next tag cannot break the int() conversion.
CITED_BY_RE = re.compile(r"Cited by \s*(\d+)")


def build_query_path(terms, limit):
    """Return the '/scholar?...' request path for the given search terms.

    terms -- list of search words (e.g. sys.argv[1:])
    limit -- value sent as the 'num' query parameter
    """
    # urlencode() quotes the value itself and turns spaces into '+'.  Joining
    # the terms with a literal '+' would be escaped to '%2B' and searched for
    # verbatim instead of acting as a separator.
    params = urllib.urlencode({'q': " ".join(terms), 'num': limit})
    return '/scholar' + "?" + params


def parse_citation_counts(html):
    """Return the list of 'Cited by' counts found in the result page.

    Only <h3 class="r"> elements are inspected; elements without a
    'Cited by N' text contribute nothing to the returned list.
    """
    cites = []
    soup = BeautifulSoup(html)
    for record in soup('h3', {'class': 'r'}):
        print("we have a match!")
        match = CITED_BY_RE.search(str(record))
        if match is not None:
            cites.append(int(match.group(1)))
    return cites


def h_index(cites):
    """Return the h-index of a list of citation counts.

    The result is the largest h such that at least h of the counts are
    greater than or equal to h.  The input list is not modified.
    """
    h = 0
    for cite in sorted(cites, reverse=True):
        # Counts are in descending order, so once one fails the test every
        # later one fails as well.
        if cite <= h:
            break
        h += 1
    return h


def main(terms):
    """Query scholar.google.com for `terms` and print the resulting h-index.

    On a non-200 response the status and reason are printed and the h-index
    is computed from an empty list (i.e. 0 is printed).
    """
    cites = []
    conn = httplib.HTTPConnection('scholar.google.com')
    try:
        # A GET request carries no body, so pass None rather than a dict.
        conn.request("GET", build_query_path(terms, limit), None, headers)
        resp = conn.getresponse()
        if resp.status == 200:
            # Drop every non-ASCII byte before handing the page to the parser.
            html = resp.read().decode('ascii', 'ignore')
            cites = parse_citation_counts(html)
        else:
            print('Error: ')
            print('%s %s' % (resp.status, resp.reason))
    finally:
        # Release the socket even if the request or the parsing fails.
        conn.close()

    print(h_index(cites))


if __name__ == "__main__":
    main(sys.argv[1:])