# ------------------------------------------
# Google Scholar Trend Miner
# Original Idea by Yaroslav Bulatov
# http://yaroslavvb.blogspot.com/2005/12/trends-in-machine-learning-according.html
# (Re)implementation (C) Konstantin Tretyakov
# ------------------------------------------
import urllib, os, sys, re
import matplotlib
matplotlib.use('Agg')
import pylab
from Numeric import array
# ---------------- Settings ---------------
# Root of the web-visible working area; the other locations hang off it.
base_dir = "/projects/www/u/kt/stuff/scholartrend/"
# Cache of downloaded result pages.
pages_dir = base_dir + "pages/"
# Plot directory, kept relative to base_dir because makeplot() returns
# paths relative to it.
plots_relative_dir = "plots/"

# Make sure both working directories exist before anything reads or writes them.
for _required_dir in (pages_dir, base_dir + plots_relative_dir):
    if not os.path.exists(_required_dir):
        print("Creating %s" % _required_dir)
        os.mkdir(_required_dir)
del _required_dir
# Setup urlopener
class MyUrlOpener(urllib.FancyURLopener):
    """URL opener that identifies itself with a browser-like agent string."""

    # FancyURLopener reports this value as its User-Agent.
    version = "Mozilla/4.0 (compatible; MSIE 6.0; Win32)"


# Install the opener globally so that urllib.urlretrieve() picks it up.
urllib._urlopener = MyUrlOpener()
# --------------- Retriever class --------------
class GoogleScholarQuery:
    """Downloads Google Scholar result pages and caches them on disk.

    Every (topic, year) pair is stored as
    <pages_dir>/<url-quoted topic>/<year>.html, so a page is fetched from the
    network only when it is not in the cache yet.
    """

    url_template="http://scholar.google.com/scholar?as_q=%(topic)s&num=1&btnG=Search+Scholar&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=&as_publication=&as_ylo=%(year)d&as_yhi=%(year)d&as_allsubj=all&hl=en&lr="
    # Matches the "Results 1 - 1 of [about] 12,300" summary.  The first
    # quantifier is lazy so that the first " of " after "Results" is used, and
    # the count group is greedy so that the whole number is captured rather
    # than only its first character.
    # NOTE(review): assumes the count follows "of [about] " as plain text --
    # confirm against the markup of a real result page.
    _count_pattern = re.compile(r'Results .+? of (about)? ?([0-9,]+)')
    pages_dir = ""

    def __init__(self, pages_dir):
        """Remember pages_dir, the directory under which pages are cached."""
        self.pages_dir = pages_dir

    def _topic_dir(self, topic):
        """Return the cache directory for topic (url-quoted to be filename-safe)."""
        return os.path.join(self.pages_dir, urllib.quote(topic, ''))

    def _page_path(self, topic, year):
        """Return the cache file path for the (topic, year) result page."""
        return os.path.join(self._topic_dir(topic), "%d.html" % year)

    def clean(self, topic, year):
        """Evict a cached page that turned out to be a CAPTCHA.

        The cached file is renamed to <file>.captcha so the next run queries
        again.  A copy whose first "/sorry/image" link is made absolute
        (pointing at scholar.google.com) is written to captcha.html under the
        module-level base_dir.  Does nothing if the page is not cached.

        Raises Exception if the page does not contain a "/sorry/image" link.
        """
        target_file = self._page_path(topic, year)
        if not os.path.exists(target_file):
            return
        captcha_file = target_file + ".captcha"
        os.rename(target_file, captcha_file)
        source = open(captcha_file)
        try:
            page = source.read()
        finally:
            source.close()
        p = page.find('"/sorry/image')
        if p == -1:
            raise Exception("Google returned an unexpected page, which is not even a captcha!")
        # Insert the host right after the opening quote of the link.
        new_page = page[:p + 1] + 'http://scholar.google.com' + page[p + 1:]
        out = open("%s/captcha.html" % base_dir, "w")
        try:
            out.write(new_page)
        finally:
            out.close()

    def get(self, topic, year):
        """Return the result page for topic restricted to publications of year.

        topic is the query text (e.g. '"machine learning"', quotes included if
        a phrase search is wanted); year is a publication year (e.g. 1945).
        The page is downloaded only if it is not already in the cache.
        """
        topic_dir = self._topic_dir(topic)
        if not os.path.exists(topic_dir):
            os.mkdir(topic_dir)
        target_file = self._page_path(topic, year)
        if not os.path.exists(target_file):
            query = self.url_template % {"topic": urllib.quote(topic, ''), "year": year}
            print("Querying for %s/%d" % (topic, year))
            sys.stdout.flush()
            urllib.urlretrieve(query, target_file)
        else:
            print("Skipping query for %s/%d" % (topic, year))
        cached = open(target_file, "r")
        try:
            return cached.read()
        finally:
            cached.close()

    def matchcount(self, topic, year):
        """Return the number of results reported for (topic, year).

        Returns 0 when the page says nothing matched.  If no result count can
        be found, the page is assumed to be a CAPTCHA: it is evicted from the
        cache via clean() and an Exception is raised.
        """
        page = self.get(topic, year)
        if "did not match any articles" in page:
            return 0
        m = self._count_pattern.search(page)
        if not m:
            print("It seems that Google found out that we are a bot and started offering its CAPTCHA")
            print("Please, wait some hours and try again")
            self.clean(topic, year)
            raise Exception("GoogleDoesNotLikeOurBotException")
        # Drop thousands separators before converting.
        return int(m.group(2).replace(',', ''))
# -------------- makeplot function ----------------
def makeplot(topic, year_from, year_to, normalize_by):
    """Plot yearly result counts for topic and return the image's relative path.

    One bar is drawn per year in [year_from, year_to].  With an empty
    normalize_by the bar is the raw result count for topic; otherwise it is
    count(normalize_by + " " + topic) / count(normalize_by), or 0 when the
    denominator is 0.  An existing plot file is reused instead of redrawn.
    The returned PNG path is relative to base_dir.
    """
    quoted_topic = urllib.quote(topic, '')
    quoted_norm = urllib.quote(normalize_by, '')
    target_relative_file = "%s/%s-%d-%d-%s.png" % (
        plots_relative_dir, quoted_topic, year_from, year_to, quoted_norm)
    target_file = "%s/%s" % (base_dir, target_relative_file)

    if os.path.exists(target_file):
        print("Plot already exists: %s" % target_relative_file)
        return target_relative_file

    normalized = normalize_by != ""
    scholar = GoogleScholarQuery(pages_dir)
    years = range(year_from, year_to + 1)

    # One query (or two, when normalizing) per year.
    results = []
    for year in years:
        if not normalized:
            results.append(scholar.matchcount(topic, year))
            continue
        joint = scholar.matchcount("%s %s" % (normalize_by, topic), year)
        marginal = scholar.matchcount(normalize_by, year)
        if marginal == 0:
            # Nothing to normalize by: report a zero ratio.
            results.append(0.0)
        else:
            results.append(float(joint) / float(marginal))
    print("Results: " + str(results))

    pylab.bar(years, results, color='b')
    if normalized:
        heading = "P(%s | %s)" % (topic, normalize_by)
    else:
        heading = topic
    pylab.title(heading)
    pylab.savefig(target_file, dpi=50)
    return target_relative_file
# -------------- MAIN ------------------
# Check command line params. Should be like topic, yearfrom, yearto, normalize_by
# Expected arguments: <topic> <year_from> <year_to> <normalize_by>
# Both years must lie in 1950..2010; normalize_by may be an empty string.
try:
    topic = sys.argv[1]
    year_from = int(sys.argv[2])
    year_to = int(sys.argv[3])
    normalize_by = sys.argv[4]
    if not (1950 <= year_from <= 2010 and 1950 <= year_to <= 2010):
        raise ValueError("Invalid parameter")
except (IndexError, ValueError):
    # IndexError: too few arguments; ValueError: non-numeric or out-of-range
    # year.  Anything else (e.g. KeyboardInterrupt) is left to propagate.
    print("Invalid parameters!")
    sys.exit(1)
print "