# ------------------------------------------
# Google Scholar Trend Miner
# Original Idea by Yaroslav Bulatov
# http://yaroslavvb.blogspot.com/2005/12/trends-in-machine-learning-according.html
# (Re)implementation (C) Konstantin Tretyakov
# ------------------------------------------
# NOTE: this script targets Python 2 (urllib.quote, urllib.FancyURLopener).
import urllib
import os
import sys
import re

import matplotlib
matplotlib.use('Agg')  # must be selected before pylab is imported
import pylab
from Numeric import array

# ---------------- Settings ---------------
base_dir = "/projects/www/u/kt/stuff/scholartrend/"
pages_dir = base_dir + "pages/"
plots_relative_dir = "plots/"

# Inclusive range of publication years accepted on the command line.
MIN_YEAR = 1950
MAX_YEAR = 2010

# Create the page cache and the plot directory on first run.
for _required_dir in (pages_dir, base_dir + plots_relative_dir):
    if not os.path.exists(_required_dir):
        print("Creating %s" % _required_dir)
        os.mkdir(_required_dir)


# Setup urlopener
class MyUrlOpener(urllib.FancyURLopener):
    """URL opener that sends a browser-like User-Agent string."""
    version = "Mozilla/4.0 (compatible; MSIE 6.0; Win32)"


# urllib.urlretrieve() uses this module-level opener for its requests.
urllib._urlopener = MyUrlOpener()


# --------------- Retriever class --------------
class GoogleScholarQuery:
    """Fetches Google Scholar result pages, caching them on disk.

    Pages are stored as <pages_dir>/<url-quoted topic>/<year>.html and are
    only downloaded when no cached copy exists.
    """

    url_template = "http://scholar.google.com/scholar?as_q=%(topic)s&num=1&btnG=Search+Scholar&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=&as_publication=&as_ylo=%(year)d&as_yhi=%(year)d&as_allsubj=all&hl=en&lr="
    pages_dir = ""

    # Extracts the hit count from "Results ... of [about] N".  The count is
    # matched greedily: a lazy quantifier at the end of a pattern stops after
    # a single character and would truncate e.g. "12,300" to "1".
    # NOTE(review): the served HTML presumably wraps the number in <b>...</b>;
    # an optional opening tag is tolerated -- confirm against a cached page.
    _count_pattern = re.compile(
        r'Results .+ of (about)? ?(?:<b>)?([0-9][0-9,]*)')

    def __init__(self, pages_dir):
        """Remember the directory used as the page cache."""
        self.pages_dir = pages_dir

    def _cache_path(self, topic, year):
        """Return the cache file name for the given topic and year."""
        return os.path.join(self.pages_dir, urllib.quote(topic, ''),
                            "%d.html" % year)

    def clean(self, topic, year):
        """Removes the file from cache, replaces /sorry/images link to point
        to google, and moves the file to captcha.html

        The cached page is renamed to "<file>.captcha" and a copy with an
        absolute image URL is written to <base_dir>/captcha.html.  Does
        nothing when there is no cached page for (topic, year).

        Raises Exception when the page contains no "/sorry/image" link.
        """
        target_file = self._cache_path(topic, year)
        if not os.path.exists(target_file):
            return
        captcha_file = target_file + ".captcha"
        # Rename first, so the bad page leaves the cache even if the check
        # below fails.
        os.rename(target_file, captcha_file)

        f = open(captcha_file)
        try:
            page = f.read()
        finally:
            f.close()

        p = page.find('"/sorry/image')
        if p == -1:
            raise Exception("Google returned an unexpected page, which is not even a captcha!")
        # Insert the host right after the opening quote of the link.
        new_page = page[:p + 1] + 'http://scholar.google.com' + page[p + 1:]

        f0 = open("%s/captcha.html" % base_dir, "w")
        try:
            f0.write(new_page)
        finally:
            f0.close()

    def get(self, topic, year):
        """Topic is a topic to query on (e.g. "machine learning" (include
        quotes if required)), year is a year of publications (e.g. 1945).
        Returns the returned page"""
        topic_quoted = urllib.quote(topic, '')
        # Check whether the directory pages_dir/topic_quoted already exists
        topic_dir = os.path.join(self.pages_dir, topic_quoted)
        if not os.path.exists(topic_dir):
            os.mkdir(topic_dir)

        target_file = os.path.join(topic_dir, "%d.html" % year)
        if not os.path.exists(target_file):
            # Do the query
            query = self.url_template % {"topic": topic_quoted, "year": year}
            print("Querying for %s/%d" % (topic, year))
            sys.stdout.flush()
            urllib.urlretrieve(query, target_file)
        else:
            print("Skipping query for %s/%d" % (topic, year))

        f = open(target_file, "r")
        try:
            return f.read()
        finally:
            f.close()

    def matchcount(self, topic, year):
        """Gets a given page (using get) and counts the number of matched
        results

        Returns 0 when the page reports no matching articles.  When no
        result count can be found the page is treated as a CAPTCHA: it is
        evicted from the cache via clean() and an Exception is raised.
        """
        page = self.get(topic, year)
        if "did not match any articles" in page:
            return 0

        m = self._count_pattern.search(page)
        if not m:
            print("It seems that Google found out that we are a bot and started offering its CAPTCHA")
            print("Please, wait some hours and try again")
            self.clean(topic, year)
            raise Exception("GoogleDoesNotLikeOurBotException")
        # Drop the thousands separators before converting.
        return int(m.group(2).replace(',', ''))


# -------------- makeplot function ----------------
def makeplot(topic, year_from, year_to, normalize_by):
    """Returns the relative filename of a png image containing the plot

    One bar is drawn per year in [year_from, year_to].  With an empty
    normalize_by the bar height is the match count for topic; otherwise it
    is count("normalize_by topic") / count(normalize_by), or 0 when the
    denominator count is 0.  An already existing plot file is reused.
    """
    topic_quoted = urllib.quote(topic, '')
    normalize_by_quoted = urllib.quote(normalize_by, '')
    target_relative_file = "%s/%s-%d-%d-%s.png" % (
        plots_relative_dir, topic_quoted, year_from, year_to,
        normalize_by_quoted)
    target_file = "%s/%s" % (base_dir, target_relative_file)

    if os.path.exists(target_file):
        print("Plot already exists: %s" % target_relative_file)
    else:
        # Create plot
        scholar = GoogleScholarQuery(pages_dir)
        years = range(year_from, year_to + 1)

        # For each year make an appropriate query and count the results
        results = []
        for year in years:
            if normalize_by != "":
                p_xy = scholar.matchcount(
                    "%s %s" % (normalize_by, topic), year)
                p_x = scholar.matchcount(normalize_by, year)
                if p_x == 0:
                    # Avoid division by zero: report a ratio of 0.
                    p_x = 1
                    p_xy = 0
                results.append(float(p_xy) / float(p_x))
            else:
                results.append(scholar.matchcount(topic, year))

        print("Results: " + str(results))
        pylab.bar(years, results, color='b')
        if normalize_by != "":
            pylab.title("P(%s | %s)" % (topic, normalize_by))
        else:
            pylab.title(topic)
        pylab.savefig(target_file, dpi=50)

    return target_relative_file


# -------------- MAIN ------------------
# Check command line params. Should be like topic, yearfrom, yearto, normalize_by
try:
    topic = sys.argv[1]
    year_from = int(sys.argv[2])
    year_to = int(sys.argv[3])
    normalize_by = sys.argv[4]
    if not (MIN_YEAR <= year_from <= MAX_YEAR
            and MIN_YEAR <= year_to <= MAX_YEAR):
        raise ValueError("Invalid parameter")
except (IndexError, ValueError):
    # Missing argument, non-numeric year, or year out of range.
    print("Invalid parameters!")
    sys.exit(1)

# NOTE(review): the HTML markup inside the string literals below was lost
# from the source (the literals were empty or broken across lines, and the
# image line had no "%s" placeholder).  The tags are a best-effort
# reconstruction -- confirm against the deployed page.
print("<pre>")

newfile = makeplot(topic, year_from, year_to, normalize_by)

print("Done\n")
print("</pre>")
print("")
print("<p>")
print("<img src=\"%s\">" % urllib.quote(newfile))
print("</p>")